## import dependency libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder

## import source data

In [None]:
# Load the training table from the local CSV into a DataFrame; later cells
# read its `id` and `target` columns and treat every other column as a feature.
source_data = pd.read_csv('./data/otto/train.csv')

## try to pre-process source data

In [None]:
# select top 10000 samples
# source_data = source_data.head(10000)

# format target value to number
# source_data['target'] = source_data['target'].replace(regex='.*_',value='')

## random sample the source data

In [None]:
# Separate the label column from the predictors: every column except `id`
# and `target` is used as a feature.
target_values = source_data['target']
feature_values = source_data.drop(columns=['id', 'target'])

# Randomly under-sample with a fixed seed so the same rows are drawn on
# every run.
random_under_sampler = RandomUnderSampler(random_state=0)
resample_feature_values, resample_target_values = random_under_sampler.fit_resample(
    feature_values,
    target_values,
)
# To skip the under-sampling step, use the full data instead:
# resample_feature_values, resample_target_values = feature_values, target_values

## convert the target values so that the machine can process them more easily

In [None]:
# Map the target labels to integer codes. The fitted encoder is kept so the
# codes can be turned back into the original labels later on.
label_encoder = LabelEncoder()
label_encoder.fit(resample_target_values)
encod_resample_target_values = label_encoder.transform(resample_target_values)

## split the resampled data into train data and test data

In [None]:
# Hold out 25% of the resampled rows for evaluation; the fixed seed keeps the
# split identical between runs.
train_data, test_data, train_target, test_target = train_test_split(
    resample_feature_values,
    encod_resample_target_values,
    test_size=0.25,
    random_state=0,
)

## train model

In [None]:
# Baseline model: a random forest with default hyper-parameters.
# `random_state=0` makes the baseline reproducible (the sampler, the split and
# the tuned forests in this notebook are all seeded the same way), and
# `n_jobs=-1` trains the trees on all available cores without changing results.
random_forest_classifier = RandomForestClassifier(
    oob_score=True,
    random_state=0,
    n_jobs=-1,
)
random_forest_classifier.fit(train_data, train_target)

## predict the test target value  

In [None]:
# Hard class predictions (encoded labels) of the baseline forest for the
# held-out rows.
predict_test_target = random_forest_classifier.predict(test_data)

## score the model

In [None]:
# Score of the baseline forest on the held-out split (for a classifier,
# `score` reports mean accuracy).
random_forest_classifier.score(test_data,test_target)

## log loss value 

In [None]:
# Log loss of the *hard* predictions: both the true and the predicted labels
# are one-hot encoded and compared.
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder once, on the true labels, and reuse that fitted mapping for
# the predictions. Re-fitting on the predictions would silently produce a
# matrix with different columns whenever some class is never predicted; with
# `transform`, a predicted label absent from the truth raises instead.
one_hot_test_target = one_hot_encoder.fit_transform(test_target.reshape(-1, 1))
one_hot_predict_test_target = one_hot_encoder.transform(predict_test_target.reshape(-1, 1))

# `eps` was deprecated in scikit-learn 1.3 and removed in 1.5, so it is no
# longer passed (probabilities are clipped internally); `normalize=True` is
# the default.
loss_value = log_loss(y_true=one_hot_test_target, y_pred=one_hot_predict_test_target)
loss_value

In [None]:
# Log loss of the baseline forest computed from its class probabilities
# (one column per class) against the one-hot encoded true labels.
predict_proba_test_target = random_forest_classifier.predict_proba(test_data)

# `eps` was deprecated in scikit-learn 1.3 and removed in 1.5, so it is no
# longer passed; `normalize=True` is the default.
loss_value = log_loss(y_true=one_hot_test_target, y_pred=predict_proba_test_target)
loss_value

In [None]:
# Show the predicted probabilities as a table whose columns carry the
# original class labels. Column k of `predict_proba` belongs to
# `classes_[k]`, so the labels are derived from the classifier itself rather
# than from the DataFrame's positional column index.
predict_proba_result = pd.DataFrame(
    predict_proba_test_target,
    columns=label_encoder.inverse_transform(random_forest_classifier.classes_),
)
predict_proba_result

## model optimization
### n_estimators, max_features, max_depth, min_samples_leaf

In [None]:
# Hyper-parameter sweep: refit the forest once per candidate value and record
# the out-of-bag accuracy and the held-out log loss for every candidate.

# Candidate values to try. The wider grid below suits n_estimators.
# tune_parameters = range(5,200,10)
tune_parameters = range(1,30,1)

# Name of the hyper-parameter that takes the values in `tune_parameters`.
# Change this (and the grid above) to sweep n_estimators, max_features or
# max_depth instead. Previously the loop variable was never passed to the
# model, so every iteration trained the exact same forest.
tuned_parameter_name = 'min_samples_leaf'

# Values used for every hyper-parameter that is not being swept.
base_parameters = dict(
    n_estimators=175,
    max_features=15,
    max_depth=30,
    min_samples_leaf=1,
    oob_score=True,
    random_state=0,
    n_jobs=-1,
)

accuracy = np.zeros(len(tune_parameters))
error = np.zeros(len(tune_parameters))

for index, parameter in enumerate(tune_parameters):
    # Override only the swept hyper-parameter with the current candidate.
    model_parameters = {**base_parameters, tuned_parameter_name: parameter}
    random_forest_classifier_optimization = RandomForestClassifier(**model_parameters)
    random_forest_classifier_optimization.fit(train_data, train_target)
    accuracy[index] = random_forest_classifier_optimization.oob_score_

    predict_proba_test_data = random_forest_classifier_optimization.predict_proba(test_data)
    # `eps` was deprecated in scikit-learn 1.3 and removed in 1.5, so it is
    # no longer passed; `normalize=True` is the default.
    error[index] = log_loss(one_hot_test_target, predict_proba_test_data)

    # Progress line for the candidate that was just evaluated.
    print(f'{tuned_parameter_name}={parameter}: '
          f'oob accuracy={accuracy[index]:.4f}, log loss={error[index]:.4f}')

In [None]:
from matplotlib.axes import Axes

# Plot the held-out log loss (top) and the out-of-bag accuracy (bottom)
# against the swept values, labelled for the n_estimators sweep.
# NOTE(review): this draws whatever `tune_parameters`, `error` and `accuracy`
# currently hold, so it is only meaningful right after the tuning cell was run
# with n_estimators as the swept hyper-parameter.

# Size and resolution are given to `subplots` directly: a separate
# `plt.figure(...)` call beforehand only creates an additional, empty figure
# and leaves the plotted one at its default size.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10), dpi=100)
error_axes: Axes = axes[0]
accuracy_axes: Axes = axes[1]

error_axes.plot(tune_parameters, error)
error_axes.set_title('error - n_estimators')
error_axes.set_xlabel('n_estimators')
error_axes.set_ylabel('error')
error_axes.grid(True)

accuracy_axes.plot(tune_parameters, accuracy)
accuracy_axes.set_title('accuracy - n_estimators')
accuracy_axes.set_xlabel('n_estimators')
accuracy_axes.set_ylabel('accuracy')
# The grid was previously enabled twice on the error axes and never here.
accuracy_axes.grid(True)

# Keep the top x-label and the bottom title from overlapping.
fig.tight_layout()
plt.show()

In [None]:
from matplotlib.axes import Axes

# Plot the held-out log loss (top) and the out-of-bag accuracy (bottom)
# against the swept values, labelled for the max_features sweep.
# NOTE(review): this draws whatever `tune_parameters`, `error` and `accuracy`
# currently hold, so it is only meaningful right after the tuning cell was run
# with max_features as the swept hyper-parameter.

# Size and resolution are given to `subplots` directly: a separate
# `plt.figure(...)` call beforehand only creates an additional, empty figure
# and leaves the plotted one at its default size.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10), dpi=100)
error_axes: Axes = axes[0]
accuracy_axes: Axes = axes[1]

error_axes.plot(tune_parameters, error)
error_axes.set_title('error - max_features')
error_axes.set_xlabel('max_features')
error_axes.set_ylabel('error')
error_axes.grid(True)

accuracy_axes.plot(tune_parameters, accuracy)
accuracy_axes.set_title('accuracy - max_features')
accuracy_axes.set_xlabel('max_features')
accuracy_axes.set_ylabel('accuracy')
# The grid was previously enabled twice on the error axes and never here.
accuracy_axes.grid(True)

# Keep the top x-label and the bottom title from overlapping.
fig.tight_layout()
plt.show()

In [None]:
from matplotlib.axes import Axes

# Plot the held-out log loss (top) and the out-of-bag accuracy (bottom)
# against the swept values, labelled for the max_depth sweep.
# NOTE(review): this draws whatever `tune_parameters`, `error` and `accuracy`
# currently hold, so it is only meaningful right after the tuning cell was run
# with max_depth as the swept hyper-parameter.

# Size and resolution are given to `subplots` directly: a separate
# `plt.figure(...)` call beforehand only creates an additional, empty figure
# and leaves the plotted one at its default size.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10), dpi=100)
error_axes: Axes = axes[0]
accuracy_axes: Axes = axes[1]

error_axes.plot(tune_parameters, error)
error_axes.set_title('error - max_depth')
error_axes.set_xlabel('max_depth')
error_axes.set_ylabel('error')
error_axes.grid(True)

accuracy_axes.plot(tune_parameters, accuracy)
accuracy_axes.set_title('accuracy - max_depth')
accuracy_axes.set_xlabel('max_depth')
accuracy_axes.set_ylabel('accuracy')
# The grid was previously enabled twice on the error axes and never here.
accuracy_axes.grid(True)

# Keep the top x-label and the bottom title from overlapping.
fig.tight_layout()
plt.show()

In [None]:
from matplotlib.axes import Axes

# Plot the held-out log loss (top) and the out-of-bag accuracy (bottom)
# against the swept values, labelled for the min_samples_leaf sweep.
# NOTE(review): this draws whatever `tune_parameters`, `error` and `accuracy`
# currently hold, so it is only meaningful right after the tuning cell was run
# with min_samples_leaf as the swept hyper-parameter.

# Size and resolution are given to `subplots` directly: a separate
# `plt.figure(...)` call beforehand only creates an additional, empty figure
# and leaves the plotted one at its default size.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10), dpi=100)
error_axes: Axes = axes[0]
accuracy_axes: Axes = axes[1]

error_axes.plot(tune_parameters, error)
error_axes.set_title('error - min_samples_leaf')
error_axes.set_xlabel('min_samples_leaf')
error_axes.set_ylabel('error')
error_axes.grid(True)

accuracy_axes.plot(tune_parameters, accuracy)
accuracy_axes.set_title('accuracy - min_samples_leaf')
accuracy_axes.set_xlabel('min_samples_leaf')
accuracy_axes.set_ylabel('accuracy')
# The grid was previously enabled twice on the error axes and never here.
accuracy_axes.grid(True)

# Keep the top x-label and the bottom title from overlapping.
fig.tight_layout()
plt.show()

 RandomForestClassifier(n_estimators=175,
                        max_features=15,
                        max_depth=30,
                        min_samples_leaf=1)

In [None]:
# Final model: a forest built with the hyper-parameter values listed below,
# seeded for reproducibility and trained on all available cores.
final_parameters = {
    'n_estimators': 175,
    'max_features': 15,
    'max_depth': 30,
    'min_samples_leaf': 1,
    'oob_score': True,
    'random_state': 0,
    'n_jobs': -1,
}
final_random_forest_classifier = RandomForestClassifier(**final_parameters)

final_random_forest_classifier.fit(train_data, train_target)

# Score on the held-out split; as the cell's last expression it is displayed.
final_random_forest_classifier.score(test_data, test_target)

In [None]:
# Out-of-bag score of the final forest (available because it was built with
# `oob_score=True`).
final_random_forest_classifier.oob_score_

In [None]:
# Class probabilities predicted by the final forest for the held-out rows.
predict_proba_test_data = final_random_forest_classifier.predict_proba(test_data)

In [None]:
# Log loss of the final forest on the held-out split. `labels` names the
# class behind each probability column explicitly; without it the classes are
# inferred from `y_true`, which fails if some class happens to be missing
# from the test split.
log_loss(
    y_true=test_target,
    y_pred=predict_proba_test_data,
    labels=final_random_forest_classifier.classes_,
)

In [None]:
from pathlib import Path

# Write the predicted class probabilities for the held-out rows to a CSV with
# an `id` column followed by one column per class.
submission_path = Path('./generation/submission.csv')
# `to_csv` does not create missing directories, so make sure the target
# folder exists first.
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Column k of `predict_proba` belongs to `classes_[k]`; decode those codes
# back to the original labels instead of hard-coding 'Class_1'..'Class_9'.
class_names = label_encoder.inverse_transform(final_random_forest_classifier.classes_)
data_frame_predict_result = pd.DataFrame(predict_proba_test_data, columns=class_names)

# NOTE(review): the `id` written here is the row index of `test_data`, not the
# `id` column that was dropped from `source_data` - confirm this is intended.
data_frame_predict_result.insert(loc=0, column='id', value=test_data.index.tolist())
data_frame_predict_result.to_csv(submission_path, index=False)

In [None]:
# Read the written file back and display it as a quick check of its contents.
pd.read_csv('./generation/submission.csv')