In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

## Load in data

In [2]:
train = pd.read_csv('train_ml2_2021.csv')
train.head()

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v969,v970,v971,v972,v973,v974,v975,v976,v977,target
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,0.77,0.57,0.56,0.4,0.75,0.08,0.14,0.43,0.88,0
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.07,0.59,0.06,0.17,0.09,0.04,0.04,0.45,0.14,1
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.24,0.42,0.57,0.58,0.37,0.01,0.1,0.38,0.06,1
3,0,0.7,0.2,0.62,0.41,0.41,0.1,0.8,0.52,0.82,...,0.04,0.52,0.25,0.07,0.23,0.04,0.76,0.41,0.59,1
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.81,0.41,0.16,0.03,0.21,0.1,0.12,0.53,0.3,1


In [3]:
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

In [4]:
X_train_train, X_val, y_train_train, y_val = train_test_split(X_train, y_train.ravel(), stratify=y_train)

In [5]:
test = pd.read_csv('test0.csv')
test.head()

Unnamed: 0,obs_id,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,...,v969,v970,v971,v972,v973,v974,v975,v976,v977,target
0,0,2,0.6,0.05,0.5,0.79,0.06,0.72,0.51,0.34,...,0.5,0.34,0.72,0.12,0.66,0.75,0.52,0.74,0.35,0
1,1,2,0.55,0.08,0.62,0.52,0.05,0.46,0.2,0.11,...,0.68,0.68,0.78,0.17,0.45,0.5,0.59,0.57,0.74,0
2,2,2,0.35,0.85,0.42,0.39,0.04,0.68,0.54,0.55,...,0.77,0.4,0.72,0.79,0.29,0.47,0.49,0.75,0.63,0
3,3,2,0.45,0.63,0.42,0.58,0.03,0.83,0.73,0.23,...,0.76,0.42,0.3,0.06,0.4,0.56,0.42,0.81,0.54,0
4,4,2,0.47,0.11,0.45,0.78,0.43,0.57,0.66,0.39,...,0.69,0.76,0.34,0.1,0.61,0.56,0.4,0.4,0.45,0


In [6]:
X_test = test.iloc[:, 1:-1]
y_test = test.iloc[:, -1]

### Examine data

In [7]:
train.target.value_counts()

0    4124
1    3175
2     729
3     255
4      19
Name: target, dtype: int64

In [8]:
train.groupby(['problem_id', 'target']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v968,v969,v970,v971,v972,v973,v974,v975,v976,v977
problem_id,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,262,262,262,262,262,262,262,262,262,262,...,262,262,262,262,262,262,262,262,262,262
0,1,107,107,107,107,107,107,107,107,107,107,...,107,107,107,107,107,107,107,107,107,107
1,0,188,188,188,188,188,188,188,188,188,188,...,188,188,188,188,188,188,188,188,188,188
1,1,130,130,130,130,130,130,130,130,130,130,...,130,130,130,130,130,130,130,130,130,130
1,2,31,31,31,31,31,31,31,31,31,31,...,31,31,31,31,31,31,31,31,31,31
1,3,59,59,59,59,59,59,59,59,59,59,...,59,59,59,59,59,59,59,59,59,59
2,0,131,131,131,131,131,131,131,131,131,131,...,131,131,131,131,131,131,131,131,131,131
2,1,277,277,277,277,277,277,277,277,277,277,...,277,277,277,277,277,277,277,277,277,277
3,0,130,130,130,130,130,130,130,130,130,130,...,130,130,130,130,130,130,130,130,130,130
3,1,278,278,278,278,278,278,278,278,278,278,...,278,278,278,278,278,278,278,278,278,278


## Model hyperparameter tuning

In [9]:
# baseline performance without hyperparameter tuning
pipe = Pipeline([('smote', SMOTE()), 
                 ('onevsrestclassifier', OneVsRestClassifier(RandomForestClassifier()))])
pipe.fit(X_train_train, y_train_train)

Pipeline(steps=[('smote', SMOTE()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier()))])

In [10]:
y_pred_val = pipe.predict_proba(X_val)
print('Log loss on validation: ', log_loss(y_val, y_pred_val))
y_hat_val = pipe.predict(X_val)
print('Accuracy on validation: ', accuracy_score(y_val, y_hat_val))

Log loss on validation:  0.7792558059042825
Accuracy on validation:  0.7153179190751445


In [11]:
# hyperparameter grid for RandomForestClassifier
n_estimators = [25, 50, 100, 150, 200, 250]
max_depth = [5, 10, 20, 25, None]
min_samples_split = [2, 4, 6, 8, 10]
min_samples_leaf = [1, 3, 5, 7, 9]
max_samples = [0.25, 0.33, 0.5, 0.66, 0.75, None]
class_weight = ['balanced', 'balanced_subsample', None]
max_features = ['sqrt', 'log2', None]
bootstrap = [True, False]
n_jobs = [-1]

# hyperparameter grid for SMOTE
sampling_strategy = ['minority', 'not majority']
k_neighbors = [1, 3, 5, 7, 9]


param_grid = [{'onevsrestclassifier__estimator__n_estimators': n_estimators,
               'onevsrestclassifier__estimator__max_depth': max_depth,
               'onevsrestclassifier__estimator__min_samples_split': min_samples_split,
               'onevsrestclassifier__estimator__min_samples_leaf': min_samples_leaf,
               'onevsrestclassifier__estimator__max_samples': max_samples,
               'onevsrestclassifier__estimator__class_weight' : class_weight,
               'onevsrestclassifier__estimator__max_features': max_features,
               'onevsrestclassifier__estimator__bootstrap': bootstrap,
               'onevsrestclassifier__estimator__n_jobs': n_jobs, 
               'smote__sampling_strategy': sampling_strategy, 
               'smote__k_neighbors': k_neighbors}]

In [35]:
cv = RandomizedSearchCV(estimator=pipe, 
                        param_distributions=param_grid, 
                        n_iter=10,
                        cv=5, 
                        scoring='neg_log_loss',
                        n_jobs=-1,
                        verbose=10)

best_model = cv.fit(X_train_train, y_train_train)
y_pred = cv.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print(best_model.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy: 0.6970134874759152
Pipeline(steps=[('smote', SMOTE(k_neighbors=9, sampling_strategy='minority')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                                                      max_features=None,
                                                                      max_samples=0.66,
                                                                      min_samples_leaf=3,
                                                                      min_samples_split=6,
                                                                      n_estimators=250,
                                                                      n_jobs=-1)))])


In [12]:
pipe = Pipeline([('smote', SMOTE(k_neighbors=9, sampling_strategy='minority')),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                                                      max_features=None,
                                                                      max_samples=0.66,
                                                                      min_samples_leaf=3,
                                                                      min_samples_split=6,
                                                                      n_estimators=250,
                                                                      n_jobs=-1)))])
pipe.fit(X_train_train, y_train_train)
y_pred_val = pipe.predict(X_val)
print('Accuracy on validation: ', accuracy_score(y_val, y_pred_val))

Accuracy on validation:  0.7162813102119461


In [13]:
y_pred_val = pipe.predict_proba(X_val)
print('Log loss on validation: ', log_loss(y_val, y_pred_val))

Log loss on validation:  0.6723499970286184


In [46]:
pipe.fit(X_train, y_train)
y_pred_test = pipe.predict(X_test)

In [47]:
test['target'] = y_pred_test
submission = test[['obs_id', 'target']]
submission.to_csv('submissions/submission_smote_ovr.csv', index=None)