In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE

import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

## Load in data

In [2]:
train = pd.read_csv('train_ml2_2021.csv')
train.head()

Unnamed: 0,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v969,v970,v971,v972,v973,v974,v975,v976,v977,target
0,0,0.35,0.36,0.77,0.42,0.48,0.12,0.43,0.49,0.19,...,0.77,0.57,0.56,0.4,0.75,0.08,0.14,0.43,0.88,0
1,0,0.12,0.17,0.38,0.43,0.57,0.09,0.11,0.58,0.35,...,0.07,0.59,0.06,0.17,0.09,0.04,0.04,0.45,0.14,1
2,0,0.67,0.16,0.85,0.41,0.57,0.27,0.83,0.73,0.26,...,0.24,0.42,0.57,0.58,0.37,0.01,0.1,0.38,0.06,1
3,0,0.7,0.2,0.62,0.41,0.41,0.1,0.8,0.52,0.82,...,0.04,0.52,0.25,0.07,0.23,0.04,0.76,0.41,0.59,1
4,0,0.72,0.75,0.74,0.42,0.41,0.76,0.34,0.72,0.06,...,0.81,0.41,0.16,0.03,0.21,0.1,0.12,0.53,0.3,1


In [4]:
problem_dfs = []
for i in range(21):
    problem_dfs.append(train[train['problem_id'] == i])

In [6]:
X_dfs = []
y_dfs = []
for df in problem_dfs:
    X_dfs.append(df.iloc[:, :-1])
    y_dfs.append(df.iloc[:, -1])

In [7]:
X_train_dfs = []
y_train_dfs = []
X_val_dfs = []
y_val_dfs = []
for X_df, y_df in zip(X_dfs, y_dfs):
    X_train, X_val, y_train, y_val = train_test_split(X_df, y_df.ravel(), stratify=y_df)
    X_train_dfs.append(X_train)
    y_train_dfs.append(y_train)
    X_val_dfs.append(X_val)
    y_val_dfs.append(y_val)

In [8]:
test = pd.read_csv('test0.csv')
test.head()

Unnamed: 0,obs_id,problem_id,v0,v1,v2,v3,v4,v5,v6,v7,...,v969,v970,v971,v972,v973,v974,v975,v976,v977,target
0,0,2,0.6,0.05,0.5,0.79,0.06,0.72,0.51,0.34,...,0.5,0.34,0.72,0.12,0.66,0.75,0.52,0.74,0.35,0
1,1,2,0.55,0.08,0.62,0.52,0.05,0.46,0.2,0.11,...,0.68,0.68,0.78,0.17,0.45,0.5,0.59,0.57,0.74,0
2,2,2,0.35,0.85,0.42,0.39,0.04,0.68,0.54,0.55,...,0.77,0.4,0.72,0.79,0.29,0.47,0.49,0.75,0.63,0
3,3,2,0.45,0.63,0.42,0.58,0.03,0.83,0.73,0.23,...,0.76,0.42,0.3,0.06,0.4,0.56,0.42,0.81,0.54,0
4,4,2,0.47,0.11,0.45,0.78,0.43,0.57,0.66,0.39,...,0.69,0.76,0.34,0.1,0.61,0.56,0.4,0.4,0.45,0


In [9]:
X_test = test.iloc[:, 1:-1]
y_test = test.iloc[:, -1]

In [50]:
model = VotingClassifier(estimators=[('lrc', LogisticRegression(multi_class='multinomial')), 
#                                      ('rrc', RidgeClassifier()), 
                                     ('bnc', BernoulliNB()), 
                                     ('gnc', GaussianNB()), 
                                     ('dtc', DecisionTreeClassifier()), 
                                     ('etc', ExtraTreeClassifier()), 
                                     ('rfc', RandomForestClassifier()),
                                     ('etcs', ExtraTreesClassifier())], 
                         voting='soft')

In [51]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    models.append(model.fit(X_train_res, y_train_res))

In [53]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6129032258064516
accuracy:  0.4803921568627451
accuracy:  0.3431372549019608
accuracy:  0.22549019607843138
accuracy:  0.6206896551724138
accuracy:  0.20430107526881722
accuracy:  0.25252525252525254
accuracy:  0.5979381443298969
accuracy:  0.3225806451612903
accuracy:  0.6022727272727273
accuracy:  0.875
accuracy:  0.6282051282051282
accuracy:  0.9
accuracy:  0.6715328467153284
accuracy:  0.37254901960784315
accuracy:  0.9803921568627451
accuracy:  0.45255474452554745
accuracy:  0.3977272727272727
accuracy:  0.576271186440678
accuracy:  0.2796610169491525
accuracy:  0.9411764705882353


In [58]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.5302593659942363

In [60]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    X_train_res, y_train_res = SMOTE(sampling_strategy='minority').fit_resample(X_train, y_train)
    models.append(model.fit(X_train_res, y_train_res))

In [61]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6559139784946236
accuracy:  0.45098039215686275
accuracy:  0.37254901960784315
accuracy:  0.20588235294117646
accuracy:  0.8275862068965517
accuracy:  0.25806451612903225
accuracy:  0.29292929292929293
accuracy:  0.5567010309278351
accuracy:  0.3118279569892473
accuracy:  0.6477272727272727
accuracy:  0.859375
accuracy:  0.5769230769230769
accuracy:  0.8875
accuracy:  0.6934306569343066
accuracy:  0.39215686274509803
accuracy:  0.9607843137254902
accuracy:  0.49635036496350365
accuracy:  0.3522727272727273
accuracy:  0.6610169491525424
accuracy:  0.3813559322033898
accuracy:  0.9607843137254902


In [63]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.5547550432276657

In [64]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    X_train_res, y_train_res = SMOTE(sampling_strategy='not majority').fit_resample(X_train, y_train)
    models.append(model.fit(X_train_res, y_train_res))

In [65]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6129032258064516
accuracy:  0.47058823529411764
accuracy:  0.4117647058823529
accuracy:  0.22549019607843138
accuracy:  0.7471264367816092
accuracy:  0.3010752688172043
accuracy:  0.25252525252525254
accuracy:  0.5876288659793815
accuracy:  0.3333333333333333
accuracy:  0.6704545454545454
accuracy:  0.859375
accuracy:  0.6153846153846154
accuracy:  0.85
accuracy:  0.6642335766423357
accuracy:  0.37254901960784315
accuracy:  0.9607843137254902
accuracy:  0.45985401459854014
accuracy:  0.375
accuracy:  0.6271186440677966
accuracy:  0.3305084745762712
accuracy:  0.9607843137254902


In [66]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.547550432276657

In [67]:
model = RandomForestClassifier()

In [68]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    models.append(model.fit(X_train, y_train))

In [69]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6236559139784946
accuracy:  0.46078431372549017
accuracy:  0.38235294117647056
accuracy:  0.27450980392156865
accuracy:  0.9195402298850575
accuracy:  0.21505376344086022
accuracy:  0.2222222222222222
accuracy:  0.6185567010309279
accuracy:  0.3548387096774194
accuracy:  0.6704545454545454
accuracy:  0.84375
accuracy:  0.6282051282051282
accuracy:  0.925
accuracy:  0.6715328467153284
accuracy:  0.35294117647058826
accuracy:  0.9607843137254902
accuracy:  0.45985401459854014
accuracy:  0.45454545454545453
accuracy:  0.6271186440677966
accuracy:  0.3898305084745763
accuracy:  0.9607843137254902


In [70]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.5619596541786743

In [71]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    X_train_res, y_train_res = SMOTE(sampling_strategy='minority').fit_resample(X_train, y_train)
    models.append(model.fit(X_train_res, y_train_res))

In [72]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6344086021505376
accuracy:  0.4803921568627451
accuracy:  0.4019607843137255
accuracy:  0.2647058823529412
accuracy:  0.7816091954022989
accuracy:  0.3333333333333333
accuracy:  0.35353535353535354
accuracy:  0.5773195876288659
accuracy:  0.3333333333333333
accuracy:  0.7727272727272727
accuracy:  0.875
accuracy:  0.6025641025641025
accuracy:  0.8875
accuracy:  0.6788321167883211
accuracy:  0.38235294117647056
accuracy:  0.9705882352941176
accuracy:  0.5109489051094891
accuracy:  0.375
accuracy:  0.6271186440677966
accuracy:  0.4067796610169492
accuracy:  0.9607843137254902


In [73]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.5730067243035543

In [74]:
models = []
for X_train, y_train in zip(X_train_dfs, y_train_dfs):
    X_train_res, y_train_res = SMOTE(sampling_strategy='not majority').fit_resample(X_train, y_train)
    models.append(model.fit(X_train_res, y_train_res))

In [75]:
y_preds = []
for i, model in enumerate(models):
#     y_pred_prob = model.predict_proba(X_val_dfs[i])
    y_pred = model.predict(X_val_dfs[i])
    for pred in y_pred:
        y_preds.append(pred)
#     print('log loss: ', log_loss(y_val_dfs[i], y_pred_prob))
    print('accuracy: ', accuracy_score(y_val_dfs[i], y_pred))

accuracy:  0.6236559139784946
accuracy:  0.4411764705882353
accuracy:  0.4019607843137255
accuracy:  0.27450980392156865
accuracy:  0.8045977011494253
accuracy:  0.27956989247311825
accuracy:  0.35353535353535354
accuracy:  0.6288659793814433
accuracy:  0.3225806451612903
accuracy:  0.7386363636363636
accuracy:  0.875
accuracy:  0.6025641025641025
accuracy:  0.875
accuracy:  0.6861313868613139
accuracy:  0.3627450980392157
accuracy:  0.9607843137254902
accuracy:  0.45255474452554745
accuracy:  0.36363636363636365
accuracy:  0.635593220338983
accuracy:  0.3728813559322034
accuracy:  0.9607843137254902


In [76]:
y_targ = []
for y_val in y_val_dfs:
    for targ in y_val:
        y_targ.append(targ)
accuracy_score(y_targ, y_preds)

0.5629202689721422