In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, log_loss
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, PredefinedSplit

# OE

## Common preparation

In [None]:
# Load the per-fold data that the model cells below index as
# data_dict[fold]['train_X' | 'train_y' | 'val_X' | 'val_y' | 'test_X' | 'test_y'].
# NOTE(review): the path looks like a Colab Google Drive mount - presumably Drive
# has to be mounted before this cell runs; confirm.
# NOTE(review): read_pickle unpickles arbitrary objects; only load files from a trusted source.
embeds_path = '/content/drive/My Drive/CSCW_H1/data/embeds_OE_extra.pickle'
data_dict = pd.read_pickle(embeds_path)

## Logistic Regression

In [None]:
# Settings for the logistic-regression run
ModelName = LogisticRegression  # estimator class instantiated by the next cell
n_folds = 5                     # the next cell iterates data_dict[0] .. data_dict[n_folds - 1]

In [None]:
# Per-fold model selection + evaluation for the estimator class in `ModelName`:
#   1. grid-search `parameters` using the fold's fixed train/validation split,
#   2. refit the winning configuration on the fold's training split,
#   3. predict the fold's test split and pool the predictions over all folds.

# Fixed seed so repeated runs give the same selection and predictions
# (liblinear uses `random_state` when shuffling the data).
SEED = 0

test_y_all = []        # true test labels, pooled over folds
test_pred_y_all = []   # predicted test labels, pooled over folds
parameters = {'penalty':('l1','l2'), 'C':(0.01,0.1,10,100),'solver':('liblinear',),'max_iter':(1000,)}
# Unfitted template handed to GridSearchCV; kept apart from the per-fold `model`
# so the search never starts from the previous fold's fitted estimator.
base_model = ModelName(random_state=SEED)
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold entries: 0 -> sample is in the single validation fold,
  # -1 -> sample is never used for validation (training only).
  # The order (validation first, then training) must match the data passed to fit().
  ps = PredefinedSplit([0]*len(fold_data['val_X']) + [-1]*len(fold_data['train_X']))
  clf = GridSearchCV(base_model, parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(fold_data['val_X'].tolist()+fold_data['train_X'].tolist(),
          fold_data['val_y'].tolist()+fold_data['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the best configuration is fitted here on the training split only.
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
# One report over the pooled test predictions of every fold
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 1
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 2
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 3
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 4
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.41      0.08      0.13       187
           1       0.65      0.96      0.78       668
           2       0.58      0.20      0.29       238

    accuracy                           0.64      1093
   macro avg       0.55      0.41      0.40      1093
weighted avg       0.60      0.64      0.56      1093



## SVC with RBF Kernel

In [None]:
# Settings for the SVC run of this section (the next cell's grid does not set `kernel`)
ModelName = SVC  # estimator class instantiated by the next cell
n_folds = 5      # the next cell iterates data_dict[0] .. data_dict[n_folds - 1]

In [None]:
# Per-fold model selection + evaluation for the estimator class in `ModelName`:
#   1. grid-search `parameters` using the fold's fixed train/validation split,
#   2. refit the winning configuration on the fold's training split,
#   3. predict the fold's test split and pool the predictions over all folds.

# Fixed seed: with probability=True, SVC shuffles the data for its internal
# probability calibration, and the neg_log_loss used for selection depends on it.
SEED = 0

test_y_all = []        # true test labels, pooled over folds
test_pred_y_all = []   # predicted test labels, pooled over folds
# `kernel` is not part of the grid, so SVC's default kernel is used.
# probability=True is required because the search is scored with neg_log_loss.
parameters = {'C':(0.01,0.1,10,100),'probability':(True,)}
# Unfitted template handed to GridSearchCV; kept apart from the per-fold `model`
# so the search never starts from the previous fold's fitted estimator.
base_model = ModelName(random_state=SEED)
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold entries: 0 -> sample is in the single validation fold,
  # -1 -> sample is never used for validation (training only).
  # The order (validation first, then training) must match the data passed to fit().
  ps = PredefinedSplit([0]*len(fold_data['val_X']) + [-1]*len(fold_data['train_X']))
  clf = GridSearchCV(base_model, parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(fold_data['val_X'].tolist()+fold_data['train_X'].tolist(),
          fold_data['val_y'].tolist()+fold_data['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the best configuration is fitted here on the training split only.
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
# One report over the pooled test predictions of every fold
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 0.1, 'probability': True}
fold 1
{'C': 0.1, 'probability': True}
fold 2
{'C': 0.1, 'probability': True}
fold 3
{'C': 10, 'probability': True}
fold 4
{'C': 10, 'probability': True}
              precision    recall  f1-score   support

           0       0.33      0.03      0.06       187
           1       0.63      0.98      0.77       668
           2       0.68      0.09      0.16       238

    accuracy                           0.62      1093
   macro avg       0.55      0.37      0.33      1093
weighted avg       0.59      0.62      0.51      1093



## SVC with Polynomial kernel

In [None]:
# Settings for the polynomial-kernel SVC run (the kernel itself is set in the next cell's grid)
ModelName = SVC  # estimator class instantiated by the next cell
n_folds = 5      # the next cell iterates data_dict[0] .. data_dict[n_folds - 1]

In [None]:
# Per-fold model selection + evaluation for the estimator class in `ModelName`:
#   1. grid-search `parameters` using the fold's fixed train/validation split,
#   2. refit the winning configuration on the fold's training split,
#   3. predict the fold's test split and pool the predictions over all folds.

# Fixed seed: with probability=True, SVC shuffles the data for its internal
# probability calibration, and the neg_log_loss used for selection depends on it.
SEED = 0

test_y_all = []        # true test labels, pooled over folds
test_pred_y_all = []   # predicted test labels, pooled over folds
# Polynomial kernel with the degree searched jointly with C.
# probability=True is required because the search is scored with neg_log_loss.
parameters = {'C':(0.01,0.1,10,100),'probability':(True,),'kernel':('poly',),'degree':(3,5,7,9)}
# Unfitted template handed to GridSearchCV; kept apart from the per-fold `model`
# so the search never starts from the previous fold's fitted estimator.
base_model = ModelName(random_state=SEED)
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold entries: 0 -> sample is in the single validation fold,
  # -1 -> sample is never used for validation (training only).
  # The order (validation first, then training) must match the data passed to fit().
  ps = PredefinedSplit([0]*len(fold_data['val_X']) + [-1]*len(fold_data['train_X']))
  clf = GridSearchCV(base_model, parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(fold_data['val_X'].tolist()+fold_data['train_X'].tolist(),
          fold_data['val_y'].tolist()+fold_data['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the best configuration is fitted here on the training split only.
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
# One report over the pooled test predictions of every fold
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 0.1, 'degree': 3, 'kernel': 'poly', 'probability': True}
fold 1
{'C': 0.1, 'degree': 3, 'kernel': 'poly', 'probability': True}
fold 2
{'C': 0.1, 'degree': 3, 'kernel': 'poly', 'probability': True}
fold 3
{'C': 10, 'degree': 7, 'kernel': 'poly', 'probability': True}
fold 4
{'C': 0.01, 'degree': 3, 'kernel': 'poly', 'probability': True}
              precision    recall  f1-score   support

           0       0.35      0.05      0.08       187
           1       0.63      0.97      0.76       668
           2       0.53      0.07      0.13       238

    accuracy                           0.62      1093
   macro avg       0.50      0.36      0.32      1093
weighted avg       0.56      0.62      0.51      1093



## Neural Network

In [None]:
# Settings for the neural-network (MLP) run
ModelName = MLPClassifier  # estimator class instantiated by the next cell
n_folds = 5                # the next cell iterates data_dict[0] .. data_dict[n_folds - 1]

In [None]:
# Per-fold model selection + evaluation for the estimator class in `ModelName`:
#   1. grid-search `parameters` using the fold's fixed train/validation split,
#   2. refit the winning configuration on the fold's training split,
#   3. predict the fold's test split and pool the predictions over all folds.

# Fixed seed: MLPClassifier draws its initial weights and mini-batch order at random,
# so without it both the selected hyperparameters and the final report change per run.
SEED = 0

test_y_all = []        # true test labels, pooled over folds
test_pred_y_all = []   # predicted test labels, pooled over folds
parameters = {
    'hidden_layer_sizes': ((8,8),(16,8),(32,8),(64,8)),
    'max_iter': (500,),
    'batch_size': (16,32,64),
    'solver': ('adam',),
}
# Unfitted template handed to GridSearchCV; kept apart from the per-fold `model`
# so the search never starts from the previous fold's fitted estimator.
base_model = ModelName(random_state=SEED)
for fold in range(n_folds):
  print('fold',fold)
  fold_data = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold entries: 0 -> sample is in the single validation fold,
  # -1 -> sample is never used for validation (training only).
  # The order (validation first, then training) must match the data passed to fit().
  ps = PredefinedSplit([0]*len(fold_data['val_X']) + [-1]*len(fold_data['train_X']))
  clf = GridSearchCV(base_model, parameters, cv=ps, scoring='neg_log_loss', refit=False)
  clf.fit(fold_data['val_X'].tolist()+fold_data['train_X'].tolist(),
          fold_data['val_y'].tolist()+fold_data['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the best configuration is fitted here on the training split only.
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(fold_data['train_X'], fold_data['train_y'])
  test_y_all.extend(fold_data['test_y'].tolist())
  test_pred_y_all.extend(model.predict(fold_data['test_X']))
# One report over the pooled test predictions of every fold
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'batch_size': 32, 'hidden_layer_sizes': (16, 8), 'max_iter': 500, 'solver': 'adam'}
fold 1




{'batch_size': 64, 'hidden_layer_sizes': (64, 8), 'max_iter': 500, 'solver': 'adam'}
fold 2
{'batch_size': 32, 'hidden_layer_sizes': (64, 8), 'max_iter': 500, 'solver': 'adam'}




{'batch_size': 32, 'hidden_layer_sizes': (32, 8), 'max_iter': 500, 'solver': 'adam'}
fold 4




{'batch_size': 16, 'hidden_layer_sizes': (8, 8), 'max_iter': 500, 'solver': 'adam'}
              precision    recall  f1-score   support

           0       0.33      0.35      0.34       187
           1       0.71      0.72      0.71       668
           2       0.36      0.32      0.34       238

    accuracy                           0.57      1093
   macro avg       0.46      0.46      0.46      1093
weighted avg       0.57      0.57      0.57      1093

