In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, log_loss
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, PredefinedSplit

# ST

## Common preparation

In [2]:
# Load the pickled embeddings object used by every model section below.
# Later cells index it as data_dict[fold][key] with keys
# 'train_X'/'train_y', 'val_X'/'val_y' and 'test_X'/'test_y'.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
# NOTE(review): the path presumably requires Google Drive to be mounted at
# /content/drive before this cell runs - confirm, no mount step is visible here.
embeds_path = '/content/drive/My Drive/CSCW_H1/data/embeds_ST.pickle'
data_dict = pd.read_pickle(embeds_path)

## Logistic Regression

In [3]:
# Settings consumed by the evaluation cell that follows.
ModelName = LogisticRegression  # estimator class instantiated for search and final fit
n_folds = 5  # the loop reads data_dict[0] .. data_dict[n_folds - 1]

In [4]:
# Per-fold evaluation: pick hyperparameters on the fold's validation split,
# refit on the training split only, predict the test split, and score the
# predictions pooled over all folds in a single report.
SEED = 42  # liblinear shuffles the data; a fixed seed makes the numbers repeatable
test_y_all = []
test_pred_y_all = []
parameters = {'penalty':('l1','l2'), 'C':(0.01,0.1,10,100),'solver':('liblinear',),'max_iter':(1000,)}
for fold in range(n_folds):
  print('fold',fold)
  split = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold value 0 puts the validation rows in the single held-out fold;
  # -1 keeps the training rows in the fitting set and never scores on them.
  ps = PredefinedSplit([0]*len(split['val_X']) + [-1]*len(split['train_X']))
  # Build a fresh estimator for every fold so that nothing tuned on a previous
  # fold can leak into this search through hyperparameters outside the grid.
  clf = GridSearchCV(ModelName(random_state=SEED), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  # Row order must match the mask above: validation rows first, then training rows.
  clf.fit(split['val_X'].tolist()+split['train_X'].tolist(),
          split['val_y'].tolist()+split['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the winning configuration is trained here on the
  # training split alone (the validation rows are not used for the final fit).
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(split['train_X'],split['train_y'])
  test_y_all.extend(split['test_y'].tolist())
  test_pred_y_all.extend(model.predict(split['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 1
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 2
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 3
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
fold 4
{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.86      0.19      0.31        63
           1       0.50      0.06      0.11        82
           2       0.69      0.99      0.82       285

    accuracy                           0.70       430
   macro avg       0.68      0.41      0.41       430
weighted avg       0.68      0.70      0.61       430



## SVC with RBF Kernel

In [5]:
# Settings consumed by the evaluation cell that follows.
ModelName = SVC  # estimator class instantiated for search and final fit
n_folds = 5  # the loop reads data_dict[0] .. data_dict[n_folds - 1]

In [6]:
# Per-fold evaluation: pick hyperparameters on the fold's validation split,
# refit on the training split only, predict the test split, and score the
# predictions pooled over all folds in a single report.
# No 'kernel' entry in the grid: the estimator's default kernel is used.
SEED = 42  # probability=True calibrates with an internal randomized CV; seed it for repeatable scores
test_y_all = []
test_pred_y_all = []
# probability=True is required because the search is scored with neg_log_loss.
parameters = {'C':(0.01,0.1,10,100),'probability':(True,)}
for fold in range(n_folds):
  print('fold',fold)
  split = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold value 0 puts the validation rows in the single held-out fold;
  # -1 keeps the training rows in the fitting set and never scores on them.
  ps = PredefinedSplit([0]*len(split['val_X']) + [-1]*len(split['train_X']))
  # Build a fresh estimator for every fold so that nothing tuned on a previous
  # fold can leak into this search through hyperparameters outside the grid.
  clf = GridSearchCV(ModelName(random_state=SEED), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  # Row order must match the mask above: validation rows first, then training rows.
  clf.fit(split['val_X'].tolist()+split['train_X'].tolist(),
          split['val_y'].tolist()+split['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the winning configuration is trained here on the
  # training split alone (the validation rows are not used for the final fit).
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(split['train_X'],split['train_y'])
  test_y_all.extend(split['test_y'].tolist())
  test_pred_y_all.extend(model.predict(split['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 100, 'probability': True}
fold 1
{'C': 100, 'probability': True}
fold 2
{'C': 10, 'probability': True}
fold 3
{'C': 100, 'probability': True}
fold 4
{'C': 100, 'probability': True}
              precision    recall  f1-score   support

           0       0.59      0.52      0.55        63
           1       0.46      0.33      0.38        82
           2       0.77      0.85      0.81       285

    accuracy                           0.70       430
   macro avg       0.61      0.57      0.58       430
weighted avg       0.68      0.70      0.69       430



## SVC with Polynomial kernel

In [7]:
# Settings consumed by the evaluation cell that follows.
ModelName = SVC  # estimator class instantiated for search and final fit
n_folds = 5  # the loop reads data_dict[0] .. data_dict[n_folds - 1]

In [8]:
# Per-fold evaluation: pick hyperparameters on the fold's validation split,
# refit on the training split only, predict the test split, and score the
# predictions pooled over all folds in a single report.
SEED = 42  # probability=True calibrates with an internal randomized CV; seed it for repeatable scores
test_y_all = []
test_pred_y_all = []
# probability=True is required because the search is scored with neg_log_loss;
# the kernel is pinned to 'poly' and only C and the polynomial degree are tuned.
parameters = {'C':(0.01,0.1,10,100),'probability':(True,),'kernel':('poly',),'degree':(3,5,7,9)}
for fold in range(n_folds):
  print('fold',fold)
  split = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold value 0 puts the validation rows in the single held-out fold;
  # -1 keeps the training rows in the fitting set and never scores on them.
  ps = PredefinedSplit([0]*len(split['val_X']) + [-1]*len(split['train_X']))
  # Build a fresh estimator for every fold so that nothing tuned on a previous
  # fold can leak into this search through hyperparameters outside the grid.
  clf = GridSearchCV(ModelName(random_state=SEED), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  # Row order must match the mask above: validation rows first, then training rows.
  clf.fit(split['val_X'].tolist()+split['train_X'].tolist(),
          split['val_y'].tolist()+split['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the winning configuration is trained here on the
  # training split alone (the validation rows are not used for the final fit).
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(split['train_X'],split['train_y'])
  test_y_all.extend(split['test_y'].tolist())
  test_pred_y_all.extend(model.predict(split['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0
{'C': 100, 'degree': 5, 'kernel': 'poly', 'probability': True}
fold 1
{'C': 10, 'degree': 7, 'kernel': 'poly', 'probability': True}
fold 2
{'C': 10, 'degree': 7, 'kernel': 'poly', 'probability': True}
fold 3
{'C': 10, 'degree': 9, 'kernel': 'poly', 'probability': True}
fold 4
{'C': 0.01, 'degree': 3, 'kernel': 'poly', 'probability': True}
              precision    recall  f1-score   support

           0       0.60      0.44      0.51        63
           1       0.37      0.23      0.29        82
           2       0.74      0.87      0.80       285

    accuracy                           0.68       430
   macro avg       0.57      0.51      0.53       430
weighted avg       0.65      0.68      0.66       430



## Neural Network

In [9]:
# Settings consumed by the evaluation cell that follows.
ModelName = MLPClassifier  # estimator class instantiated for search and final fit
n_folds = 5  # the loop reads data_dict[0] .. data_dict[n_folds - 1]

In [10]:
# Per-fold evaluation: pick hyperparameters on the fold's validation split,
# refit on the training split only, predict the test split, and score the
# predictions pooled over all folds in a single report.
SEED = 42  # MLP weight init and batch shuffling are random; seed them for repeatable results
test_y_all = []
test_pred_y_all = []
parameters = {'hidden_layer_sizes':((8,8),(16,8),(32,8),(64,8)),'max_iter':(500,),'batch_size':(16,32,64),
              'solver':('adam',)}
for fold in range(n_folds):
  print('fold',fold)
  split = data_dict[fold]
  ######## Hyperparameter search#########################
  # test_fold value 0 puts the validation rows in the single held-out fold;
  # -1 keeps the training rows in the fitting set and never scores on them.
  ps = PredefinedSplit([0]*len(split['val_X']) + [-1]*len(split['train_X']))
  # Build a fresh estimator for every fold so that nothing tuned on a previous
  # fold can leak into this search through hyperparameters outside the grid.
  clf = GridSearchCV(ModelName(random_state=SEED), parameters, cv=ps, scoring='neg_log_loss', refit=False)
  # Row order must match the mask above: validation rows first, then training rows.
  clf.fit(split['val_X'].tolist()+split['train_X'].tolist(),
          split['val_y'].tolist()+split['train_y'].tolist())
  print(clf.best_params_)
  ######### Fit-predict with best params#################
  # refit=False above, so the winning configuration is trained here on the
  # training split alone (the validation rows are not used for the final fit).
  model = ModelName(random_state=SEED, **clf.best_params_)
  model.fit(split['train_X'],split['train_y'])
  test_y_all.extend(split['test_y'].tolist())
  test_pred_y_all.extend(model.predict(split['test_X']))
print(classification_report(test_y_all, test_pred_y_all))

fold 0




{'batch_size': 64, 'hidden_layer_sizes': (16, 8), 'max_iter': 500, 'solver': 'adam'}
fold 1
{'batch_size': 32, 'hidden_layer_sizes': (64, 8), 'max_iter': 500, 'solver': 'adam'}
fold 2
{'batch_size': 64, 'hidden_layer_sizes': (32, 8), 'max_iter': 500, 'solver': 'adam'}
fold 3
{'batch_size': 64, 'hidden_layer_sizes': (32, 8), 'max_iter': 500, 'solver': 'adam'}
fold 4
{'batch_size': 16, 'hidden_layer_sizes': (16, 8), 'max_iter': 500, 'solver': 'adam'}
              precision    recall  f1-score   support

           0       0.68      0.54      0.60        63
           1       0.40      0.29      0.34        82
           2       0.78      0.87      0.82       285

    accuracy                           0.71       430
   macro avg       0.62      0.57      0.59       430
weighted avg       0.69      0.71      0.70       430

