# Machine Learning

Based on the preprocessed dataset as created in exploratory analysis, this notebook peroform train set cross validations with Logistics Regression, SVC, Random Forecast, and Extreme Gradient Boosting

In [73]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from hyperopt import STATUS_OK, Trials, fmin, hp, rand, tpe
from hyperopt.pyll.base import scope
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.compose import ColumnTransformer


warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
df_clean = pd.read_csv('./data/df_clean.csv', index_col=0)

In [4]:
X = df_clean.loc[:, 'sim':]
y = df_clean['median_relevance']

In [5]:
X = pd.get_dummies(X, columns=['query_len'], drop_first=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=144)

In [7]:
def kappa_score(predictions):
    print(cohen_kappa_score(y_test, predictions, weights='quadratic'))

In [8]:
print(X_train.shape, y_train.shape)

(8126, 837) (8126,)


## Dummy Classifier

In [14]:
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
pred_dum = dummy.predict(X_test)
kappa_score(pred_dum)

0.0


### Transfomration

Normalize both 'sim' and 'fuzzy' column to allow better fitting to different machine learning models.

In [19]:
col = ['sim', 'fuzzy']
ct = ColumnTransformer([('std_trans', StandardScaler(), col)], remainder='passthrough')
train_x = ct.fit_transform(X_train)
test_x = ct.transform(X_test)

## Logistics Regression

### Base

In [20]:
lr = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=1000)
lr.fit(train_x, y_train)
pred_lr = lr.predict(test_x)
kappa_score(pred_lr)

0.4946302247227575


Cross Valiadtion

In [21]:
cohen = make_scorer(cohen_kappa_score, weights='quadratic')

In [22]:
lrcv = LogisticRegressionCV(
    solver='lbfgs', scoring=cohen, cv=6, class_weight='balanced', max_iter=1000)
lrcv.fit(train_x, y_train)
lrcv.score(test_x, y_test)

0.4943136766737083

### Tunning

In [42]:
lr_params = {'penalty': hp.choice('penalty', ['l2']),
             'C': hp.choice('C', [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
             'solver': hp.choice('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']),
             'class_weight': hp.choice('class_weight', ['balanced']),
             'max_iter': scope.int(hp.quniform('max_iter', 500, 3000, 100))}

In [43]:
def obj_lr(params):
    clf = LogisticRegression(**params)
    skfold = StratifiedKFold(n_splits=6, random_state=2020, shuffle=True)
    best_score = cross_val_score(clf, train_x, y_train, scoring=cohen, cv=skfold).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [44]:
best_lr = fmin(fn=obj_lr, space=lr_params, algo=tpe.suggest, max_evals=20, trials=Trials())

 45%|████▌     | 9/20 [01:58<02:59, 16.29s/trial, best loss: -0.4994361541906813] 







 90%|█████████ | 18/20 [33:21<04:44, 142.35s/trial, best loss: -0.4994361541906813]  






100%|██████████| 20/20 [46:45<00:00, 140.30s/trial, best loss: -0.4994361541906813]


In [45]:
best_lr

{'C': 3, 'class_weight': 0, 'max_iter': 2100.0, 'penalty': 0, 'solver': 1}

In [49]:
lr_best = LogisticRegression(
    C=1, class_weight='balanced', solver='newton-cg', max_iter=2100, penalty='l2')
lr_best.fit(train_x, y_train)
pred_lr_best = lr_best.predict(test_x)
kappa_score(pred_lr_best)

0.4946302247227575


## Support Vector Machine

### Base

In [50]:
svc_rbf = SVC(kernel='rbf', gamma=1, C=1,
              decision_function_shape='ovo', class_weight='balanced')
svc_fit = svc_rbf.fit(train_x, y_train)
pred_rbf = svc_fit.predict(test_x)
kappa_score(pred_rbf)

0.5112333927669344


Cross Validation

In [57]:
skf = StratifiedKFold(n_splits=6, random_state=44)
scaler = StandardScaler()

In [78]:
def cv_func(cv_list, clf):
    """ take a classifier, perform kfold validation and add mean score to a list
    
    args:
        cv_list: list to be used to hold cv mean score
        clf: classifier being used
    """
    
    for train_idx, test_idx in skf.split(X, y):
        train_set = scaler.fit_transform(X.iloc[train_idx])
        test_set = scaler.transform(X.iloc[test_idx])
        fit_svc = clf.fit(train_set, y.iloc[train_idx])
        pred_fit = clf.predict(test_set)
        cv_list.append(cohen_kappa_score(y.iloc[test_idx], pred_fit, weights='quadratic'))

In [59]:
svc_cv = []
cv_func(svc_cv, svc_rbf)

In [62]:
print(np.mean(svc_cv))

0.5220749771762127


### Tunning

In [64]:
svc_params = {'kernel': hp.choice('kernel', ['rbf']),
              'C': hp.choice('C', [0.1, 1, 10, 100, 1000]),
              'gamma': hp.choice('gamma', [0.1, 1, 10, 100]),
             'degree': hp.choice('degree', [0, 2, 4, 6])}

In [67]:
def obj_svc(params):
    clf = SVC(decision_function_shape='ovo', class_weight='balanced', **params)
    skfold = StratifiedKFold(n_splits=6, random_state=2020, shuffle=True)
    best_score = cross_val_score(
        clf, train_x, y_train, scoring=cohen, cv=skfold).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [68]:
best_svc = fmin(fn=obj_svc, space=svc_params, algo=tpe.suggest, max_evals=10)

100%|██████████| 10/10 [1:06:38<00:00, 399.88s/trial, best loss: -0.5042102450204642]


In [69]:
best_svc

{'C': 2, 'degree': 2, 'gamma': 0, 'kernel': 0}

In [71]:
svc_best = SVC(kernel='rbf', gamma=0.1, C=10,
              decision_function_shape='ovo', class_weight='balanced', degree=2)
svc_bestfit = svc_best.fit(train_x, y_train)
pred_svc_best = svc_bestfit.predict(test_x)
kappa_score(pred_svc_best)

0.5144466416684622


## Extra Tree Classifier

### Base

In [75]:
et = ExtraTreesClassifier(
    n_estimators=1000, min_samples_split=15).fit(train_x, y_train)
et_fit = et.fit(train_x, y_train)
pred_et = et_fit.predict(test_x)
kappa_score(pred_et)

0.578275443573612


Cross Valiadtion

In [79]:
et_cv =[]
cv_func(et_cv, et)

In [80]:
print(np.mean(et_cv))

0.5639095114404307


### Tunning

In [81]:
et_params = {'n_estimators': scope.int(hp.quniform('n_estimators', 500, 3000, 100)),
             'min_samples_split':  hp.choice('min_samples_split', [2, 5, 10]),
             'min_samples_leaf':  hp.choice('min_samples_leaf', [1, 3, 9]),
             'bootstrap': hp.choice('bootstrap', [True, False])
            }

In [82]:
def obj_et(params):
    clf = ExtraTreesClassifier(**params)
    skfold = StratifiedKFold(n_splits=6, random_state=2020, shuffle=True)
    best_score = cross_val_score(
        clf, train_x, y_train, scoring=cohen, cv=skfold).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [85]:
best_et = fmin(fn=obj_et, space=et_params, algo=tpe.suggest, max_evals=10)

100%|██████████| 10/10 [1:31:06<00:00, 546.62s/trial, best loss: -0.5554087720520255]


In [86]:
best_et

{'bootstrap': 0,
 'min_samples_leaf': 0,
 'min_samples_split': 1,
 'n_estimators': 2600.0}

In [87]:
et_best = ExtraTreesClassifier(
    n_estimators=2600, min_samples_split=5, bootstrap=True, min_samples_leaf=1).fit(train_x, y_train)
et_bestfit = et_best.fit(train_x, y_train)
pred_et_best = et_bestfit.predict(test_x)
kappa_score(pred_et_best)

0.5674791534666188


## Random Forest

### Base

In [89]:
rf = RandomForestClassifier(n_estimators=1000, min_samples_split=15, min_samples_leaf=10)
rf.fit(train_x, y_train)
pred_rf = rf.predict(test_x)
kappa_score(pred_rf)

0.554882821276191


Cross Validation

In [90]:
rf_cv =[]
cv_func(rf_cv, rf)

In [92]:
print(np.mean(rf_cv))

0.564400327901409


### Tunning

In [101]:
rf_params = {'n_estimators': scope.int(hp.quniform('n_estimators', 500, 3000, 100)),
             'min_samples_split': hp.choice('min_samples_split', [2, 6, 18, 32]),
             'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 10, 100, 10)),
             'bootstrap': hp.choice('bootstrap', [True, False])
            }

In [102]:
def obj_rf(params):
    clf = RandomForestClassifier(**params)
    skfold = StratifiedKFold(n_splits=6, random_state=2020, shuffle=True)
    best_score = cross_val_score(
        clf, train_x, y_train, scoring=cohen, cv=skfold).mean()
    return {'loss': -best_score, 'status': STATUS_OK}

In [None]:
best_rf = fmin(fn=obj_rf, space=rf_params, algo=tpe.suggest, max_evals=10)

 30%|███       | 3/10 [11:17<24:49, 212.72s/trial, best loss: -0.020918877876076714]

In [None]:
best_rf

## XGBoost

### Base

In [21]:
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=27)
xgb.fit(train_x, y_train)
pred_xgb = xgb.predict(test_x)
kappa_score(pred_xgb)

0.5602492400641114
