In [24]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from hyperopt import STATUS_OK, Trials, fmin, hp, rand, tpe
from hyperopt.pyll.base import scope
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the cleaned table; the first CSV column is used as the row index.
df_clean = pd.read_csv(
    './data/df_clean.csv',
    index_col=0,
)

In [3]:
# Labels come from the 'target' column.
y = df_clean['target']
# Features are every column from 'prod_fuzzy' through the last column.
# NOTE(review): the slice is open-ended, so it would include 'target' if that
# column sits at or after 'prod_fuzzy' -- confirm the column order.
X = df_clean.loc[:, 'prod_fuzzy':]

In [4]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0_level_0,prod_fuzzy,desc_fuzzy,prod_desc,query_len,coffee,foam,memory,ray,gb,phone,...,francisco,prom,clothes,case,bay,note,comforter,tick,control,toe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,44,44,68,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,85,50,75,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,100,50,25,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,100,56,43,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,100,100,58,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified -- consider `stratify=y` if the
# label distribution is skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=144,
)

In [6]:
def kappa_score(predictions, y_true=None):
    """Print the quadratic-weighted Cohen's kappa for `predictions`.

    Parameters
    ----------
    predictions : array-like
        Predicted labels to evaluate.
    y_true : array-like, optional
        Reference labels. When omitted, the notebook-level `y_test` is used,
        so existing one-argument calls behave exactly as before.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    if y_true is None:
        # Looked up at call time so a re-run of the split cell is picked up.
        y_true = y_test
    print(cohen_kappa_score(y_true, predictions, weights='quadratic'))

In [7]:
# Sanity-check the training split dimensions.
print(f'{X_train.shape} {y_train.shape}')

(8126, 34) (8126,)


In [8]:
# Baseline: a classifier using the 'most_frequent' strategy, scored with the
# same kappa metric as the real models.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
pred_dum = dummy.predict(X_test)
kappa_score(pred_dum)

0.0


In [9]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
train_x = scaler.transform(X_train)
test_x = scaler.transform(X_test)

In [10]:
# Logistic-regression baseline on the scaled features, with balanced class
# weights.
lr = LogisticRegression(class_weight='balanced', solver='lbfgs').fit(
    train_x, y_train)
pred_lr = lr.predict(test_x)
kappa_score(pred_lr)

0.34582213047745636


In [11]:
# Scorer wrapping quadratic-weighted Cohen's kappa, for use as `scoring=`
# in the cross-validation calls below.
cohen = make_scorer(
    score_func=cohen_kappa_score,
    weights='quadratic',
)

In [12]:
# Logistic regression with built-in cross-validation (6 folds), scored with
# the `cohen` scorer.
lrcv = LogisticRegressionCV(
    cv=6,
    scoring=cohen,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
)
lrcv.fit(train_x, y_train)
# Because `scoring=cohen` was passed, this reports kappa on the held-out split.
lrcv.score(test_x, y_test)

0.3455824434136241

In [13]:
# Hyperopt search space for LogisticRegression. `max_iter` is sampled in
# steps of 10 between 100 and 1000 and cast to int.
lr_params = dict(
    penalty=hp.choice('penalty', ['l2']),
    C=hp.choice('C', [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
    solver=hp.choice('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']),
    class_weight=hp.choice('class_weight', ['balanced']),
    max_iter=scope.int(hp.quniform('max_iter', 100, 1000, 10)),
)

In [14]:
def obj_lr(params):
    """Hyperopt objective for logistic regression.

    Builds a `LogisticRegression` from `params`, evaluates it with 6-fold
    cross-validation on `train_x` / `y_train` using the `cohen` scorer, and
    returns the negated mean fold score as the loss together with
    `STATUS_OK`.
    """
    model = LogisticRegression(**params)
    fold_scores = cross_val_score(
        model, train_x, y_train, cv=6, scoring=cohen)
    # Negated so that a higher kappa means a lower loss.
    return {'status': STATUS_OK, 'loss': -fold_scores.mean()}

In [15]:
# best_lr = fmin(fn=obj_lr, space=lr_params, rstate=np.random.RandomState(
#     44), algo=tpe.suggest, max_evals=50)

In [17]:
# best_lr

In [19]:
# Logistic regression refit with hand-entered hyperparameters.
# NOTE(review): C=4 is not one of the values in the `lr_params` search space.
# If these numbers were copied from an `fmin` result, an `hp.choice` index may
# have been read as a value (index 4 of the C list is 10) -- confirm.
lr_best = LogisticRegression(
    penalty='l2',
    C=4,
    solver='saga',
    class_weight='balanced',
    max_iter=900,
).fit(train_x, y_train)
pred_lr_best = lr_best.predict(test_x)
kappa_score(pred_lr_best)

0.3412888521887424




In [20]:
# RBF-kernel SVM with balanced class weights and a one-vs-one decision
# function shape.
svc_rbf = SVC(
    C=1,
    gamma=1,
    kernel='rbf',
    class_weight='balanced',
    decision_function_shape='ovo',
)
# `svc_fit` holds whatever `fit` returns; later cells predict through it.
svc_fit = svc_rbf.fit(train_x, y_train)

In [21]:
# Score the fitted RBF SVM on the held-out split.
pred_rbf = svc_fit.predict(X=test_x)
kappa_score(pred_rbf)

0.40206860663611144


In [22]:
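# NOTE: recent scikit-learn releases raise ValueError when `random_state` is
# set without `shuffle=True`; add `shuffle=True` before re-enabling this line.
# skf = StratifiedKFold(n_splits=6, random_state=44)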

In [None]:
# cv_score = []
# for train_idx, test_idx in skf.split(X, y):
#     train_set = scaler.fit_transform(X.iloc[train_idx])
#     test_set = scaler.transform(X.iloc[test_idx])
#     fit_svc = svc_rbf.fit(train_set, y.iloc[train_idx])
#     pred_fit = fit_svc.predict(test_set)
#     print(cohen_kappa_score(y.iloc[test_idx], pred_fit, weights='quadratic'))

In [None]:
# Hyperopt search space for the SVC: RBF kernel only, over a grid of
# regularisation strengths and kernel widths.
svc_params = dict(
    kernel=hp.choice('kernel', ['rbf']),
    C=hp.choice('C', [0.1, 1, 10, 100, 1000]),
    gamma=hp.choice('gamma', [0.1, 1, 10, 100]),
)
#              'degree': hp.choice('degree', [0, 1, 2, 3, 4, 5, 6])}

In [None]:
def obj_svc(params):
    """Hyperopt objective for the support-vector classifier.

    Builds an `SVC` from `params` (always with balanced class weights and a
    one-vs-one decision function shape), evaluates it with 6-fold
    cross-validation on `train_x` / `y_train` using the `cohen` scorer, and
    returns the negated mean fold score as the loss together with
    `STATUS_OK`.
    """
    model = SVC(class_weight='balanced', decision_function_shape='ovo',
                **params)
    fold_scores = cross_val_score(
        model, train_x, y_train, cv=6, scoring=cohen)
    # Negated so that a higher kappa means a lower loss.
    return {'status': STATUS_OK, 'loss': -fold_scores.mean()}

In [None]:
# best_svc = fmin(fn=obj_svc, space=svc_params, rstate=np.random.RandomState(
#     44), algo=tpe.suggest, max_evals=50)

In [None]:
# `best_svc` is only defined by the hyperopt `fmin` search, which is currently
# commented out; evaluating the bare name raises NameError on a fresh kernel.
# Re-enable this line together with that `fmin` call.
# best_svc

In [23]:
# Gradient-boosted tree model, trained on the scaled features and scored with
# the same quadratic-weighted kappa as the other models.
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    # `n_jobs` / `random_state` are the scikit-learn style
                    # names; `nthread` / `seed` are deprecated aliases.
                    n_jobs=4,
                    # NOTE(review): hard-coded class count -- confirm it
                    # matches the number of distinct labels in `y_train`.
                    num_class=9,
                    random_state=27)
xgb.fit(train_x, y_train)
pred_xgb = xgb.predict(test_x)
kappa_score(pred_xgb)

0.3991063942555506
