In [None]:
# imports

# stdlib
import os

# extra
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# local
from model_helpers import *

In [None]:
# Load the training and test sets. `load_data` is not defined in the cells of
# this notebook; presumably it comes from the `from model_helpers import *`
# star import — TODO confirm.
train, test = load_data()

In [None]:
# Separate the target column from the predictors, using every row of `train`
# (no hold-out). X and y are what the final model is fitted on further down.
y = train['cuisine']
X = train.drop(columns=['cuisine'])

In [None]:
# Split `train` into a fitting part and a validation part; the four pieces are
# passed to `test_hyperparams` for the hyper-parameter search.
# NOTE(review): the unpacking order is (X_train, y_train, X_val, y_val), which
# differs from sklearn's train_test_split ordering. `train_validate_split` is
# not defined in this notebook — confirm this matches what it returns.
X_train, y_train, X_val, y_val = train_validate_split(train)

In [None]:
# Candidate estimators and their hyper-parameter grids.
#
# Each `<name>_grid` maps a constructor parameter to the list of values to try;
# an estimator/grid pair is handed to `test_hyperparams` in a later cell.
# The trailing comments are the author's tuning notes (they look like the best
# value found or the range explored — not verified here). The
# "best: smoothing=..." remark after each grid appears to record the score
# reached with that model.
#
# Every estimator is seeded with 1 so repeated searches are comparable.

# --- Decision tree ---------------------------------------------------------
# random_state is set because sklearn permutes the features at every split, so
# an unseeded tree can change between runs when several splits tie.
dtc = DecisionTreeClassifier(max_depth=None, random_state=1)
dtc_grid = {
    'criterion': ['gini'], # gini
    'class_weight': [None], # None
    'min_samples_split': [10], # 2-60
    'min_samples_leaf': [40], # 40
} # best: smoothing=.6: 75.4

# --- AdaBoost over depth-1 trees -------------------------------------------
dtcabc = DecisionTreeClassifier(max_depth=1, criterion='gini', min_samples_split=2, min_samples_leaf=2, class_weight=None)
# random_state on the ensemble is what seeds each boosted tree.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and the old name was removed in 1.4; switch the keyword when the environment
# is upgraded.
abc = AdaBoostClassifier(base_estimator=dtcabc, random_state=1)
abc_grid = {
    'n_estimators': [60], # 60
    'learning_rate': [.5] # .5
} # best: smoothing=.6: 69.4

# --- Random forest ---------------------------------------------------------
rfc = RandomForestClassifier(max_depth=None, random_state=1)
rfc_grid = {
    'min_samples_split': [2], # 2
    'min_samples_leaf': [1], # 1
    'n_estimators': [200], # 200
    'class_weight': [None], # None
    'criterion': ['gini'] #
} # best: smoothing=.6: 78.1

# --- XGBoost ---------------------------------------------------------------
# NOTE(review): 'reg:logistic' is combined with num_class=20 here — confirm
# which objective xgboost actually uses for the multi-class problem.
xgc = XGBClassifier(seed=1, num_class=20)
xgc_grid = {
    'objective': ['reg:logistic'], # reg:logistic, multi:softmax
    'booster': ['dart'], # dart
    'max_depth': [10], # 5, 10, 20
    'lambda': [1], # 1, 2, 5
    'alpha': [0], # 0, 1
    'gamma': [0], # 0, 1
    'eta': [.3], # range: [0,1]
    'base_score': [.5], # .1, .5, .9
    'min_child_weight': [0], # 0, 1, 2
    'max_delta_step': [5], # 0, 1-10 larger
    'subsample': [1], # range: (0,1]
    'sample_type': ['uniform', 'weighted'], # uniform, weighted
    'normalize_type': ['tree', 'forest'], # tree, forest
    'rate_drop': [0] # 0-1
} # best: smoothing=.6: 80.7

# --- Logistic regression ---------------------------------------------------
# NOTE(review): the note on 'class_weight' says None while the grid tries
# 'balanced' — confirm which one is intended.
lrc = LogisticRegression(random_state=1)
lrc_grid = {
    'C': [145, 150, 155], # 150
    'fit_intercept': [True], # True
    'solver': ['lbfgs'], # lbfgs
    'penalty': ['l2'], # l2 (l2 only: newton-cg, sag, lbfgs)
    'multi_class': ['multinomial'], # multinomial (multinomial: newton-cg, sag, saga, lbfgs)
    'class_weight': ['balanced'], # None
    'dual': [False], # False
    'max_iter': [500] # 500
} # best: smoothing=.6: 80.3

# --- Linear model trained with SGD -----------------------------------------
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1 and the
# old spelling was removed in 1.3; update when the environment is upgraded.
sgd = SGDClassifier(random_state=1, fit_intercept=True, penalty='l2')
sgd_grid = {
    'loss': ['log'], # log
    'alpha': [1e-6], # 1e-6
    'max_iter': [1100] # 1100
} # best: smoothing=.6: 79.5

# --- Support vector classifier ---------------------------------------------
# NOTE(review): the notes on 'C' (40) and 'gamma' (1.4) name values that are not
# in the lists currently being searched.
svc = SVC(random_state=1, probability=False)
svc_grid = {
    'C': [10, 15, 20], # 40
    'kernel': ['rbf'], # rbf
    'gamma': [.5, 1], # 1.4
    'max_iter': [2000], # 2000
    'shrinking': [True], # True
    'class_weight': [None], # None
    'decision_function_shape': ['ovr'] # ‘ovr’
} # smoothing=.2: 80.8

In [None]:
# Evaluate the SVC grid: pass the estimator, its grid, the scoring function and
# the train/validation split to `test_hyperparams`. The helper is not defined in
# this notebook, so how it searches and what it reports cannot be stated here.
# Swap `svc, svc_grid` for another estimator/grid pair to tune a different model.
test_hyperparams(svc, svc_grid, metrics.accuracy_score, X_train, y_train, X_val, y_val)

In [None]:
# SVC configuration that is fitted on the full training set in the next cell.
# This rebinds `svc`, replacing the instance used for the grid search.
svc = SVC(
    kernel='rbf',
    C=20,
    gamma=0.5,
    max_iter=2000,
    shrinking=True,
    class_weight=None,
    probability=False,
    random_state=1,
)

In [None]:
# Refit on the full training set (X, y) and predict a label for each row of `test`.
# `fit` returns the estimator itself, so `model` and `svc` are the same object.
# assumes `test` holds exactly the feature columns of X — TODO confirm.
model = svc.fit(X, y)
preds = model.predict(test)

In [None]:
# Wrap the predictions in a single-column frame keyed by the test set's index,
# then preview the first rows.
output = pd.DataFrame(
    data=preds,
    columns=['cuisine'],
    index=test.index,
)
output.head()

In [None]:
# Put predictions and reference labels side by side. concat along columns
# aligns the two inputs on their index, so a row present in only one of them
# gets NaN in the other column.
# NOTE(review): `answers` is not defined in any cell of this notebook —
# presumably it comes from the `model_helpers` star import or leftover kernel
# state; confirm, otherwise this fails on a fresh kernel.
compare = pd.concat([output, answers], axis='columns')
# Assigning two names requires exactly two columns, i.e. `answers` must
# contribute a single column.
compare.columns = ['pred', 'real']

In [None]:
# Keep only the rows where the predicted label differs from the reference label.
errors = compare.query('pred != real')

In [None]:
# Number of misclassified rows.
len(errors)

In [None]:
# Write the predictions to CSV: the index column followed by `cuisine`.
# Expected header per the original note: id, cuisine. The index label in the
# file is taken from the name of `test.index` — presumably `id`; TODO confirm.
submission_path = 'preds/drop_bad_it_us_smoothing2.csv'
# Create the target folder first so a fresh checkout does not fail at the very
# last step, after the model has already been trained.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
output.to_csv(submission_path, header=True, encoding='utf-8')