In [None]:
# imports

# stdlib
import os

# extra
import pandas as pd
from parfit import bestFit
from sklearn import metrics
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# local
from model_helpers import *

In [None]:
# Load the training and test tables used by every cell below.
# NOTE(review): `load_data` is not defined or explicitly imported in this notebook;
# presumably it is provided by the `from model_helpers import *` star import — confirm.
train, test = load_data()

In [None]:
# Separate the label column from the features: `y` is the 'cuisine' column,
# `X` is every remaining column of `train`.
y = train.loc[:, 'cuisine']
X = train.drop('cuisine', axis='columns')

In [None]:
# Base estimators for the hyper-parameter searches, each paired with a
# `<name>_grid` mapping constructor arguments to the candidate values to try.

# Support-vector classifier.
svc = SVC(shrinking=True, max_iter=-1, random_state=1)
svc_grid = dict(
    C=[6, 8, 10, 12],
    gamma=[.6, 1.2],
    kernel=['rbf'],
    class_weight=[None],
)

# Logistic regression.
lrc = LogisticRegression(fit_intercept=True, random_state=1, n_jobs=-1)
lrc_grid = dict(
    C=[1, 10, 100, 300],
    solver=['newton-cg'],
    penalty=['l2'],  # (l2 only: newton-cg, sag, lbfgs)
    multi_class=['multinomial'],
    class_weight=[None],
)

# Random forest.
rfc = RandomForestClassifier(max_depth=None, random_state=1)
rfc_grid = dict(
    min_samples_split=[10],
    min_samples_leaf=[6],
    n_estimators=[600],
    class_weight=[None],
    criterion=['gini'],
)

# k-nearest neighbours.
knc = KNeighborsClassifier(n_jobs=-1, algorithm='auto')
knc_grid = dict(
    n_neighbors=[5, 15, 25],
    p=[2],
    metric=['euclidean'],
    weights=['distance'],
)

In [None]:
# Run parfit's `bestFit` over every combination in `lrc_grid` for the
# logistic-regression estimator, scoring with accuracy (higher is better).
# NOTE(review): the meaning of the four returned values is taken from the names
# they are bound to here — confirm against the installed parfit version.
best_params, best_score, all_params, all_scores = bestFit(
    lrc,
    ParameterGrid(lrc_grid),
    X,
    y,
    nfolds=3,
    metric=metrics.accuracy_score,
    greater_is_better=True,
    scoreLabel='Acc',
)
print(best_params, best_score)

In [None]:
# Members of the final ensemble, with hand-fixed hyper-parameters.
# random_state=1 mirrors the search estimators (`svc`, `lrc`, `rfc`) so the
# stochastic members — SVC's probability estimates and the random forest —
# give the same result on every Restart & Run All. KNeighborsClassifier takes
# no random_state.
estimators = {
    # probability=True gives the SVC a predict_proba, which soft voting needs.
    'svc': SVC(C=6, gamma=.6, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
               probability=True, random_state=1),
    'lrc': LogisticRegression(C=80, solver='newton-cg', penalty='l2', multi_class='multinomial',
                              class_weight=None, fit_intercept=True, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=6, n_estimators=800,
                                  max_depth=None, class_weight=None, criterion='gini',
                                  random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=25, metric='euclidean', p=2, weights='distance', n_jobs=-1)
}
# Soft voting over the (name, estimator) pairs, in dict insertion order.
clf = VotingClassifier(list(estimators.items()), voting='soft', n_jobs=-1)

In [None]:
# NOTE(review): this rebinds `clf`. When the notebook is run top to bottom, the
# VotingClassifier assigned to `clf` in the previous cell is discarded and only
# this single SVC is fit below. Skip this cell to train the ensemble instead.
# random_state=1 (as on the search estimator `svc`) pins the shuffling used for
# the probability estimates enabled by probability=True.
clf = SVC(C=6, gamma=.6, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
          probability=True, random_state=1)

In [None]:
# Fit the selected classifier on the full training set and predict the test set.
model = clf.fit(X, y)
preds = model.predict(test)

# One 'cuisine' column of predictions, indexed like `test`.
output = pd.DataFrame({'cuisine': preds}, index=test.index)

# Displayed as the cell result: score on the data the model was trained on
# (not a held-out estimate).
model.score(X, y)

In [None]:
# Put predictions next to the reference labels and keep the rows that disagree.
# NOTE(review): `answers` is not defined in any visible cell; presumably it comes
# from the `model_helpers` star import — confirm before relying on Run All.
compare = pd.concat([output, answers], axis=1)
compare.columns = ['pred', 'real']
mismatch = compare['pred'] != compare['real']
errors = compare[mismatch]
len(errors) # 1497, 1422

In [None]:
# Write the predictions to disk.
# id, cuisine  (the first column is `output`'s index — presumably the recipe id; confirm)
# Create the target directory first so a fresh checkout without `preds/` does not
# fail here, after the expensive fit has already run.
os.makedirs('preds', exist_ok=True)
output.to_csv('preds/stacked.csv', header=True, encoding='utf-8')