In [None]:
# imports

# extra
import pandas as pd
from parfit import bestFit
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# local
from model_helpers import *

In [None]:
# Load the two data splits used throughout the notebook.
# NOTE(review): load_data is not defined in any visible cell — presumably it is
# provided by the `from model_helpers import *` line; confirm.
train, test = load_data()

In [None]:
# Target labels first, then the feature matrix (every column except the label).
y = train['cuisine']
X = train.drop('cuisine', axis='columns')

In [None]:
# Coarse two-way grouping of the cuisine labels: each cuisine is tagged either
# 'east' or 'west'. The mapping is built from the full (alphabetical) cuisine
# list plus the subset that belongs to the 'east' group.
all_cuisines = [
    'brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
    'french', 'greek', 'indian', 'irish', 'italian',
    'jamaican', 'japanese', 'korean', 'mexican', 'moroccan',
    'russian', 'southern_us', 'spanish', 'thai', 'vietnamese',
]
east_cuisines = {'chinese', 'filipino', 'indian', 'japanese', 'korean', 'thai', 'vietnamese'}
easy_classes = {
    cuisine: ('east' if cuisine in east_cuisines else 'west')
    for cuisine in all_cuisines
}

# Direct item lookup: a label missing from easy_classes raises KeyError
# instead of silently becoming NaN.
y_easy = y.map(easy_classes.__getitem__)

In [None]:
# Base estimators and the hyper-parameter grids to search for each of them.
# svc, lrc, rfc and abc are all created with random_state=1 so that repeated
# searches are comparable.

# Support vector classifier (RBF kernel only in this grid).
svc = SVC(shrinking=True, max_iter=-1, random_state=1)
svc_grid = {
    'C': [4, 6, 8],
    'gamma': [.1, .3],
    'kernel': ['rbf'],
    'class_weight': [None]
}

# Multinomial logistic regression.
lrc = LogisticRegression(fit_intercept=True, random_state=1, n_jobs=-1)
lrc_grid = {
    'C': [1, 10, 50, 100],
    'solver': ['newton-cg'],
    'penalty': ['l2'], # (l2 only: newton-cg, sag, lbfgs)
    'multi_class': ['multinomial'],
    'class_weight': [None],
}

# Random forest with unlimited tree depth.
rfc = RandomForestClassifier(max_depth=None, random_state=1)
rfc_grid = {
    'min_samples_split': [10, 30],
    'min_samples_leaf': [6, 10],
    'n_estimators': [600],
    'class_weight': [None],
    'criterion': ['gini']
}

# Distance-weighted k-nearest neighbours.
knc = KNeighborsClassifier(n_jobs=-1, algorithm='auto')
knc_grid = {
    'n_neighbors': [5, 10, 20],
    'p': [2],
    'metric': ['euclidean'],
    'weights': ['distance'],
}

# AdaBoost over depth-1 decision trees (stumps).
dtcabc = DecisionTreeClassifier(max_depth=1, criterion='gini', min_samples_split=2, min_samples_leaf=2, class_weight=None)
# The base learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` in 1.2 and removed the old name
# in 1.4, but it is the first positional parameter under either name.
abc = AdaBoostClassifier(dtcabc, random_state=1)
abc_grid = {
    'n_estimators': [50, 500], # 60
    'learning_rate': [.1, 1, 10] # .5
}

In [None]:
# Search svc_grid for the SVC with parfit's bestFit, using 3 folds and
# accuracy as the score (higher is better), then report the winner.
best_params, best_score, all_params, all_scores = bestFit(
    svc,
    ParameterGrid(svc_grid),
    X,
    y,
    nfolds=3,
    metric=metrics.accuracy_score,
    greater_is_better=True,
    scoreLabel='Acc',
)
print(best_params, best_score)

In [None]:
# Soft-voting ensemble for the coarse east/west problem.
# The SVC (probability=True), logistic regression and random forest are seeded
# with random_state=1, matching the grid-search estimators, so that refitting
# this cell gives the same model every time.
estimators_easy = {
    'svc': SVC(C=100, gamma=.1, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
               probability=True, random_state=1),
    'lrc': LogisticRegression(C=100, solver='newton-cg', penalty='l2', multi_class='multinomial',
                              class_weight=None, fit_intercept=True, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=6, n_estimators=800, max_depth=None,
                                  class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=6, metric='euclidean', p=2, weights='distance', n_jobs=-1)
}
# VotingClassifier takes a list of (name, estimator) pairs.
clf_easy = VotingClassifier(list(estimators_easy.items()), voting='soft', n_jobs=-1)

In [None]:
# Fit the east/west ensemble on the full training set and report its accuracy
# on that same data (a training score, not a held-out estimate).
model_easy = clf_easy.fit(X, y_easy)
print(model_easy.score(X, y_easy))

# Coarse east/west prediction for every test row, keyed by the test index.
preds_easy = pd.DataFrame({'cuisine': model_easy.predict(test)}, index=test.index)

In [None]:
# Split the test rows by their predicted coarse class.
east_test_i = preds_easy.loc[preds_easy['cuisine'] == 'east'].index
west_test_i = preds_easy.loc[preds_easy['cuisine'] == 'west'].index
test_east = test.loc[east_test_i]
test_west = test.loc[west_test_i]

In [None]:
# Split the training labels by their true coarse class, then pull the matching
# feature rows by index.
y_east = y[y_easy == 'east']
y_west = y[y_easy == 'west']
X_east = X.loc[y_east.index]
X_west = X.loc[y_west.index]

In [None]:
# Soft-voting ensemble for the cuisines inside the 'east' group.
# The SVC (probability=True), logistic regression and random forest are seeded
# with random_state=1, matching the grid-search estimators, so that refitting
# this cell gives the same model every time.
estimators_east = {
    'svc': SVC(C=8, gamma=.1, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
               probability=True, random_state=1),
    'lrc': LogisticRegression(C=50, solver='newton-cg', penalty='l2', multi_class='multinomial',
                              class_weight=None, fit_intercept=True, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=6, n_estimators=800, max_depth=None,
                                  class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=15, metric='euclidean', p=2, weights='distance', n_jobs=-1)
}
# VotingClassifier takes a list of (name, estimator) pairs.
clf_east = VotingClassifier(list(estimators_east.items()), voting='soft', n_jobs=-1)

In [None]:
# Fit the 'east' ensemble on the east-only training rows and report its
# accuracy on that same data (a training score, not a held-out estimate).
model_east = clf_east.fit(X_east, y_east)
print(model_east.score(X_east, y_east))

# Fine-grained cuisine prediction for the test rows routed to 'east'.
preds_east = pd.DataFrame({'cuisine': model_east.predict(test_east)}, index=test_east.index)

In [None]:
# Soft-voting ensemble for the cuisines inside the 'west' group.
# The SVC (probability=True), logistic regression and random forest are seeded
# with random_state=1, matching the grid-search estimators, so that refitting
# this cell gives the same model every time.
estimators_west = {
    'svc': SVC(C=8, gamma=.3, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
               probability=True, random_state=1),
    'lrc': LogisticRegression(C=10, solver='newton-cg', penalty='l2', multi_class='multinomial',
                              class_weight=None, fit_intercept=True, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=6, n_estimators=800, max_depth=None,
                                  class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=20, metric='euclidean', p=2, weights='distance', n_jobs=-1)
}
# VotingClassifier takes a list of (name, estimator) pairs.
clf_west = VotingClassifier(list(estimators_west.items()), voting='soft', n_jobs=-1)

In [None]:
# Fit the 'west' ensemble on the west-only training rows and report its
# accuracy on that same data (a training score, not a held-out estimate).
model_west = clf_west.fit(X_west, y_west)
print(model_west.score(X_west, y_west))

# Fine-grained cuisine prediction for the test rows routed to 'west'.
preds_west = pd.DataFrame({'cuisine': model_west.predict(test_west)}, index=test_west.index)

In [None]:
# Two-stage (east/west, then cuisine) predictions for the whole test set.
# Stored as `output_easy`, the name the comparison cell reads, rather than
# `output`, which the single-stage model assigns further down and would
# otherwise overwrite before these predictions are ever used.
output_easy = pd.concat([preds_east, preds_west], axis='index')

In [None]:
# Single-stage soft-voting ensemble trained directly on all cuisine labels.
# The SVC (probability=True), logistic regression and random forest are seeded
# with random_state=1, matching the grid-search estimators, so that refitting
# this cell gives the same model every time.
estimators = {
    'svc': SVC(C=6, gamma=.3, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1,
               probability=True, random_state=1),
    'lrc': LogisticRegression(C=50, solver='newton-cg', penalty='l2', multi_class='multinomial',
                              class_weight=None, fit_intercept=True, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=6, n_estimators=800, max_depth=None,
                                  class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=10, metric='euclidean', p=2, weights='distance', n_jobs=-1)
}
# VotingClassifier takes a list of (name, estimator) pairs.
clf = VotingClassifier(list(estimators.items()), voting='soft', n_jobs=-1)

In [None]:
# Fit the single-stage ensemble on the full training set and report its
# accuracy on that same data (a training score, not a held-out estimate).
model = clf.fit(X, y)
print(model.score(X, y))

# Cuisine prediction for every test row, keyed by the test index.
output = pd.DataFrame({'cuisine': model.predict(test)}, index=test.index)

In [None]:
# Pseudo-labelling: test rows the ensemble is very sure about are added to the
# training data with their predicted label, and a fresh ensemble is fit on the
# enlarged set.

# Per-class probabilities for every test row (columns are the class labels).
probs = pd.DataFrame(model.predict_proba(test), index=test.index, columns=model.classes_)
max_probs = probs.max(axis='columns')

# Keep only the test rows whose top class probability exceeds 0.95.
high_test = test.loc[max_probs[max_probs > .95].index]
# With soft voting the predicted class is the arg-max of these probabilities,
# so take it from `probs` instead of running a second predict pass over test.
high_test_preds = probs.idxmax(axis='columns').loc[high_test.index]

X_new = pd.concat([X, high_test], axis='index')
y_new = pd.concat([y, high_test_preds], axis='index')

# Fit an unfitted copy: clf.fit returns clf itself, so refitting it in place
# would silently replace `model` and make this cell give different results
# when re-run.
model2 = clone(clf).fit(X_new, y_new)
print(model2.score(X_new, y_new))
output2 = pd.DataFrame(model2.predict(test), index=test.index, columns=['cuisine'])

In [None]:
# Put two sets of predictions next to the reference labels, aligned on index.
# NOTE(review): `answers` is not defined in any visible cell — presumably it
# comes from the model_helpers star import; `output_easy` is presumably the
# east/west two-stage prediction frame. Confirm both exist before running.
compare = pd.concat([output, output_easy, answers], axis='columns')
compare.columns = ['pred', 'pred2', 'real']

# Rows where at least one of the two prediction columns disagrees with 'real'.
errors = compare[(compare['pred'] != compare['real']) | (compare['pred2'] != compare['real'])]
len(errors)

In [None]:
# Display the misclassified rows collected in `errors` for manual inspection.
errors

In [None]:
# Compare the pseudo-labelled model's predictions with the reference labels.
# NOTE(review): `answers` is not defined in any visible cell — presumably it
# comes from the model_helpers star import; confirm before running.
compare = pd.concat([output2, answers], axis='columns')
compare.columns = ['pred', 'real']

# Rows where the prediction disagrees with 'real'.
errors = compare.loc[compare['pred'] != compare['real']]
# Author's recorded counts (presumably from earlier runs): 1391, easy:1357, 1333, 1319
len(errors)

In [None]:
# Write the pseudo-labelled model's predictions to CSV with a header row.
# Intended columns per the original note: id, cuisine (the id column is the
# frame's index — presumably named 'id'; confirm against the test data).
output2.to_csv(
    'preds/stacked_iter.csv',
    encoding='utf-8',
    header=True,
)