In [None]:
# imports

# third-party
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.svm import SVC

# local
from model_helpers import *

# Notebook-wide pandas display options, applied in a single call as
# alternating option-name / value pairs.
pd.set_option(
    'display.max_columns', 21,
    'display.max_colwidth', 400,
    'display.precision', 4,
    'display.width', 1600,
)

In [None]:
# Load the two datasets used by the rest of the notebook.
# NOTE(review): load_data is not defined in any visible cell; presumably it is
# provided by the `from model_helpers import *` star import -- confirm.
train, test = load_data()

In [None]:
# Separate the label column from the remaining feature columns.
y = train['cuisine']
X = train.drop('cuisine', axis='columns')

In [None]:
# Per-class sample counts handed to SMOTE as its sampling strategy.
# The trailing number on each line is kept from the original notebook;
# presumably it is the class count before resampling -- confirm.
sample_sizes = dict(
    italian=8000,       # 7838
    mexican=6500,       # 6438
    southern_us=5000,   # 4320
    indian=3100,        # 3003
    chinese=3000,       # 2673
    french=3000,        # 2646
    cajun_creole=1900,  # 1546
    thai=1900,          # 1539
    japanese=1900,      # 1423
    greek=1400,         # 1175
    spanish=1200,       # 989
    korean=1000,        # 830
    vietnamese=1000,    # 825
    moroccan=1000,      # 821
    british=1000,       # 804
    filipino=1000,      # 755
    irish=900,          # 667
    jamaican=800,       # 526
    russian=800,        # 489
    brazilian=800,      # 467
)
# Oversampler shared by both pipelines below; the per-class sizes come from
# sample_sizes and the seed is fixed so resampling is repeatable.
# NOTE(review): newer imbalanced-learn releases are reported to deprecate the
# n_jobs argument of SMOTE -- confirm against the installed version.
smote = SMOTE(
    sampling_strategy=sample_sizes,
    k_neighbors=20,
    random_state=1,
    n_jobs=-1,
)

In [None]:
def reweight_features(features):
    """Return a copy of ``features`` with two column bands scaled up.

    Columns 0-39 are multiplied by 3 and columns 60-99 by 2.5; all other
    columns are left as they are. Slices that extend past the last column are
    clipped, so inputs with fewer than 100 columns are accepted. The input is
    never modified.

    Parameters
    ----------
    features : 2-D array
        Anything supporting ``.copy()`` and ``[:, a:b]`` slice assignment.
        In this notebook it is the output of the preceding scaling step
        (presumably a float ndarray).

    Returns
    -------
    A new array of the same shape. Integer and boolean input is promoted to
    float, because the fractional 2.5 weight cannot be applied in place to an
    integer array (NumPy rejects the cast). Floating-point input keeps its
    dtype.
    """
    kind = getattr(getattr(features, 'dtype', None), 'kind', None)
    if kind in ('i', 'u', 'b'):
        # astype returns a fresh array, so this doubles as the defensive copy.
        reweighted = features.astype(float)
    else:
        reweighted = features.copy()
    reweighted[:, 0:40] *= 3
    reweighted[:, 60:100] *= 2.5
    return reweighted

In [None]:
# Base estimators for the hyperparameter search. Only the fixed settings are
# given here; the values being tuned live in pipe_grid.
svc = SVC(
    kernel='rbf',
    shrinking=True,
    class_weight=None,
    max_iter=-1,
    random_state=1,
    cache_size=350,
    probability=False,
)
lrc = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    multi_class='multinomial',
    fit_intercept=True,
    class_weight=None,
    random_state=1,
    n_jobs=-1,
)
rfc = RandomForestClassifier(
    max_depth=None,
    n_estimators=600,
    criterion='gini',
    class_weight=None,
    random_state=1,
)
knc = KNeighborsClassifier(
    weights='distance',
    metric='euclidean',
    algorithm='auto',
    n_jobs=-1,
)
# Hyperparameter candidates for the search. Every entry is currently
# commented out, so the grid is empty.
pipe_grid = {
    # 'svc__C': [5],
    # 'svc__gamma': [.02, .1],
    # 'lrc__C': [1, 50],
    # 'rfc__min_samples_split': [10],
    # 'rfc__min_samples_leaf': [2],
    # 'knc__n_neighbors': [30, 50],
}

# Search pipeline: scale -> reweight column bands -> oversample -> classify.
# NOTE(review): `bc` is not defined in any visible cell; presumably it is meant
# to be one of svc / lrc / rfc / knc (or comes from the model_helpers star
# import). Also note the grid keys above use 'svc__' / 'lrc__' / ... prefixes
# while the final step is named 'bc' -- confirm the intended step name.
pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('reweight', FunctionTransformer(reweight_features)),
    ('smote', smote),
    ('bc', bc),
])
# 3-fold cross-validated search scored on accuracy. refit=False, so no final
# model is retained; only the per-candidate scores are inspected.
search = GridSearchCV(
    pipe,
    pipe_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=True,
    refit=False,
    n_jobs=-1,
    verbose=2,
)
search.fit(X, y)

# One row per candidate, ordered by mean test score (ascending).
search_results = pd.DataFrame(search.cv_results_)
search_results.loc[:, ['params', 'mean_test_score', 'mean_train_score']].sort_values(by='mean_test_score')

In [None]:
# Two fixed hyperparameter sets for the members of the soft-voting ensemble.
# The estimators that accept a seed are given random_state=1, matching the seed
# used for SMOTE and for the grid-search estimators elsewhere in this notebook,
# so that repeated runs produce the same predictions (SVC with
# probability=True and the random forest are otherwise stochastic). For
# LogisticRegression the seed is passed for consistency only; it is presumably
# unused by the newton-cg solver. KNeighborsClassifier takes no seed.
estimators = {
    'svc': SVC(C=1, gamma=.1, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1, probability=True, cache_size=300, random_state=1),
    'lrc': LogisticRegression(C=20, solver='newton-cg', penalty='l2', multi_class='multinomial', class_weight=None, fit_intercept=True, max_iter=200, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=30, min_samples_leaf=12, n_estimators=800, max_depth=None, class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=20, metric='euclidean', p=2, weights='distance', n_jobs=-1),
}
# Alternative set; this is the one actually passed to the VotingClassifier
# below (`estimators` above is defined but not used in this cell).
overfit_estimators = {
    'svc': SVC(C=5, gamma=.02, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1, probability=True, cache_size=300, random_state=1),
    'lrc': LogisticRegression(C=1, solver='newton-cg', penalty='l2', multi_class='multinomial', class_weight=None, fit_intercept=True, max_iter=200, random_state=1),
    'rfc': RandomForestClassifier(min_samples_split=10, min_samples_leaf=2, n_estimators=800, max_depth=None, class_weight=None, criterion='gini', random_state=1),
    'knc': KNeighborsClassifier(n_neighbors=30, metric='euclidean', p=2, weights='distance', n_jobs=-1),
}
# Soft voting combines the members' predicted probabilities, which is why the
# SVCs above are built with probability=True.
voting = VotingClassifier(list(overfit_estimators.items()), voting='soft', n_jobs=-1)

In [None]:
# Final model: same preprocessing as the search pipeline, with the soft-voting
# ensemble as the last step. Note this rebinds `pipe`.
pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('reweight', FunctionTransformer(reweight_features)),
    ('smote', smote),
    ('voting', voting),
])
model = pipe.fit(X, y)

# Score on the very rows the model was fitted on, so this is a training score
# rather than an estimate of held-out performance.
print(model.score(X, y))

# Predict the test set and wrap the labels in a one-column frame that shares
# the test index.
preds = model.predict(test)
output = pd.DataFrame(
    preds,
    index=test.index,
    columns=['cuisine'],
)

In [None]:
# Put the new predictions next to two reference columns.
# NOTE(review): `answers` and `baseline` are not defined in any visible cell;
# presumably they come from the model_helpers star import or an earlier
# session -- confirm before relying on Restart & Run All.
compare = pd.concat([output, answers, baseline], axis=1)
compare.columns = ['pred', 'real', 'base']

# Rows where the prediction differs from the 'real' column / the 'base' column.
errors = compare[compare['pred'] != compare['real']]
changes = compare[compare['pred'] != compare['base']]

# The trailing numbers are kept from the original notebook; presumably a log
# of counts from earlier runs.
len(errors), len(changes)  # 1313, 1296, 1286, 1280, 1251

In [None]:
# Write the predictions (with a header row) to a CSV file; the 'preds'
# directory is not created here.
output.to_csv(
    'preds/scaled_reweighted.csv',
    header=True,
    encoding='utf-8',
)