In [None]:
# imports

# standard
from functools import reduce

# extra
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.svm import SVC

# local
from feature_helpers import *

# display settings: widen pandas' rendering limits for the frames shown in this notebook
for option_name, option_value in (
    ('display.max_columns', 21),
    ('display.max_colwidth', 400),
    ('display.precision', 4),
    ('display.width', 1600),
):
    pd.set_option(option_name, option_value)

In [None]:
def load_data(data_dir='data'):
    """Load the labelled training frame and the unlabelled test frame.

    Parameters
    ----------
    data_dir : str or path-like, default 'data'
        Directory containing ``cuisine.csv`` (header-less: row id, label),
        ``temp_train.csv`` and ``temp_test.csv`` (header row, first column is
        the row id).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``train``: the string ``cuisine`` column followed by the float64
        columns of ``temp_train.csv``, joined on the row index.
        ``test``: the float64 columns of ``temp_test.csv``.
    """
    import os

    cuisine = pd.read_csv(os.path.join(data_dir, 'cuisine.csv'), names=['cuisine'], header=None, index_col=0)
    # `np.unicode_` was removed in NumPy 2.0; the builtin `str` performs the same cast.
    cuisine = cuisine.astype(str)
    train_ings = pd.read_csv(os.path.join(data_dir, 'temp_train.csv'), header=0, index_col=0)
    train_ings = train_ings.astype(np.float64)
    # Column-wise concat aligns labels and features on their shared row index.
    train = pd.concat((cuisine, train_ings), axis=1)
    test = pd.read_csv(os.path.join(data_dir, 'temp_test.csv'), header=0, index_col=0)
    test = test.astype(np.float64)
    return (train, test)

# Read both frames once; later cells use the notebook-level `train` (labels + features)
# and `test` (features only).
train, test = load_data()

In [None]:
def reweight_features(features):
    """Return a re-weighted copy of a 2-D feature matrix.

    Columns 0-39 are multiplied by 3 and columns 60-99 by 2.5; every other
    column is passed through unchanged. The input is never modified.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Numeric feature matrix (ndarray, DataFrame or nested sequence).

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as ``features``.
    """
    reweighted = np.array(features, copy=True)
    # An in-place `*= 2.5` on an integer array raises a casting error, so
    # promote non-float input to float64 first (float input keeps its dtype).
    if not np.issubdtype(reweighted.dtype, np.floating):
        reweighted = reweighted.astype(np.float64)
    reweighted[:, 0:40] *= 3
    reweighted[:, 60:100] *= 2.5
    return reweighted

# Wrap the re-weighting function so it exposes the fit/transform interface;
# validate=False skips sklearn's input validation before reweight_features is called.
reweight = FunctionTransformer(func=reweight_features, validate=False)
# Min-max scaler with default settings; it is fitted on the training features in a later cell.
scale = MinMaxScaler()

In [None]:
# Labels, plus the numeric feature columns (everything except the label).
y = train['cuisine']
train_features = train.drop(columns=['cuisine'])

# The scaler is fitted on the training rows only and then reused for the test rows;
# re-weighting is applied after scaling in both cases.
X_scores = reweight.fit_transform(scale.fit_transform(train_features))
test_scores = reweight.transform(scale.transform(test))

In [None]:
# Base classifiers trained on the scaled / re-weighted score features.
# random_state is pinned on the two stochastic estimators so a Restart & Run All
# reproduces the same probabilities: SVC(probability=True) uses randomness in its
# probability calibration, and the random forest in bootstrapping and feature
# sampling. The value 1 matches the seed used by the stacking estimators in later cells.
svc = SVC(C=5, gamma=.02, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1, probability=True, cache_size=350, random_state=1)
lrc = LogisticRegression(C=1, solver='newton-cg', penalty='l2', multi_class='multinomial', class_weight=None, fit_intercept=True, max_iter=200)
rfc = RandomForestClassifier(min_samples_split=10, min_samples_leaf=2, n_estimators=800, max_depth=None, class_weight=None, criterion='gini', random_state=1)
knc = KNeighborsClassifier(n_neighbors=30, metric='euclidean', p=2, weights='distance', n_jobs=-1)

In [None]:
# Fit each base model on the score features and keep its class probabilities for
# the test rows (one frame per model, in the order svc, lrc, rfc, knc).
probs_scores = []
for clf in (svc, lrc, rfc, knc):
    clf.fit(X_scores, y)
    # Accuracy on the data the model was just fitted on, not a held-out estimate.
    train_accuracy = clf.score(X_scores, y)
    print(train_accuracy)
    probs = pd.DataFrame(
        data=clf.predict_proba(test_scores),
        index=test.index,
        columns=clf.classes_,
    )
    probs_scores.append(probs)

In [None]:
# load_clean_data is not defined in this notebook; presumably it comes from the
# star import of feature_helpers (TODO confirm). Later cells read the `strings`
# and `cuisine` columns of the frame it returns.
recipes = load_clean_data()

In [None]:
# De-duplicate the ingredient tokens (remove_dupes is an external helper), then
# collapse each row's tokens into one space-separated string for the vectorizer.
# NOTE: this overwrites the column in place, so the cell is not safe to run twice.
recipes['strings'] = remove_dupes(recipes['strings'])
recipes['strings'] = recipes['strings'].map(' '.join)

In [None]:
# Rows whose cuisine is the literal "test" are the unlabelled ones; the rest are training rows.
labelled_recipes = recipes.query('cuisine != "test"')
unlabelled_recipes = recipes.query('cuisine == "test"')
X_tfidfs = labelled_recipes['strings']
test_tfidfs = unlabelled_recipes['strings']

In [None]:
# Binary unigram TF-IDF over the space-joined ingredient strings.
vectorizer = TfidfVectorizer(
    # tokenisation: keep case, treat runs of word characters / hyphens / underscores as tokens
    analyzer='word',
    token_pattern=r'[\w\-_]+',
    ngram_range=(1, 1),
    lowercase=False,
    strip_accents=None,
    stop_words=None,
    encoding='utf-8',
    # vocabulary: no document-frequency pruning and no size cap
    min_df=1,
    max_df=1.0,
    max_features=None,
    # weighting: presence/absence term counts, smoothed IDF, L2-normalised rows
    binary=True,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
)

In [None]:
# Fit the vectorizer on the training strings only, then apply the fitted
# vectorizer to the test strings.
# NOTE: both names are rebound from the raw strings to the vectorizer output, so
# this cell cannot be re-run without first re-running the cell that selects the strings.
X_tfidfs = vectorizer.fit_transform(X_tfidfs)
test_tfidfs = vectorizer.transform(test_tfidfs)

In [None]:
# Base classifiers trained on the TF-IDF features.
# random_state is pinned on the SVC because probability=True makes its probability
# calibration stochastic; without a seed the stacked features differ between runs.
# The value 1 matches the seed used by the stacking estimators in later cells.
svc_tfidfs = SVC(C=5, gamma=.8, kernel='rbf', class_weight=None, shrinking=True, max_iter=-1, probability=True, cache_size=350, random_state=1)
lrc_tfidfs = LogisticRegression(C=1, solver='newton-cg', penalty='l2', multi_class='multinomial', class_weight=None, fit_intercept=True, max_iter=200)

In [None]:
# Fit each TF-IDF model and keep its class probabilities for the test rows
# (one frame per model, in the order svc_tfidfs, lrc_tfidfs).
# NOTE(review): `y` comes from the score-feature training frame; this assumes the
# labelled recipe rows are in the same order as `train` - confirm.
probs_tfidfs = []
for clf in (svc_tfidfs, lrc_tfidfs):
    clf.fit(X_tfidfs, y)
    # Accuracy on the data the model was just fitted on, not a held-out estimate.
    train_accuracy = clf.score(X_tfidfs, y)
    print(train_accuracy)
    probs = pd.DataFrame(
        data=clf.predict_proba(test_tfidfs),
        index=test.index,
        columns=clf.classes_,
    )
    probs_tfidfs.append(probs)

In [None]:
# Soft vote over the TF-IDF models: add their class-probability frames element-wise
# (a class missing from one frame counts as 0.0) and pick the highest-scoring class per row.
probs_sum = reduce(lambda total, frame: total.add(frame, fill_value=0.0), probs_tfidfs)
preds_softmax = probs_sum.idxmax(axis=1)
# NOTE: `output` is reassigned by the stacking cell further down.
output = preds_softmax.to_frame(name='cuisine')

In [None]:
# Class probabilities of every fitted base model on the TRAINING rows; these become
# the input features of the stacking model. Order matters for later indexing:
# [svc, lrc, rfc, knc, svc_tfidfs, lrc_tfidfs].
# NOTE(review): these are in-sample predictions (each model was fitted on these same
# rows), so the stacker sees optimistic probabilities - consider out-of-fold predictions.
probs_features = []
model_inputs = (
    (svc, X_scores),
    (lrc, X_scores),
    (rfc, X_scores),
    (knc, X_scores),
    (svc_tfidfs, X_tfidfs),
    (lrc_tfidfs, X_tfidfs),
)
for clf, train_matrix in model_inputs:
    print('patience...')
    probs = pd.DataFrame(clf.predict_proba(train_matrix), columns=clf.classes_, index=train.index)
    probs_features.append(probs)

In [None]:
# Stack the per-model probability frames side by side. Train and test use the same
# model order (score-feature models first, then the TF-IDF models) so columns line up.
X_probs = pd.concat(probs_features, axis=1)
test_probs = pd.concat([*probs_scores, *probs_tfidfs], axis=1)

In [None]:
# Candidate meta-learners for the stacking stage. They are bound to their own names
# so the fitted base models `svc` / `lrc` are not replaced by unfitted estimators;
# rebinding those names would break any re-run of the cells that call
# predict_proba on the base models.
search_svc = SVC(kernel='rbf', shrinking=True, class_weight=None, max_iter=-1, random_state=1, cache_size=350, probability=False)
search_lrc = LogisticRegression(penalty='l2', multi_class='multinomial', fit_intercept=True, class_weight=None, random_state=1, n_jobs=-1)
# Active grid targets the logistic-regression candidate. The commented-out entries
# look like the grid previously used for the SVC candidate (kept for reference).
param_grid = {
    #'C': [1, 5, 10],
    #'gamma': [.1, 1],
    'solver': ['newton-cg', 'lbfgs'],
    'C': [1, 10, 50],
} # svc c=1,g=.1
# refit=False: only the CV scores are needed here; the chosen model is built in the next cell.
search = GridSearchCV(search_lrc, param_grid, cv=3, scoring='accuracy', return_train_score=True, refit=False, n_jobs=-1, verbose=10)
search.fit(X_probs, y)
search_results = pd.DataFrame.from_dict(search.cv_results_, orient='columns')
# Displayed ascending, so the best parameter set is the last row.
search_results[['params', 'mean_test_score', 'mean_train_score']].sort_values('mean_test_score')

In [None]:
# Meta-learners for the stacking stage, both seeded with random_state=1.
stack_svc = SVC(
    C=1, gamma=.1, kernel='rbf',
    shrinking=True, class_weight=None, max_iter=-1,
    random_state=1, cache_size=350, probability=False,
)
stack_lrc = LogisticRegression(
    C=1, solver='newton-cg', penalty='l2', multi_class='multinomial',
    fit_intercept=True, class_weight=None,
    random_state=1, n_jobs=-1,
)

In [None]:
# Narrow the stacking input to the two SVC models only. probs_features is ordered
# [svc, lrc, rfc, knc, svc_tfidfs, lrc_tfidfs], so indices 0 and 4 are the score-feature
# SVC and the TF-IDF SVC; index 0 of each test list is the matching model.
# NOTE: this replaces the X_probs / test_probs built from all six models.
train_parts = [probs_features[0], probs_features[4]]
test_parts = [probs_scores[0], probs_tfidfs[0]]
X_probs = pd.concat(train_parts, axis=1)
test_probs = pd.concat(test_parts, axis=1)

In [None]:
# Fit the logistic-regression stacker on the base-model probabilities and predict the test rows.
model = stack_lrc.fit(X_probs, y)
# Accuracy on the rows the stacker was fitted on, not a held-out estimate.
train_accuracy = model.score(X_probs, y)
print(train_accuracy)
preds = model.predict(test_probs)
output = pd.DataFrame({'cuisine': preds}, index=test.index)

In [None]:
# Reference predictions to compare against; both files have a header row and
# use their first column as the row index.
csv_options = {'header': 0, 'index_col': 0}
answers = pd.read_csv('data/submission.csv', **csv_options)
baseline = pd.read_csv('data/baseline.csv', **csv_options)

In [None]:
# Line up this run's predictions with the reference answers and the baseline
# predictions on the shared row index.
compare = pd.concat([output, answers, baseline], axis=1)
compare.columns = ['pred', 'real', 'base']
# Rows where the prediction disagrees with the reference / with the baseline.
errors = compare[compare['pred'] != compare['real']]
changes = compare[compare['pred'] != compare['base']]
print(len(errors), len(changes)) # 1251, 1058

In [None]:
# Confusion matrix over the misclassified rows only (so its diagonal is empty).
# Labels are the union of true and predicted classes: taking them from the true
# labels alone would silently drop any class that only occurs as a wrong prediction.
labels = sorted(set(errors['real']) | set(errors['pred']))
cnf = confusion_matrix(errors['real'], errors['pred'], labels=labels)
plot_cnf(cnf, labels)
# Per class: how often it was wrongly predicted vs. how often it was missed.
falpos_counts, falneg_counts = errors['pred'].value_counts(), errors['real'].value_counts()
# fill_value=0 keeps a total for classes that appear on only one side (plain `+` gives NaN).
total_counts = falpos_counts.add(falneg_counts, fill_value=0)
pd.concat(
    [falpos_counts, falneg_counts, total_counts, output['cuisine'].value_counts()],
    axis='columns',
    sort=False,
    keys=['false_pos', 'false_neg', 'total_errors', 'predicted'],
)

In [None]:
# Write the stacked predictions (row index plus a `cuisine` column) for submission.
submission_path = 'preds/tfidf_stack_lrc_1v1.csv'
output.to_csv(submission_path, header=True, encoding='utf-8')