In [None]:
# Imports
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Load the cuisine labels (headerless CSV) and store them as a categorical.
cuisine = pd.read_csv(
    'data/cuisine.csv',
    header=None,
    names=['cuisine'],
    index_col=0,
).astype('category')

In [None]:
# Load the training feature table and downcast every column to uint8.
# NOTE(review): presumably these are small non-negative counts/indicators —
# confirm that no value exceeds 255 before relying on the cast.
train_ings = pd.read_csv(
    'data/phrases_cleaned_train.csv',
    header=0,
    index_col=0,
).astype(np.uint8)

In [None]:
# Load the test feature table with the same layout and dtype as the training
# features.
test = pd.read_csv(
    'data/phrases_cleaned_test.csv',
    header=0,
    index_col=0,
).astype(np.uint8)

In [None]:
# Put the label column next to the features; rows are aligned on the index.
train = pd.concat([cuisine, train_ings], axis='columns')

In [None]:
# Preview the first rows of the combined label + feature frame.
train.head()

In [None]:
# Separate the target column from the feature matrix.
y = train['cuisine']
X = train.drop(columns='cuisine')

In [None]:
def pct(v):
    """Convert a fraction to a whole-number percentage.

    The product ``v * 100`` is passed through ``int``, so the result is
    truncated toward zero rather than rounded.
    """
    return int(v * 100)

def test_data(X, y, title, clf, sampler=None, splits=3, random_state=None):
    """Cross-validate ``clf`` on ``(X, y)`` and print per-fold metrics.

    The rows are split with a shuffled ``KFold``. On every fold the
    classifier is fitted on the training part (optionally resampled first)
    and evaluated on the held-out part; the accuracy and the full
    classification report are printed for each fold, followed by the mean
    accuracy over all folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels, positionally aligned with ``X``.
    title : str
        Label printed above the results so runs can be told apart.
    clf : estimator
        Object whose ``fit(X, y)`` returns a model exposing ``predict``.
        The same object is refitted on every fold.
    sampler : object, optional
        If given, its ``fit_resample(X, y)`` is applied to the training part
        of each fold only; the held-out part is never resampled.
    splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold`` so the shuffled split can be reproduced.
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    list of float
        Accuracy on the held-out part of each fold, in fold order.
    """
    kfold = KFold(n_splits=splits, shuffle=True, random_state=random_state)
    accuracies = []
    print('=' * 5, title, '=' * 5)
    for train_i, test_i in kfold.split(X):
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        y_train, y_test = y.iloc[train_i], y.iloc[test_i]
        # Explicit None check: do not depend on the sampler's truthiness.
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model = clf.fit(X_train, y_train)
        preds = model.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, preds)
        accuracies.append(accuracy)
        print(accuracy)
        print(metrics.classification_report(y_test, preds))
    # Average over the actual number of folds instead of a hard-coded count.
    print('mean accuracy:', sum(accuracies) / len(accuracies))
    return accuracies


def tests():
    """Run the cross-validated evaluation for every enabled classifier.

    Only the entries listed in ``enabled`` are evaluated; the commented-out
    entries are alternatives that can be switched on by uncommenting them.
    """
    enabled = [
        ('LRC', lrc),
        # ('SVC', svc),
        # ('SVC', lsvc),
        # ('ABC', abc),
        # ('XGBoost', xgc),
        # ('Decision Tree', dtc),
        # ('Random Forest', rfc),
    ]
    for label, estimator in enabled:
        test_data(X, y, label, estimator)

In [None]:
# Resampler that can be passed to test_data(..., sampler=samp).
samp = RandomOverSampler(sampling_strategy='minority')

# Linear models.
lrc = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=500,
    tol=0.0001,
    C=5.0,
    fit_intercept=False,
    intercept_scaling=1,
    class_weight=None,
)
svc = SVC()
lsvc = LinearSVC()

# Single trees: `dtce` is a depth-1 stump used as the AdaBoost base learner,
# `dtc` is an unrestricted-depth tree evaluated on its own.
dtce = DecisionTreeClassifier(
    max_depth=1,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=10,
    class_weight=None,
)
dtc = DecisionTreeClassifier(
    max_depth=None,
    criterion='gini',
    min_samples_split=10,
    min_samples_leaf=10,
    class_weight=None,
)

# Ensembles.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
abc = AdaBoostClassifier(
    base_estimator=dtce,
    n_estimators=50,
    learning_rate=.1,
)
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=None,
)
xgc = XGBClassifier(
    booster='gbtree',
    max_depth=10,
    eta=.3,
    min_child_weight=10,
)

In [None]:
# Run the cross-validated evaluation of the classifiers enabled in tests().
tests()

In [None]:
# Output Test
# Fit the logistic-regression classifier on the full training set and predict
# labels for the test features.
# NOTE(review): predict() assumes `test` has the same feature columns, in the
# same order, as `X` — confirm the two CSVs share a layout.
# Optional oversampling of the full training set (currently disabled):
#X, y = samp.fit_resample(X, y)
model = lrc.fit(X, y)
preds = model.predict(test)

In [None]:
# Wrap the predictions in a one-column frame indexed like the test set.
# NOTE(review): the 'Survived' / 'PassengerId' labels look copied from a
# different project, while the predicted values are cuisines — confirm the
# header names the submission format actually expects.
res = pd.DataFrame({'Survived': preds}, index=test.index)
res.index.name = 'PassengerId'

In [None]:
# Write the predictions to disk, keeping both the header row and the index.
res.to_csv(
    'preds/preds_logreg_imb.csv',
    index=True,
    header=True,
)