## Read the data 

In [0]:
import pandas as pd
from pathlib import Path

# Project root; both CSV inputs are looked up relative to this folder.
path = Path('/Users/hwaaikke/mle/tiny')

# Training table, read with the first row as the header.
# Downstream cells expect it to contain a `label` column.
train_csv = path / 'data/TTT_train.csv'
train = pd.read_csv(train_csv, header=0)

# Test features, indexed by their `ID` column so the IDs stay out of the
# feature matrix but remain available when the submission is written.
test_csv = path / 'data/TTT_test_features.csv'
test = pd.read_csv(test_csv, header=0, index_col='ID')

## Remove outliers

In [0]:
import numpy as np
from sklearn.ensemble import IsolationForest

# Flag anomalous training rows with an Isolation Forest and keep only the inliers.
# The forest is seeded so the same rows are removed on every Restart & Run All.
# NOTE(review): `behaviour='new'` is only accepted by older scikit-learn releases
# (deprecated in 0.22, removed in 0.24) -- drop the argument when upgrading.
train_features = train.drop('label', axis=1)

isf = IsolationForest(contamination='auto', behaviour='new', n_jobs=-1, random_state=0)
isf.fit(train_features)  # unsupervised: the labels play no role in the fit

# predict() returns +1 for inliers and -1 for outliers.
y_train_outlier = isf.predict(train_features)
train = train[y_train_outlier == 1]

## Model Evaluation

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows of `train` for validation.
# The shuffle is seeded so the validation scores printed further down are
# comparable from one run of the notebook to the next.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop('label', axis=1),
    train["label"].values,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [0]:
#Training the model and Testing Accuracy on Validation data
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Search grid for the logistic regression: L2 penalty with C stepped by 0.05
# starting at 0.05 (upper end ~1.0; exact endpoint subject to float rounding in arange).
parameters = {'penalty':['l2'], 'C': np.arange(0.05, 1.05, 0.05)}

# --- Fit every baseline on the training split only ---------------------------

lr = LogisticRegression(n_jobs=-1, multi_class='auto', solver='lbfgs', class_weight='balanced', max_iter=10000)
lr.fit(X_train, y_train)

# Tune C with 9-fold CV. The search is fitted on the training split, not on the
# whole of `train`: the validation rows must stay unseen, otherwise the score
# reported for `clf` below is inflated by leakage.
clf = GridSearchCV(lr, parameters, cv=9)
clf.fit(X_train, y_train)

mnb = MultinomialNB(alpha=0.1)
mnb.fit(X_train, y_train)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)
knn.fit(X_train, y_train)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=1)
rf.fit(X_train, y_train)

svc = SVC(gamma='scale', decision_function_shape='ovo')
svc.fit(X_train, y_train)


# --- Score every baseline on the held-out validation split -------------------

def report_val_accuracy(name, model):
    """Predict on the validation split, print the accuracy, return the predictions.

    Parameters
    ----------
    name : str
        Short tag printed next to the score.
    model : fitted estimator
        Any object exposing ``predict``.

    Returns
    -------
    The predictions of ``model`` for ``X_val``.
    """
    predictions = model.predict(X_val)
    print('Accuracy score: %s ' % name, accuracy_score(y_val, predictions))
    return predictions


y_val_lr = report_val_accuracy('lr', lr)
y_val_clf = report_val_accuracy('clf', clf)
y_val_mnb = report_val_accuracy('mnb', mnb)
y_val_gnb = report_val_accuracy('gnb', gnb)
y_val_knn = report_val_accuracy('knn', knn)
y_val_rf = report_val_accuracy('rf', rf)
y_val_svc = report_val_accuracy('svc', svc)

## Ensemble

Note: You can try running `eider.experimental.pip_import('mlxtend')` to import the library required by the code below. I ran into issues when importing it that way, which is why I ended up using a Jupyter notebook instead of eider.

In [0]:
# The complete training table as plain NumPy arrays:
# the `label` column is the target, every other column is a feature.
y_train_all = train['label'].values
X_train_all = train.drop(columns=['label']).values

In [0]:
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Gradient-boosted tree classifier. It is configured here but is not one of the
# base learners of the stack below.
# Bound to `xgb_clf` so that the imported `xgb` module alias is not overwritten.
xgb_clf = xgb.XGBClassifier(verbosity=1,
                            n_jobs=-1,
                            objective='multi:softprob',
                            n_estimators=500,
                            max_depth=3)

# Grid over the regularisation strength of the stack's meta-classifier;
# consumed by the parameter-tuning cell.
# NOTE(review): the key prefix for meta-classifier parameters depends on the
# installed mlxtend version -- confirm against sc.get_params().keys().
params = {'meta-logisticregression__C': [0.001, 0.01, 0.1, 1, 10.0, 100]}

# Stack of three base learners; their predicted class probabilities
# (use_probas=True) are the inputs of a logistic-regression meta-classifier.
sc = StackingClassifier(
    classifiers=[
        LogisticRegression(penalty='l2', n_jobs=-1, multi_class='auto', solver='lbfgs', max_iter=10000),
        RandomForestClassifier(n_estimators=500, n_jobs=-1),
        SGDClassifier(loss='log', max_iter=1000, tol=1e-3)
    ],
    verbose=1,
    use_probas=True,
    meta_classifier=LogisticRegression(penalty='l2', n_jobs=-1, multi_class='auto', solver='lbfgs', max_iter=10000)
)

# Validation score: fit on the training split only, so that the rows in X_val
# are genuinely unseen when the accuracy is computed.
sc.fit(X_train.values, y_train)

y_val_sc = sc.predict(X_val.values)
print('Accuracy score: sc ', accuracy_score(y_val, y_val_sc))

# Refit on the complete training set so `sc` is ready for the final predictions.
sc.fit(X_train_all, y_train_all)

## Slight param tuning

In [0]:
# Tune the meta-classifier's C with 4-fold cross-validation.
# The search sees the training split only; fitting it on the full training set
# would include the validation rows and make the accuracy printed below
# a score on already-seen data.
grid = GridSearchCV(estimator=sc,
                    param_grid=params,
                    cv=4)
grid.fit(X_train.values, y_train)

y_val_grid = grid.predict(X_val.values)
print('Accuracy score: grid ', accuracy_score(y_val, y_val_grid))

## Cross-validated evaluation of the stacked model

In [0]:
# 4-fold cross-validated accuracy of the stacked model on the full training set.
# `cross_val_score` is called by its bare name because that is how the ensemble
# cell imports it; the `model_selection` module itself is never imported.
score = cross_val_score(sc, train.drop('label', axis=1).values, train['label'].values, cv=4, scoring='accuracy')
# Reported as mean (+/- one standard deviation) across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std()))

# __Submission__

In [0]:
# Predict a label for every row of the test features with the fitted stack.
y_test_pred = sc.predict(test)

# Two-column submission table: the ID (taken back out of the index) and the
# predicted label.
result = test.reset_index().loc[:, ['ID']].assign(label=y_test_pred)

# Write the table without the positional index, with an explicit header row.
result.to_csv(
    path_or_buf='/Users/hwaaikke/mle/tiny/hwaaikke_mle_tiny_submission.csv',
    encoding='utf-8',
    index=False,
    header=['ID', 'label'],
)