In [1]:
import pandas as pd, numpy as np, time, sys, h5py
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from keras.layers import Input, Dense , Dropout , TimeDistributed , LSTM , GRU, concatenate, BatchNormalization
from keras.models import Model
from keras.optimizers import SGD , Adadelta, RMSprop, Adam, Adamax
from keras.models import  load_model
from keras.callbacks import EarlyStopping
from keras.utils import  to_categorical 
from keras.regularizers import l1, l2, l1_l2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pickle
from sklearn.svm import SVC

In [2]:
# Initialize problem parameters
class Args:
    """ Class containing all model arguments """
    def __init__( self ):
        self.project    = 'MLchallenge_DontOverfit'
        self.dataPath   = '/home/harsh/Downloads/DontOverfit/'       .format(self.project)
        self.modelsPath = '/home/harsh/Downloads/DontOverfit/Models/' .format(self.project)
        self.resultsPath= '/home/harsh/Downloads/DontOverfit/Results/'.format(self.project)
        self.CV_folds   = 40  # split the Training data in stratified folds, to train different versions of models 
args = Args()

In [4]:
# LOAD DATA
train = pd.read_csv( args.dataPath + 'TTT_train.csv' )
test = pd.read_csv( args.dataPath + 'TTT_test_features.csv', index_col = 'ID')
print(train.describe())

                f0           f1           f2           f3           f4  \
count  1244.000000  1244.000000  1244.000000  1244.000000  1244.000000   
mean      0.000566     0.000697     0.000468     0.001733     0.000708   
std       0.019962     0.024577     0.016497     0.031072     0.024959   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     0.000000     0.000000     0.000000   
max       0.704060     0.866833     0.581853     0.709016     0.880315   

                f5           f6           f7           f8           f9  ...  \
count  1244.000000  1244.000000  1244.000000  1244.000000  1244.000000  ...   
mean      0.000717     0.000585     0.000357     0.007151     0.000693  ...   
std       0.025296     0.020650     0.012606     0.050962     0.024434  ...   
min       0.00000

In [23]:
# Remove outlier

import numpy as np
from sklearn.ensemble import IsolationForest

isf = IsolationForest(contamination='auto', behaviour='new', n_jobs=-1)
isf.fit(train.drop('label', axis=1), train['label'])
y_train_outlier = isf.predict(train.drop('label', axis=1))
train = train[np.where(y_train_outlier == 1, True, False)]

  warn(


In [24]:
# Model Evaluation

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train.drop('label', axis=1), train["label"].values, test_size=0.3, shuffle=True)

In [25]:
#Training the model and Testing Accuracy on Validation data
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

parameters = {'penalty':['l2'], 'C': np.arange(0.05, 1.05, 0.05)}

lr = LogisticRegression(n_jobs=-1, multi_class='auto', solver='lbfgs', class_weight='balanced', max_iter=10000)
lr.fit(X_train, y_train)

clf = GridSearchCV(lr, parameters, cv=9)
clf.fit(train.drop('label', axis=1), train['label'])

mnb = MultinomialNB(alpha=0.1)
mnb.fit(X_train, y_train)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)
knn.fit(X_train, y_train)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=1)
rf.fit(X_train, y_train)

svc = SVC(gamma='scale', decision_function_shape='ovo')
svc.fit(X_train, y_train)

y_val_lr = lr.predict(X_val)
print('Accuracy score: lr ', accuracy_score(y_val, y_val_lr))
y_val_clf = clf.predict(X_val)
print('Accuracy score: clf ', accuracy_score(y_val, y_val_clf))
y_val_mnb = mnb.predict(X_val)
print('Accuracy score: mnb ', accuracy_score(y_val, y_val_mnb))
y_val_gnb = gnb.predict(X_val)
print('Accuracy score: gnb ', accuracy_score(y_val, y_val_gnb))
y_val_knn = knn.predict(X_val)
print('Accuracy score: knn ', accuracy_score(y_val, y_val_knn))
y_val_rf = rf.predict(X_val)
print('Accuracy score: rf ', accuracy_score(y_val, y_val_rf))
y_val_svc = svc.predict(X_val)
print('Accuracy score: svc ', accuracy_score(y_val, y_val_svc))

Accuracy score: lr  0.7941176470588235
Accuracy score: clf  0.8850267379679144
Accuracy score: mnb  0.7165775401069518
Accuracy score: gnb  0.5374331550802139
Accuracy score: knn  0.6390374331550802
Accuracy score: rf  0.7887700534759359
Accuracy score: svc  0.7299465240641712


# Ensemble

In [26]:
X_train_all = train.drop('label', axis=1).values
y_train_all = train['label'].values

In [29]:
!pip install mlxtend  

Collecting mlxtend
  Downloading mlxtend-0.17.3-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 1.6 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.3


In [30]:
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb

xgb = xgb.XGBClassifier(verbosity=1,
                        n_jobs=-1,
                        objective='multi:softprob', 
                        n_estimators=500,
                        max_depth=3)

params = {'meta-logisticregression__C': [0.001, 0.01, 0.1, 1, 10.0, 100]}

sc = StackingClassifier(
    classifiers=[
        LogisticRegression(penalty='l2', n_jobs=-1, multi_class='auto', solver='lbfgs', max_iter=10000),
        RandomForestClassifier(n_estimators=500, n_jobs=-1),
        SGDClassifier(loss='log', max_iter=1000, tol=1e-3)
    ],
    verbose=1,
    use_probas=True,
    meta_classifier=LogisticRegression(penalty='l2', n_jobs=-1, multi_class='auto', solver='lbfgs', max_iter=10000)
)

sc.fit(X_train_all, y_train_all)

y_val_sc = sc.predict(X_val)
print('Accuracy score: sc ', accuracy_score(y_val, y_val_sc))

Fitting 3 classifiers...
Fitting classifier1: logisticregression (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: sgdclassifier (3/3)
Accuracy score: sc  1.0


In [39]:
sc.get_params().keys()
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV

# Slight param tuning


In [35]:
'''
grid = GridSearchCV(estimator=sc, 
                    param_grid=params, 
                    cv=4)
grid.fit(X_train_all, y_train_all)

y_val_grid = grid.predict(X_val)
print('Accuracy score: grid ', accuracy_score(y_val, y_val_grid))
'''

"\ngrid = GridSearchCV(estimator=sc, \n                    param_grid=params, \n                    cv=4)\ngrid.fit(X_train_all, y_train_all)\n\ny_val_grid = grid.predict(X_val)\nprint('Accuracy score: grid ', accuracy_score(y_val, y_val_grid))\n"

# Model Evaluation

In [42]:
import sklearn
score = sklearn.model_selection.cross_val_score(sc, train.drop('label', axis=1).values, train['label'].values, cv=4, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std()))

Fitting 3 classifiers...
Fitting classifier1: logisticregression (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: sgdclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: logisticregression (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: sgdclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: logisticregression (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: sgdclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: logisticregression (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: sgdclassifier (3/3)
Accuracy: 0.78 (+/- 0.02)


# Submission

In [45]:
y_test_pred = sc.predict(test)
result = test.reset_index()[['ID']].copy()
result['label'] = y_test_pred

result.to_csv(path_or_buf= args.dataPath + 'mle_tiny_submission.csv' , encoding='utf-8', index=False, header=['ID', 'label'])