<a href="https://colab.research.google.com/github/sabyasm/ipython-notebooks/blob/master/sub_001_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U mlens

Requirement already up-to-date: mlens in /usr/local/lib/python3.6/dist-packages (0.2.3)


In [0]:
# Notebook-wide imports and global setup.
#
# The blanket warnings filter is installed first, before the heavier library
# imports below, so warnings raised while those libraries are imported are
# ignored as well. Keep it ahead of the other imports.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import accuracy_score
import xgboost as xgb

# mlens scorer built from sklearn's accuracy_score (higher is better).
# NOTE(review): accuracy_scorer is not referenced in the visible cells.
from mlens.metrics import make_scorer
accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)


# mlens ensemble / model-selection classes.
from mlens.ensemble import SuperLearner
from mlens.model_selection import Evaluator
from mlens.ensemble import SequentialEnsemble



# A host of Scikit-learn models
# NOTE(review): several of these imports are not used in the visible cells.
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from scipy.stats import randint


# **Download Feature Data stored in pickle format**

In [0]:
# Install PyDrive and build an authenticated Google Drive client (`drive`).
# NOTE(review): PyDrive is installed unpinned (-U); consider pinning a version
# so re-runs resolve the same library.
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# Presumably triggers Colab's Google sign-in for the current user; confirm
# against the google.colab documentation.
auth.authenticate_user()
gauth = GoogleAuth()
# Give PyDrive the application-default credentials rather than running
# PyDrive's own auth flow.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Fetch the pickled feature bundle from Google Drive and save it in the
# working directory as features.pkl.
# Source file: https://drive.google.com/open?id=1KvpsAYprIzDISjY3pIltj8rTjhkBmq5C
download = drive.CreateFile(dict(id='1KvpsAYprIzDISjY3pIltj8rTjhkBmq5C'))
download.GetContentFile('features.pkl')

In [0]:
# Load the pickled feature bundle; assumes features.pkl is already present in
# the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load a features.pkl that comes from a trusted source.
import pickle

# The file holds five objects pickled back-to-back, so they have to be read in
# this exact order: train features, test features, the two label collections
# (labels[0] then labels[1]), and finally the ids.
# The `with` block guarantees the handle is closed even if a load fails midway.
with open('features.pkl', 'rb') as file:
    train_features = pickle.load(file)
    test_features = pickle.load(file)
    labels = [pickle.load(file), pickle.load(file)]
    uid = pickle.load(file)

In [0]:
# Short aliases for the feature sets, plus the two label collections converted
# to NumPy arrays.
xtrain = train_features
xtest = test_features
ytrain = np.array(labels[0])
ytest = np.array(labels[1])

In [8]:
# Three-layer mlens SequentialEnsemble with a random-forest meta learner.
# Reference result: 0.783, produced 3 times so far.
seed = 2017
np.random.seed(seed)
warnings.filterwarnings("ignore", category=DeprecationWarning)

ensemble = SequentialEnsemble()

# Layer 1 - blended layer (same kind of layer as in the BlendEnsemble).
blend_learners = [
    LogisticRegression(C=1),
    RandomForestClassifier(random_state=seed, n_jobs=-1),
    MLPClassifier((80, 10), early_stopping=False, random_state=seed),
    GradientBoostingClassifier(n_estimators=100, random_state=seed),
]
ensemble.add('blend', blend_learners)

# Layer 2 - stacked layer (same kind of layer as in the SuperLearner).
stack_learners = [
    LogisticRegression(C=1),
    RandomForestClassifier(random_state=seed, n_jobs=-1),
    MLPClassifier((80, 10), early_stopping=False, random_state=seed),
]
ensemble.add('stack', stack_learners)

# Layer 3 - subsembled layer (same kind of layer as in the Subsemble).
subsemble_learners = [
    SVC(),
    xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7),
    GaussianNB(),
]
ensemble.add('subsemble', subsemble_learners)

# Meta estimator, added the same way as for any other mlens ensemble.
ensemble.add_meta(RandomForestClassifier(random_state=seed))

# Train on the training split, then predict labels for the test split.
ensemble.fit(xtrain, ytrain)
preds = ensemble.predict(xtest)

print("Fit data:\n%r" % ensemble.data)

Fit data:
                                             ft-m  ft-s  pt-m  pt-s
layer-1  gradientboostingclassifier  0    1115.98  0.00  0.20  0.00
layer-1  logisticregression          0      27.05  0.00  0.18  0.00
layer-1  mlpclassifier               0     201.72  0.00  0.69  0.00
layer-1  randomforestclassifier      0       3.35  0.00  0.23  0.00
layer-2  logisticregression          0       0.00  0.00  0.00  0.00
layer-2  mlpclassifier               0       0.87  0.34  0.01  0.00
layer-2  randomforestclassifier      0       0.12  0.00  0.10  0.00
layer-3  gaussiannb                  0       0.00  0.00  0.00  0.00
layer-3  gaussiannb                  1       0.00  0.00  0.00  0.00
layer-3  svc                         0       0.01  0.00  0.01  0.00
layer-3  svc                         1       0.01  0.00  0.02  0.00
layer-3  xgbclassifier               0       0.03  0.00  0.00  0.00
layer-3  xgbclassifier               1       0.03  0.00  0.00  0.00

Prediction score: 0.783


In [10]:
# Accuracy of the ensemble predictions against the test labels.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels go first; the printed value is unaffected because accuracy is symmetric.
print("Prediction score: %.5f" % accuracy_score(ytest, preds))

Prediction score: 0.78295


In [0]:
# Write the predictions to "<model_name>.csv" (columns: 'Unique ID', 'label'),
# then pass the file to google.colab's files.download.
from google.colab import files

model_name = "sub_001_ensemble_783"
csv_name = model_name + ".csv"

ts = pd.DataFrame({'Unique ID': uid, 'label': preds})
ts.to_csv(csv_name, index=False)
files.download(csv_name)