<a href="https://colab.research.google.com/github/sabyasm/manu_data_competition/blob/master/02_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML-Ensemble (`mlens`) for ensembling and visualizing model correlation

In [1]:
!pip install -U mlens

Collecting mlens
[?25l  Downloading https://files.pythonhosted.org/packages/0b/f7/c04bda423ac93ddb54bc4c3a21c79c9a24bc83844efc30dc4c11c289e894/mlens-0.2.3-py2.py3-none-any.whl (227kB)
[K    4% |█▍                              | 10kB 16.2MB/s eta 0:00:01[K    8% |██▉                             | 20kB 4.5MB/s eta 0:00:01[K    13% |████▎                           | 30kB 6.3MB/s eta 0:00:01[K    17% |█████▊                          | 40kB 4.0MB/s eta 0:00:01[K    22% |███████▏                        | 51kB 4.9MB/s eta 0:00:01[K    26% |████████▋                       | 61kB 5.7MB/s eta 0:00:01[K    31% |██████████                      | 71kB 6.4MB/s eta 0:00:01[K    35% |███████████▌                    | 81kB 7.2MB/s eta 0:00:01[K    40% |█████████████                   | 92kB 7.9MB/s eta 0:00:01[K    44% |██████████████▍                 | 102kB 6.5MB/s eta 0:00:01[K    49% |███████████████▉                | 112kB 6.6MB/s eta 0:00:01[K    53% |█████████████████▎  

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import accuracy_score
import xgboost as xgb

from mlens.metrics import make_scorer
# Wrap scikit-learn's accuracy_score as an mlens scorer (higher is better).
# NOTE(review): accuracy_scorer is not referenced by any later cell visible in
# this notebook; presumably intended for mlens' Evaluator — confirm before removing.
accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)


from mlens.ensemble import SuperLearner
from mlens.model_selection import Evaluator
from mlens.ensemble import SequentialEnsemble

# A host of Scikit-learn models
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from scipy.stats import randint


[MLENS] backend: threading


# **Download Feature Data stored in pickle format**

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# Statement order matters: the Colab user must be authenticated before the
# application-default credentials are fetched and attached to the GoogleAuth object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the later cells use to download the feature pickle and
# to upload the submission CSV.
drive = GoogleDrive(gauth)

# Change the `id` value to the one produced by the previous notebook - alternatively, use mine as is

In [0]:
# Google Drive file id of the pickled feature bundle.
# https://drive.google.com/open?id=1xGT1h3XgWEmZTEkDPMpKIyK7odRNCqy6 -- this is the url of my gdrive location. you can use this or create your own
features_file_ref = {'id': '1xGT1h3XgWEmZTEkDPMpKIyK7odRNCqy6'}

# Fetch the Drive file and store its content locally for the loading cell below.
download = drive.CreateFile(features_file_ref)
download.GetContentFile('final_features.pkl')

In [0]:
# Assuming pickle feats are already downloaded
import pickle

# The pickle file holds five objects dumped back-to-back; they must be read in
# exactly this order: train features, test features, two label sets, ids.
#
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load 'final_features.pkl' if it comes from a Drive location you trust.
#
# The `with` block guarantees the file handle is closed even if one of the
# loads raises (e.g. truncated download).
with open('final_features.pkl', 'rb') as file:
    train_features = pickle.load(file)
    test_features = pickle.load(file)
    # List items are evaluated left to right, so labels[0] is read before labels[1].
    # labels[0] is used as the training target later in the notebook; labels[1]
    # is not used by any cell visible here.
    labels = [pickle.load(file), pickle.load(file)]
    uid = pickle.load(file)

In [0]:
# Bind the short names used by the modelling cell.
xtrain = train_features
xtest = test_features
# The first label set is the training target, converted to a NumPy array.
ytrain = np.array(labels[0])

# Ensemble of base models that are not highly correlated - each reaches 73-77% accuracy on its own


---

Note: This step takes around 1 hr to 1 hr 20 min.

In [9]:
# stable CV - Adam confirmed 79.07% Acc - 5th Nov submission

%%time
seed = 2017
np.random.seed(seed)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

ensemble = SequentialEnsemble()

# The initial layer is a blended layer, same as a layer in the BlendEnsemble
ensemble.add('blend',
             [LogisticRegression(C=1), 
              RandomForestClassifier(random_state=seed, n_jobs=-1),
              MLPClassifier((80, 10), early_stopping=False, random_state=seed,verbose=1),
              GradientBoostingClassifier(n_estimators=100, random_state=seed)
             ])

# The second layer is a stacked layer, same as a layer of the SuperLearner
ensemble.add('stack', 
             [LogisticRegression(C=1), 
              RandomForestClassifier(random_state=seed, n_jobs=-1),
              MLPClassifier((80, 10), early_stopping=False, random_state=seed,verbose=1),
              xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7),
             ])

# The third layer is a subsembled layer, same as a layer of the Subsemble
ensemble.add('subsemble', [SVC(), xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7),GaussianNB(),LogisticRegression(C=1)])

# The meta estimator is added as in any other ensemble
ensemble.add_meta(RandomForestClassifier(random_state=seed))

# Fit ensemble
ensemble.fit(xtrain, ytrain)

# Predict
preds = ensemble.predict(xtest)
print("Fit data:\n%r" % ensemble.data)

Iteration 1, loss = 0.57237954
Iteration 2, loss = 0.48625287
Iteration 3, loss = 0.46510122
Iteration 4, loss = 0.43994066
Iteration 5, loss = 0.41058972
Iteration 6, loss = 0.38506532
Iteration 7, loss = 0.36606947
Iteration 8, loss = 0.33814708
Iteration 9, loss = 0.31588283
Iteration 10, loss = 0.28227581
Iteration 11, loss = 0.26030913
Iteration 12, loss = 0.23270564
Iteration 13, loss = 0.20569402
Iteration 14, loss = 0.18669746
Iteration 15, loss = 0.15554527
Iteration 16, loss = 0.14285135
Iteration 17, loss = 0.12167131
Iteration 18, loss = 0.10843283
Iteration 19, loss = 0.08802905
Iteration 20, loss = 0.07835036
Iteration 21, loss = 0.07718902
Iteration 22, loss = 0.06067011
Iteration 23, loss = 0.05008399
Iteration 24, loss = 0.04408593
Iteration 25, loss = 0.03842513
Iteration 26, loss = 0.03402232
Iteration 27, loss = 0.03139835
Iteration 28, loss = 0.02618692
Iteration 29, loss = 0.02402492
Iteration 30, loss = 0.02140145
Iteration 31, loss = 0.01961303
Iteration 32, los

# **Generate Prediction**

In [0]:
# Assemble the submission table (one row per id with its predicted label) and
# write it to "<model_name>.csv" without the DataFrame index.
model_name = "sub_002_ensemble"
submission_columns = {'Unique ID': uid, 'label': preds}
ts = pd.DataFrame(submission_columns)
ts.to_csv(model_name + ".csv", index=False)


In [0]:
from google.colab import files

# Hand the submission CSV written by the previous cell to Colab's download helper.
submission_csv = "sub_002_ensemble.csv"
files.download(submission_csv)

# Optional - Save the prediction file to Google Drive (in case the runtime dies)

In [0]:
# Copy the local submission CSV to Google Drive under the same title.
drive_csv_name = 'sub_002_ensemble.csv'
upload = drive.CreateFile({'title': drive_csv_name})
upload.SetContentFile(drive_csv_name)
upload.Upload()