# Stacking code for last-man-standing hackathon by AnalyticsVidhya
### Code created on : 31st Jan 2015 | Author: Bargava

Link to competition
http://datahack.analyticsvidhya.com/contest/last-man-standing

#### Solution Approach
In this approach, we split the training dataset into two halves. With the first half, we build models that are then used to predict on the second half. Those predictions are used as features, along with the original features of the second half of the training set, to build the final classifier.

In [None]:
#Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import StratifiedShuffleSplit
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.cluster import KMeans

In [None]:
# Load the train, test, sample-submission and label CSVs (in that order) as
# pandas DataFrames.
train, test, samplesub, label = map(pd.read_csv, (
    "../data/trainUpdated.csv",
    "../data/testUpdated.csv",
    "../data/samplesubUpdated.csv",
    "../data/labelsUpdated.csv",
))

In [None]:
# Switch the model inputs from DataFrames to plain numpy arrays.
# samplesub is left untouched and stays a DataFrame.
train, test, label = (np.array(frame) for frame in (train, test, label))

In [None]:
# A single stratified 50/50 shuffle split of the training rows: the first index
# set is used for the layer-1 models, the second for the layer-2 model.
layer_1_indices = StratifiedShuffleSplit(label, n_iter=1, test_size=0.5)

In [None]:
# The splitter is configured for one iteration, so take its only
# (first-half, second-half) pair of index arrays.
train_1_index, train_2_index = next(iter(layer_1_indices))
layer_1_train, layer_1_label = train[train_1_index], label[train_1_index]
layer_2_train, layer_2_label = train[train_2_index], label[train_2_index]

In [None]:
# Sanity check on the stratified split: print the per-class proportions of the
# full training labels and of each half; they should be close to each other.
for split_name, split_label in (("train:", label),
                                ("layer_1_train:", layer_1_label),
                                ("layer_2_train:", layer_2_label)):
    class_counts = np.unique(split_label, return_counts=True)[1]
    class_share = class_counts.astype(float) / split_label.shape[0]
    # One pre-formatted string, so the line reads "<name> <proportions>".
    print("%s %s" % (split_name, class_share))

# Layer 1 Models

# Model 1: *shallow* `xgboost`

In [None]:
# Shallow xgboost: depth-4 trees, 1200 boosting rounds, 3-class softmax
# objective evaluated with the multiclass error rate ("merror").
params = {
    "min_child_weight": 10,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "silent": 0,
    "max_depth": 4,
    "nthread": 6,
    "gamma": 3,
    "objective": "multi:softmax",
    "num_class": 3,
    "verbose": 2,
    "eta": 0.3,
    "base_score": 0,
    "eval_metric": "merror",
    "seed": 13,
}
plst = list(params.items())
num_rounds = 1200

# xgboost trains from a DMatrix; features and labels are cast to float first.
xgtrain_pv = xgb.DMatrix(
    np.array(layer_1_train).astype(float),
    label=np.array(layer_1_label).astype(float),
)
# Only the training matrix is watched, so the per-round metric is the
# training error.
watchlist = [(xgtrain_pv, 'train')]

model_1 = xgb.train(plst, xgtrain_pv, num_rounds, watchlist)

# Model 2: *deep* `xgboost`

In [None]:
# Deep xgboost: depth-14 trees, 400 boosting rounds, 3-class softmax
# objective evaluated with the multiclass error rate ("merror").
params = {
    "min_child_weight": 10,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "silent": 0,
    "max_depth": 14,
    "nthread": 6,
    "gamma": 3,
    "objective": "multi:softmax",
    "num_class": 3,
    "verbose": 2,
    "eta": 0.3,
    "base_score": 0,
    "eval_metric": "merror",
    "seed": 13,
}
plst = list(params.items())
num_rounds = 400

# xgboost trains from a DMatrix; features and labels are cast to float first.
xgtrain_pv = xgb.DMatrix(
    np.array(layer_1_train).astype(float),
    label=np.array(layer_1_label).astype(float),
)
# Only the training matrix is watched, so the per-round metric is the
# training error.
watchlist = [(xgtrain_pv, 'train')]

model_2 = xgb.train(plst, xgtrain_pv, num_rounds, watchlist)

# Model 3: *shallow* `RandomForest`

In [None]:
# Shallow random forest: 1200 trees capped at depth 4, with out-of-bag
# scoring switched on.
model_3 = RandomForestClassifier(
    n_estimators=1200,
    max_depth=4,
    max_features=5,
    min_samples_split=10,
    min_samples_leaf=5,
    oob_score=True,
    n_jobs=6,
)

# Labels are flattened to 1-D before fitting.
model_3.fit(layer_1_train, layer_1_label.ravel())
# Cell output: the out-of-bag score of the fitted forest.
model_3.oob_score_

# Model 4: *deep* `RandomForest`

In [None]:
# Deep random forest: 400 trees capped at depth 14, with out-of-bag
# scoring switched on.
model_4 = RandomForestClassifier(
    n_estimators=400,
    max_depth=14,
    max_features=5,
    min_samples_split=10,
    min_samples_leaf=5,
    oob_score=True,
    n_jobs=6,
)

# Labels are flattened to 1-D before fitting.
model_4.fit(layer_1_train, layer_1_label.ravel())
# Cell output: the out-of-bag score of the fitted forest.
model_4.oob_score_

# Model 5: *shallow* `ExtraTrees`

In [None]:
# Shallow extra-trees ensemble: 1200 trees capped at depth 4. Bootstrapping is
# enabled explicitly alongside out-of-bag scoring.
model_5 = ExtraTreesClassifier(
    n_estimators=1200,
    max_depth=4,
    max_features=5,
    min_samples_split=10,
    min_samples_leaf=5,
    bootstrap=True,
    oob_score=True,
    n_jobs=6,
)

# Labels are flattened to 1-D before fitting.
model_5.fit(layer_1_train, layer_1_label.ravel())
# Cell output: the out-of-bag score of the fitted ensemble.
model_5.oob_score_

# Model 6: *deep* `ExtraTrees`

In [None]:
# Deep extra-trees ensemble: 400 trees capped at depth 14. Bootstrapping is
# enabled explicitly alongside out-of-bag scoring.
model_6 = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=14,
    max_features=5,
    min_samples_split=10,
    min_samples_leaf=5,
    bootstrap=True,
    oob_score=True,
    n_jobs=6,
)

# Labels are flattened to 1-D before fitting.
model_6.fit(layer_1_train, layer_1_label.ravel())
# Cell output: the out-of-bag score of the fitted ensemble.
model_6.oob_score_

# Model 7: `L2 Logistic Regression`

In [None]:
# The logistic regression is trained on standardised inputs. The scaler is
# fitted on the layer-1 half only and kept in `scaler` for later cells.
layer_1_as_float = layer_1_train.astype(float)
scaler = StandardScaler()
scaler.fit(layer_1_as_float)
train_1_layer_lr = scaler.transform(layer_1_as_float)

# L2-penalised logistic regression with C=0.01.
model_7 = LogisticRegression(penalty="l2", C=0.01, verbose=1, n_jobs=6)
model_7.fit(train_1_layer_lr, layer_1_label.ravel())

### Predictions for layer 2

In [None]:
# Score the layer-2 half of the training data with every layer-1 model.

# The two xgboost models predict from a DMatrix; one float DMatrix of the
# layer-2 features serves both.
layer_2_dmatrix = xgb.DMatrix(np.array(layer_2_train).astype(float))
model_1_predict = model_1.predict(layer_2_dmatrix)
model_2_predict = model_2.predict(layer_2_dmatrix)

# Random-forest and extra-trees models: class probabilities on the raw features.
model_3_predict, model_4_predict, model_5_predict, model_6_predict = (
    forest.predict_proba(layer_2_train)
    for forest in (model_3, model_4, model_5, model_6)
)

# Logistic regression: apply the scaler fitted on layer 1 before predicting.
train_2_layer_lr = scaler.transform(layer_2_train.astype(float))
model_7_predict = model_7.predict_proba(train_2_layer_lr)

In [None]:
# Combine the original layer-2 features and all layer-1 predictions into a
# single feature matrix, joined column-wise.
xgb_prediction_columns = np.column_stack((model_1_predict, model_2_predict))
layer_2_train_consolidated = np.concatenate(
    (
        layer_2_train,
        xgb_prediction_columns,
        model_3_predict,
        model_4_predict,
        model_5_predict,
        model_6_predict,
        model_7_predict,
    ),
    axis=1,
)

In [None]:
# Cell output: (rows, columns) of the consolidated layer-2 feature matrix.
np.shape(layer_2_train_consolidated)

# Layer 2 Model

### `xgboost` model

In [None]:
# Layer-2 xgboost: depth-8 trees, 400 boosting rounds, 3-class softmax
# objective, trained on the consolidated layer-2 feature matrix.
params = {
    "min_child_weight": 10,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "silent": 0,
    "max_depth": 8,
    "nthread": 6,
    "gamma": 3,
    "objective": "multi:softmax",
    "num_class": 3,
    "verbose": 2,
    "eta": 0.3,
    "base_score": 0,
    "eval_metric": "merror",
    "seed": 13,
}
plst = list(params.items())
num_rounds = 400

# xgboost trains from a DMatrix; features and labels are cast to float first.
xgtrain_pv = xgb.DMatrix(
    np.array(layer_2_train_consolidated).astype(float),
    label=np.array(layer_2_label).astype(float),
)
# Only the training matrix is watched, so the per-round metric is the
# training error.
watchlist = [(xgtrain_pv, 'train')]

model_layer_2 = xgb.train(plst, xgtrain_pv, num_rounds, watchlist)

# Prediction on test dataset

In [None]:
# Score the test set with every layer-1 model, in the same way as the layer-2
# half of the training data.

# The two xgboost models predict from a DMatrix; one float DMatrix of the test
# features serves both.
test_dmatrix = xgb.DMatrix(np.array(test).astype(float))
model_1_predict = model_1.predict(test_dmatrix)
model_2_predict = model_2.predict(test_dmatrix)

# Random-forest and extra-trees models: class probabilities on the raw features.
model_3_predict, model_4_predict, model_5_predict, model_6_predict = (
    forest.predict_proba(test)
    for forest in (model_3, model_4, model_5, model_6)
)

# Logistic regression: apply the scaler fitted on layer 1 before predicting.
test_layer_lr = scaler.transform(test.astype(float))
model_7_predict = model_7.predict_proba(test_layer_lr)

# Combine the original test features and all layer-1 predictions into a single
# feature matrix, using the same column order as the layer-2 training matrix.
xgb_prediction_columns = np.column_stack((model_1_predict, model_2_predict))
layer_2_test_consolidated = np.concatenate(
    (
        test,
        xgb_prediction_columns,
        model_3_predict,
        model_4_predict,
        model_5_predict,
        model_6_predict,
        model_7_predict,
    ),
    axis=1,
)

# Final stacked prediction from the layer-2 model.
layer_2_test_dmatrix = xgb.DMatrix(
    np.array(layer_2_test_consolidated).astype(float)
)
layer_2_predict = model_layer_2.predict(layer_2_test_dmatrix)

In [None]:
# Cell output: share of test rows assigned to each class by the layer-2 model.
predicted_class_counts = np.unique(layer_2_predict, return_counts=True)[1]
predicted_class_counts.astype(float) / layer_2_predict.shape[0]

In [None]:
# Build the submission frame: the ID column plus the predicted Crop_Damage.
samplesub.columns = ["ID"]
# Submit the stacked (layer-2) predictions. Writing model_1_predict here would
# submit the shallow xgboost model alone and discard the whole stacking
# pipeline, leaving layer_2_predict computed but unused.
samplesub["Crop_Damage"] = layer_2_predict
# Write the prediction to a csv (without the DataFrame index).
samplesub.to_csv("../submission/submission_31jan_2.csv", index=False)

In [None]:
# On the leaderboard (LB), this gave a score of 0.846