In [395]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

data_train= pd.read_csv('train.csv')
data_test= pd.read_csv('test.csv')
data_all= pd.concat([data_train, data_test])
data_all = data_all.drop(['Survived'], axis=1)
m_train = data_train.shape[0]
m_test = data_test.shape[0]

In [396]:
#preprocessing
from sklearn import preprocessing

def Simplify_cabins(data):
    data.Cabin = data.Cabin.fillna('N')
    data.Cabin = data.Cabin.apply(lambda x: x[0])
    return data

def Add_Fare(data):
    data.Fare = data.Fare.fillna(0)
    if len(data.Fare[data.Fare==0])>0:
        fare = np.zeros(3)
        for i in range(0,3):
            fare[i] = data[data['Pclass'] == i+1]['Fare'].dropna().median()
        for i in range(0,3):
            data.loc[(data.Fare==0)&(data['Pclass']==i+1), 'Fare'] = fare[i]
    return data

def NameSuffix(data):
    data['LastName'] = data['Name'].apply(lambda x: x.split(',')[0])
    data['FirstName'] = data['Name'].apply(lambda x: x.split(',')[1])
    data['NameSuffix'] = data['FirstName'].apply(lambda x: x.split('.')[0])
    return data

def Encode_features(data):
    e_features = ['Cabin', 'Embarked',  'Sex']
    for feature in e_features:
        data[feature] = data[feature].fillna(0)
        le = preprocessing.LabelEncoder().fit(data[feature])
        data[feature] = le.transform(data[feature])
    return data

def Feature_preprocess(data):
    data = NameSuffix(data)
    data = Add_Fare(data)
    data = Simplify_cabins(data)
    data = Encode_features(data)
    return data

data_all = Feature_preprocess(data_all)

#Simplify name suffix: suffix with less than 10 counts simplified based sex, family size, and title.
namesuffix = {' Col': 'Sld', ' Don': 'Mrs', ' Mme': 'Mrs', ' Major': 'Sld', ' Lady': 'Mrs', ' Sir': 'Mr', 
              ' Mlle': 'Miss', ' the Countess': 'Mrs', ' Jonkheer': 'Mr', ' Capt': 'Mr', ' Mr': 'Mr', ' Mrs': 'Mrs',
             ' Miss': 'Miss', ' Master': 'Master', ' Rev': 'Rev', ' Dr': 'Dr', ' Ms': 'Miss', ' Dona': 'Mrs'}
data_all["NameSuffix"] = data_all["NameSuffix"].map(namesuffix)

#Add features FamilySize and IsAlone
data_all['FamilySize'] = data_all['SibSp'] + data_all['Parch'] + 1

# data_all['IsAlone'] = 0
# data_all.loc[data_all['FamilySize']==1, 'IsAlone'] = 1

In [393]:
##Imputer the missing values
#Missing Fare
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

#Missing Embarked
data_embarked = data_all[['Embarked', 'Cabin', 'Fare', 'Pclass', 'Sex', 'FamilySize']]

data_embarked_exist = data_embarked.loc[(data_embarked.Embarked!=0)]
data_embarked_null = data_embarked.loc[(data_embarked.Embarked==0)]
x_embarked = data_embarked_exist.iloc[:, 1:]
y_embarked = data_embarked_exist.iloc[:, 0]
rfc_e = RandomForestClassifier(n_estimators=100)
rfc_e.fit(x_embarked, y_embarked)
y_embarked_hat = rfc_e.predict(data_embarked_null.iloc[:, 1:])
data_all.loc[data_all.Embarked==0, 'Embarked'] = y_embarked_hat

#Missing Age
data_age = data_all[['Age', 'Pclass', 'Sex', 'Fare', 'FamilySize']]
data_age_exist = data_age.loc[(data_all.Age.notnull())]
data_age_null = data_age.loc[(data_all.Age.isnull())]
x_age = data_age_exist.values[:, 1:]
y_age = data_age_exist.values[:, 0]

rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(x_age, y_age)
y_hat_age = rfr.predict(data_age_null.values[:,1:])
data_all.loc[(data_all.Age.isnull()), 'Age'] = y_hat_age

#Missing Cabin
data_cabin = data_all[['Cabin', 'Pclass', 'Sex', 'Fare', 'FamilySize', 'Embarked', 'Age']]
data_cabin_exist = data_cabin.loc[(data_cabin.Cabin!=7)]
data_cabin_null = data_cabin.loc[(data_cabin.Cabin==7)]
x_cabin = data_cabin_exist.iloc[:, 1:]
y_cabin = data_cabin_exist.iloc[:, 0]
rfc_c = RandomForestClassifier(n_estimators=100)
rfc_c.fit(x_cabin, y_cabin)
y_cabin_hat = rfc_c.predict(data_cabin_null.iloc[:, 1:])
data_all.loc[data_all.Cabin==7, 'Cabin'] = y_cabin_hat

data_all.loc[data_all['Cabin']==8,'Cabin'] = data_all['Cabin'].mode()[0]

In [397]:
#Features transform
def bin_ages(data):
    bins=(0,5,12,20,30,55,120)
    group_names=["Baby", "Child", "Teenage", "Young Adult", "Adult", "Senior"]
    categories = pd.cut(data.Age, bins, labels=group_names)
    data.Age=categories
    return data

def bin_fares(data):
    bins = (0,7.9,14.4,31,1000)
    group_names=['1_q', '2_q', '3_q', '4_q']
    categories = pd.cut(data.Fare, bins, labels=group_names)
    data.Fare = categories
    return data

def encode_features(data):
    e_features = ['Age', 'Fare']
    for feature in e_features:
        le = preprocessing.LabelEncoder().fit(data[feature])
        data[feature] = le.transform(data[feature])
    return data

def drop_features(data):
    return data.drop(['Ticket', 'Name', 'SibSp', 'Parch', 'FirstName', 'LastName', 'Cabin'], axis=1)

def transform_feature(data):
    data = bin_ages(data)
    data = bin_fares(data)
    data = drop_features(data)
    #data = encode_features(data)
    return data
 
data_all = transform_feature(data_all)

#Create interaction features: pclass_sex, pclass_familysize, pclass_age
# data_all['Pclass_Sex'] = data_all['Pclass'].astype(str) + data_all['Sex'].astype(str)
# data_all['Pclass_FamilySize'] = data_all['Pclass'].astype(str) + data_all['FamilySize'].astype(str)
#data_all['Pclass_Age'] = data_all['Pclass'].astype(str) + data_all['Age'].astype(str)

dummy_features = ['Embarked', 'NameSuffix', 'Age', 'Fare']

for feature in dummy_features:
    data_all[feature] = data_all[feature].apply(str)

data_all = pd.get_dummies(data_all)

In [300]:
# #collinearity among features: VIF

# from statsmodels.stats.outliers_influence import variance_inflation_factor

# def calculate_VIF(X, threshold):
#     cols = X.columns
#     variables = np.arange(X.shape[1])
#     dropped=True
#     while dropped:
#         dropped=False
#         c = X[cols[variables]].values
#         vif = [variance_inflation_factor(c, idx) for idx in range(c.shape[1])]
#         print vif
#         max_idx = vif.index(max(vif))
#         if max(vif) > threshold:
#             print ("dropped feature:", X[cols[variables]].columns[max_idx], 'at index', str(max_idx))
#             variables = np.delete(variables, max_idx)
#             dropped=True

#     print "remaining variables:", X.columns[variables]
#     return X[cols[variables]]
# data_all = calculate_VIF(data_all,10)

In [398]:
#splitting up the training data
from sklearn.model_selection import train_test_split

x = data_all.iloc[:m_train,:].drop(['PassengerId'], axis=1)
y = data_train['Survived']

data_test = data_all.iloc[m_train:,:].drop(['PassengerId'], axis=1)

num_test = 0.3

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = num_test, random_state=23)

accuracy_dict = {}

In [399]:
#LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(C=1e3, penalty='l2', max_iter=100, fit_intercept=False, solver='liblinear',
                        n_jobs=-1, random_state=0)
lr = lr.fit(x_train, y_train)
y_hat_lr = lr.predict(x_test)
accuracy_dict['LogisticRegression'] = accuracy_score(y_test, y_hat_lr)
print lr
print accuracy_score(y_test, y_hat_lr)

LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.8059701492537313


In [412]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='gini', max_depth=3, max_features=None, min_samples_split=3, min_samples_leaf=2,
                            random_state=0)

dt = dt.fit(x_train, y_train)
y_hat_dt = dt.predict(x_test)
accuracy_dict['DecisionTree'] = accuracy_score(y_test, y_hat_dt)
print dt
print accuracy_dict['DecisionTree']

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
0.8246268656716418


In [378]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
import eli5

rfc = RandomForestClassifier(n_jobs = -1)

rfc = RandomForestClassifier(n_estimators=100, max_depth=5, criterion='gini', max_features='sqrt', min_samples_split=2, n_jobs=-1, verbose=1)
# parameters = {'n_estimators':[100,50], 
#               'max_features':['auto'],
#               'criterion': ['gini'],
#               'max_depth': [2,3,5],
#               'min_samples_split': [2,3],
#               'min_samples_leaf': [1,2]
#              } 
# acc_scorer = make_scorer(accuracy_score)

# rfc = GridSearchCV(rfc, parameters, cv=3, scoring = acc_scorer, n_jobs = -1, verbose = 1)
rfc = rfc.fit(x_train, y_train)
print rfc.feature_importances_
y_hat_rfc = rfc.predict(x_test)
accuracy_dict['RandomForest'] = accuracy_score(y_test, y_hat_rfc)
print rfc
print accuracy_dict['RandomForest']
# eli5.show_weights(rfc, feature_names=list(x_train.columns))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


[0.13520887 0.37782965 0.08562976 0.02239871 0.02254523 0.01395401
 0.00594902 0.0100111  0.01266106 0.00794333 0.04319542 0.00993308
 0.01170539 0.01261566 0.02351762 0.00135794 0.01764268 0.10066107
 0.08222105 0.00109207 0.00192729]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)
0.8246268656716418


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
x_test_copy = x_test
x_test_copy['y_actual'] = y_test
x_test_copy['y_hat_rfc'] = y_hat_rfc
x_test_copy['miss_classified'] = x_test_copy['y_actual'] - x_test_copy['y_hat_rfc']
x_test_copy.to_csv('x_test_rfc.csv')

In [370]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100, max_features='sqrt', max_depth=5, criterion='gini', random_state=0, n_jobs=-1,verbose=1)

# params = {'n_estimators': [100,1000,10000], 'max_features': ['sqrt', 'log2'], 'max_depth': [2,3,5],
#           'min_samples_split': [2,3,5], 'min_samples_leaf': [1,2,3]}

# etc = GridSearchCV(etc, params, cv=3, n_jobs=-1, verbose=1)
etc=etc.fit(x_train, y_train)
#print etc.feature_importances_
y_hat_etc=etc.predict(x_test)
accuracy_dict['ExtraTrees'] = accuracy_score(y_test, y_hat_etc)

print etc
print accuracy_dict['ExtraTrees']

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


[0.12547344 0.35998483 0.03660822 0.02303087 0.02033133 0.01021307
 0.00258779 0.0048605  0.0107459  0.00634028 0.05126883 0.00833641
 0.00643645 0.01067546 0.02670783 0.00144168 0.01698544 0.15038597
 0.12418472 0.00128177 0.00211921]
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=5, max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=1, warm_start=False)
0.8171641791044776


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [372]:
x_train.columns 

Index([u'Pclass', u'Sex', u'FamilySize', u'IsAlone', u'Age_Baby', u'Age_Child',
       u'Age_Teenage', u'Age_Young Adult', u'Age_Adult', u'Age_Senior',
       u'Embarked_1', u'Embarked_2', u'Fare_2_q', u'Fare_3_q', u'Fare_4_q',
       u'NameSuffix_Dr', u'NameSuffix_Master', u'NameSuffix_Miss',
       u'NameSuffix_Mrs', u'NameSuffix_Rev', u'NameSuffix_Sld'],
      dtype='object')

In [373]:
#Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

gbc = GradientBoostingClassifier(learning_rate=0.02, n_estimators=100, max_depth=3, max_features = 'auto', min_samples_split=3, loss='deviance')

# params = {'loss': ['deviance'], 'learning_rate': [0.02, 0.1, 0.2], 'n_estimators': [100, 1000], 'max_depth': [2,3,5],
#           'min_samples_split': [2,3]}
# gbc = GridSearchCV(gbc, params, cv=3, n_jobs=-1, verbose=1)
gbc = gbc.fit(x_train, y_train)
#print gbc.feature_importances_
y_hat_gbc = gbc.predict(x_test)
accuracy_dict['GradientBoosting'] = accuracy_score(y_test, y_hat_gbc)
print gbc
print accuracy_dict['GradientBoosting']

[0.15915073 0.51734977 0.09985942 0.         0.01918804 0.0055361
 0.00369718 0.00248064 0.01057819 0.02334828 0.02097423 0.02557982
 0.02149662 0.02251591 0.         0.         0.05835859 0.
 0.         0.         0.00988647]
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.02, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.8171641791044776


In [374]:
#AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(learning_rate=0.2, n_estimators=500, random_state=0)
ada = ada.fit(x_train, y_train)
#print ada.feature_importances_
y_hat_ada = ada.predict(x_test)

accuracy_dict['AdaBoost'] = accuracy_score(y_test, y_hat_ada)
print ada
print accuracy_dict['AdaBoost']

[0.082 0.086 0.2   0.026 0.056 0.01  0.06  0.044 0.03  0.022 0.048 0.06
 0.042 0.01  0.034 0.018 0.088 0.032 0.044 0.004 0.004]
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.2, n_estimators=500, random_state=0)
0.8022388059701493


In [None]:
#SVM
from sklearn.svm import SVC

svc = SVC(kernel='rbf', gamma='auto', C=1)

svc = svc.fit(x_train, y_train)

y_hat_svc = svc.predict(x_test)

accuracy_dict['SVC'] = accuracy_score(y_test, y_hat_svc)
print svc
print accuracy_dict['SVC']

In [376]:
#XGBoost
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgbc = xgb.XGBClassifier(base_score=0.5, max_depth=2, learning_rate=0.6, n_estimators=1000, objective = 'binary:logistic', 
                        booster = 'gbtree', reg_alpha=0.2, reg_lambda=1, random_state=0)
# params = {'max_depth': [2,3,6], 'learning_rate': [0.05, 0.1, 0.2], 'n_estimators': [100,1000], 
#           'objective': ['binary:logistic'], 'booster': ['gbtree'], 'reg_alpha': [0.1, 0.2]}
# xgbc = GridSearchCV(xgbc, params, n_jobs=-1)
xgbc = xgbc.fit(x_train, y_train)
print xgbc.feature_importances_
y_hat_xgbc = xgbc.predict(x_test)
accuracy_dict['XGBoost'] = accuracy_score(y_test, y_hat_xgbc)
print xgbc
print accuracy_score(y_test, y_hat_xgbc)

[0.1091703  0.08005822 0.22561863 0.         0.01892285 0.00727802
 0.02620087 0.04512373 0.02328967 0.01746725 0.06259097 0.04512373
 0.04512373 0.09170306 0.05822416 0.         0.08296943 0.05094614
 0.01018923 0.         0.        ]
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.6, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.2, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
0.8171641791044776


In [377]:
x_train.columns

Index([u'Pclass', u'Sex', u'FamilySize', u'IsAlone', u'Age_Baby', u'Age_Child',
       u'Age_Teenage', u'Age_Young Adult', u'Age_Adult', u'Age_Senior',
       u'Embarked_1', u'Embarked_2', u'Fare_2_q', u'Fare_3_q', u'Fare_4_q',
       u'NameSuffix_Dr', u'NameSuffix_Master', u'NameSuffix_Miss',
       u'NameSuffix_Mrs', u'NameSuffix_Rev', u'NameSuffix_Sld'],
      dtype='object')

In [345]:
accuracy_dict

{'AdaBoost': 0.8022388059701493,
 'ExtraTrees': 0.8171641791044776,
 'GradientBoosting': 0.8171641791044776,
 'LogisticRegression': 0.7947761194029851,
 'RandomForest': 0.8208955223880597,
 'SVC': 0.8246268656716418}

In [287]:
#Define a class for running the first level base models

class BaseModelClassifier(object):
    def __init__(self, clf, seed=0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def fit(self, x, y):
        return self.clf.fit(x,y)
            
    def predict(self, x):
        return self.clf.predict(x)
    
#     def feature_importance(self, x, y):
#         print (self.clf.fit(x,y).feature_importances_)
    

In [351]:
#base models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb

lg = BaseModelClassifier(LogisticRegression, seed=0, params=lg_params)
rf = BaseModelClassifier(RandomForestClassifier, seed=0, params=rf_params)
gb = BaseModelClassifier(GradientBoostingClassifier, seed=0, params=gb_params)
ada = BaseModelClassifier(AdaBoostClassifier, seed=0, params=ada_params)
svm = BaseModelClassifier(SVC, seed=0, params=svm_params)
xgb = BaseModelClassifier(XGBClassifier, seed=0, params=xgb_params)
lgb = BaseModelClassifier(lgb.LGBMClassifier, seed=0, params=lgb_params)
et = BaseModelClassifier(ExtraTreesClassifier, seed=0, params=et_params)

In [352]:
#Out-of-fold predictions: to avoid overfitting and as stacking train data
from sklearn.cross_validation import KFold
from scipy import stats

nfolds = 5
kf = KFold(m_train, n_folds = nfolds, random_state=0)
def OOFold(clf, x_train, y_train, x_test):
    OOF_train = np.zeros((m_train,))
    OOF_test = np.zeros((m_test,))
    OOF_test_kf = np.zeros((nfolds, m_test))
    
    for i, (train_index, test_index) in enumerate(kf):
        x_kf_train = x_train.iloc[train_index]
        y_kf_train = y_train.iloc[train_index]
        
        x_kf_test = x_train.iloc[test_index]
        clf.fit(x_kf_train, y_kf_train)
        
        OOF_train[test_index] = clf.predict(x_kf_test)
        
        OOF_test_kf[i, :] = clf.predict(x_test)
        
    OOF_test_mean = OOF_test_kf.mean(axis=0)
    OOF_test_kf_df = pd.DataFrame(data=OOF_test_kf)
    OOF_test_mode = OOF_test_kf_df.mode(axis=0)
    OOF_test_mode = OOF_test_mode.values

    
    return OOF_train, OOF_test_mean, OOF_test_mode
    

In [353]:
models = [lg, rf, gb, ada, svm, xgb, lgb, et]

def get_oof_prediction(models):
    nrows_train = data_train.shape[0]
    nrows_test = data_test.shape[0]
    oof_train_pred = np.zeros((nrows_train, len(models)))
    oof_test_mean_pred = np.zeros((nrows_test, len(models)))
    oof_test_mode_pred = np.zeros((nrows_test, len(models)))
    for i in range(len(models)):
        model = models[i]
        oof_train_pred[:, i], oof_test_mean_pred[:, i], oof_test_mode_pred[:, i] = OOFold(model, x, y, data_test)
    oof_train_pred = pd.DataFrame(data=oof_train_pred)
    oof_test_mean_pred = pd.DataFrame(data=oof_test_mean_pred)
    oof_test_mode_pred = pd.DataFrame(data=oof_test_mode_pred)
    return oof_train_pred, oof_test_mean_pred, oof_test_mode_pred
        
oof_train, oof_test_mean, oof_test_mode = get_oof_prediction(models)

columns = {0: 'lg', 1: 'rf', 2: 'gb', 3: 'ada', 4: 'svm', 5: 'xgb', 6: 'lgb', 7: 'et'}
oof_train = oof_train.rename(columns=columns)
oof_test_mean = oof_test_mean.rename(columns=columns)
oof_test_mode = oof_test_mode.rename(columns=columns)

accuracy_base_oof = {}
for col in oof_train.columns.tolist():
    accuracy_base_oof[col]= accuracy_score(y, oof_train[col])

print accuracy_base_oof

{'lg': 0.819304152637486, 'svm': 0.8215488215488216, 'lgb': 0.8170594837261503, 'xgb': 0.8294051627384961, 'rf': 0.8148148148148148, 'gb': 0.8271604938271605, 'ada': 0.813692480359147, 'et': 0.8069584736251403}


In [354]:
import plotly.plotly as py
import plotly.graph_objs as go
plotly.tools.set_credentials_file(username='rzhou1', api_key='rGZu418WMZgckbLCiEtm')

data = [go.Heatmap(z=oof_train.astype(float).corr().values,
                  y=oof_train.columns.values,
                  x=oof_train.columns.values,
                  showscale=True)]
py.iplot(data, filename='oof_train_corr')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~rzhou1/0 or inside your plot.ly account where it is named 'oof_train_corr'


In [381]:
#LightGBM
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

lgb = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',random_state=0)

params = {'learning_rate': [0.05, 0.1, 0.2], 'n_estimators': [100,500,1000,5000], 'num_leaves': [5,10,31], 
          'max_depth': [2,3,5,8], 'min_child_samples': [3,5,8]}
lgb = GridSearchCV(lgb, params, cv=3, verbose=1, n_jobs=-1)
lgb.fit(oof_train, y)
y_stack_hat_lgb = lgb.predict(oof_test_mean)

y_stack_hat_lgb = pd.DataFrame(data=y_stack_hat_lgb)
y_stack_hat_lgb = pd.DataFrame(data={'PassengerId': data_all.iloc[m_train:, 0], 'Survived': y_stack_hat_lgb.iloc[:,0]})
y_stack_hat_lgb.to_csv('sub_stack1_sel6_lgb.csv', index=False)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 283 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 883 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  3.1min finished


In [None]:
#Stacking packages

# https://github.com/vecxoz/vecstack - a Python package for stacking 
# https://github.com/rasbt/mlxtend - Machine learning extension package with stacking 
# https://github.com/mpearmain/gestalt - Data pipeline package with stacking features 
# https://github.com/ndemir/stacking - Python helper functions and examples.