In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn import *
from scipy import *

from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy.optimize import minimize_scalar

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

import re

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold;

from sklearn.decomposition import TruncatedSVD



In [2]:
from xgboost import XGBClassifier

from sklearn import datasets
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import confusion_matrix

iris = datasets.load_iris()
X = iris.data
y = iris.target

clf = XGBClassifier()
pred = cross_val_predict(clf, X, y)
cm = confusion_matrix(y, pred)
print(cm)

[[50  0  0]
 [ 0 47  3]
 [ 0  2 48]]


In [3]:
# Load the list of documents
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
all_data = pd.concat((train.loc[:,'ps_ind_01':'ps_calc_20_bin'],
                      test.loc[:,'ps_ind_01':'ps_calc_20_bin']))

In [None]:
#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [5]:
all_data = pd.get_dummies(all_data)

In [6]:
all_data = all_data.fillna(all_data.mean())

In [7]:
svd = TruncatedSVD(n_components=30, n_iter=7, random_state=42)
svd.fit(all_data)  

TruncatedSVD(algorithm='randomized', n_components=30, n_iter=7,
       random_state=42, tol=0.0)

In [8]:
all_data = svd.transform(all_data)

In [9]:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y_train = train['target']

In [None]:
# LightGBM parameters
denom = 0
fold = 5 #Change to 5, 1 for Kaggle Limits
for i in range(fold):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        #'num_class': 9,
        'learning_rate': 0.01,
        'max_depth': 28,
        'num_leaves': 136,
        'min_data_in_leaf': 8,
        'lambda_l1': 0.02,
        'num_iteration': 2000,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 5,
        'colsample_bytree': 0.3542618105439753
        #'verbose': 0
    }
    X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_train, y_train, test_size=0.18, random_state=i)
        
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv)
    lgb_eval = lgb.Dataset(X_test_cv, y_test_cv, reference=lgb_train)

# train
    gbm = lgb.train(params,
            lgb_train,
            #num_boost_round=rnds,
            valid_sets=lgb_eval,
            verbose_eval=10,
            early_stopping_rounds=20)
    
    if denom != 0:
        pred = gbm.predict(X_test, num_iteration=gbm.best_iteration+50)      
        preds += pred
    else:
        pred = gbm.predict(X_test, num_iteration=gbm.best_iteration+50)
        preds = pred.copy()
    denom += 1
    #submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    #submission = submission.reset_index()
    #submission.columns = ["ID","class1","class2","class3","class4","class5","class6","class7","class8","class9"]
    #submission['ID'] = pid
    #submission.to_csv('submission_xgb_fold_'  + str(i) + '.csv', index=False)
    
preds /= denom
submission = pd.read_csv("sample_submission.csv")
submission['target'] = preds
submission.columns = ["id","target"]
submission.to_csv("sub_LGBM.csv", index=False)
#submission['ID'] = pid
#y_pred = gbm.predict(X_test_vect_lgbm, num_iteration=gbm.best_iteration)
#y_pred = np.argmax(y_pred, axis=1)

In [None]:
def gini(truth, predictions):
    g = np.asarray(np.c_[truth, predictions, np.arange(len(truth)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(truth) + 1) / 2.
    return gs / len(truth)

def gini_xgb(predictions, truth):
    truth = truth.get_label()
    return 'gini', -1.0 * gini(truth, predictions) / gini(truth, truth)

def gini_lgb(truth, predictions):
    score = gini(truth, predictions) / gini(truth, truth)
    return 'gini', score, True

def gini_sklearn(truth, predictions):
    return gini(truth, predictions) / gini(truth, truth)

gini_scorer = make_scorer(gini_sklearn, greater_is_better=True, needs_proba=True)

In [None]:
def objective(params):
    params = {
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'max_depth': int(params['max_depth']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'lambda_l1': '{:.3f}'.format(params['lambda_l1']),
    }
    
    clf = lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.01,
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold(5)).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

space = {
    'num_leaves': hp.quniform('num_leaves', 100, 200, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'max_depth': hp.quniform('max_depth', 5, 30, 2),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 20, 1),
    'lambda_l1': hp.uniform('lambda_l1', 0.01, 0.05),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
print("Hyperopt estimated optimum {}".format(best))

### LGBM optimum v1
Hyperopt estimated optimum {'colsample_bytree': 0.3542618105439753, 'lambda_l1': 0.020025402620244666, 'max_depth': 28.0, 'min_data_in_leaf': 8.0, 'num_leaves': 136.0}

### Xgboost optimization

In [None]:
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        nthread=4,
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold(5)).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return(score)

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

In [None]:
print("Hyperopt estimated optimum {}".format(best))

### xgboost optimum v1
Hyperopt estimated optimum {'colsample_bytree': 0.42220114349224486, 'gamma': 0.22144103013844213, 'max_depth': 6.0}

In [None]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
    }
    clf = RandomForestClassifier(
        n_jobs=4, 
        class_weight='balanced', 
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold()).mean()
    
    print("Gini {:.3f} params {}".format(score, params))
    
    return(score)

space = {
    'n_estimators': hp.quniform('n_estimators', 25, 500, 25),
    'max_depth': hp.quniform('max_depth', 1, 10, 1)
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

## Stucking

In [10]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
#ntrain = 1000
#ntest = 1000
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

class XgbWrapper2(object):
    def __init__(self, clf, seed=0, params=None):
        params['seed'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, X_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

class LgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.n_estimators = params.pop('n_estimators', 500)

    def train(self, X_train, y_train):
        lgb_train = lgb.Dataset(X_train, y_train)
        self.gbdt = lgb.train(self.param, lgb_train, self.n_estimators)

    def predict(self, x):
        return self.gbdt.predict(x)
    
# Class to extend XGboost classifer

In [11]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [12]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

#Xgboosting parameters
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'max_depth': 7,
    #'num_parallel_tree': 1,
    'min_child_weight': 1,
    #'eval_metric': 'auc',
    'n_estimators': 350
}

# Light GBM parameters
lgb_params = {
    'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        #'num_class': 9,
        'learning_rate': 0.01,
        'max_depth': 28,
        'num_leaves': 136,
        'min_data_in_leaf': 8,
        'lambda_l1': 0.02,
        'num_iteration': 2000,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 5,
        #'colsample_bytree': 0.3542618105439753
    }

In [13]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
xg = XgbWrapper2(clf=xgb.XGBClassifier, seed=SEED, params=xgb_params)
lg = SklearnHelper(clf=lgb.LGBMClassifier, seed=SEED, params=lgb_params)
#xg = XgbWrapper(seed=SEED, params=xgb_params) 
#lg = LgbWrapper(seed=SEED, params=lgb_params)

In [14]:
y_train = y_train.ravel()
x_train = X_train # Creates an array of the train data
x_test = X_test # Creats an array of the test data

In [None]:
# Create our OOF train and test predictions. These base results will be used as new features
#et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
#rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
#ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
#gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier
xg_oof_train, xg_oof_test = get_oof(xg,x_train, y_train, x_test) # Support Vector Classifier
lg_oof_train, lg_oof_test = get_oof(lg,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

In [None]:
x_train = np.concatenate((xg_oof_train, lg_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, lg_oof_test), axis=1)

In [None]:
# LightGBM parameters
denom = 0
fold = 5 #Change to 5, 1 for Kaggle Limits
for i in range(fold):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        #'num_class': 9,
        'learning_rate': 0.01,
        'max_depth': 28,
        'num_leaves': 136,
        'min_data_in_leaf': 8,
        'lambda_l1': 0.02,
        'num_iteration': 2000,
        'feature_fraction': 0.8, 
        'bagging_fraction': 0.8, 
        'bagging_freq': 5,
        'colsample_bytree': 0.3542618105439753
        #'verbose': 0
    }
    X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(x_train, y_train, test_size=0.18, random_state=i)
        
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv)
    lgb_eval = lgb.Dataset(X_test_cv, y_test_cv, reference=lgb_train)

# train
    gbm = lgb.train(params,
            lgb_train,
            #num_boost_round=rnds,
            valid_sets=lgb_eval,
            verbose_eval=10,
            early_stopping_rounds=20)
    
    if denom != 0:
        pred = gbm.predict(x_test, num_iteration=gbm.best_iteration+50)      
        preds += pred
    else:
        pred = gbm.predict(x_test, num_iteration=gbm.best_iteration+50)
        preds = pred.copy()
    denom += 1
    #submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    #submission = submission.reset_index()
    #submission.columns = ["ID","class1","class2","class3","class4","class5","class6","class7","class8","class9"]
    #submission['ID'] = pid
    #submission.to_csv('submission_xgb_fold_'  + str(i) + '.csv', index=False)
    
preds /= denom
submission = pd.read_csv("sample_submission.csv")
submission['target'] = preds
submission.columns = ["id","target"]
submission.to_csv("sub_LGBM.csv", index=False)
#submission['ID'] = pid
#y_pred = gbm.predict(X_test_vect_lgbm, num_iteration=gbm.best_iteration)
#y_pred = np.argmax(y_pred, axis=1)