In [1]:
%load_ext autoreload
%reload_ext autoreload
import numpy as np
import pandas as pd
from datetime import datetime
from functools import partial
import lightgbm as lgb
import scipy
from multiprocessing import *
from Utils.Custom_loss_functions import gini_normalized,gini_lgb,gini
from Data_Prep.data_prep import *
from Models.lgb_model import *
from Utils.Log_Driver import get_log,reinitiate_logfile
import traceback
import logging
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from math import exp, log
import xgboost as xgb
import data_prep



In [2]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        tmin, tsec = divmod((datetime.now() - start_time).total_seconds(), 60)
        logger.info(' Time taken: %i minutes and %s seconds.' %
              (tmin, round(tsec, 2)))


def scale_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [3]:
def Load_data():
    time = timer()
    train = pd.read_csv("/home/saurabhg/PuertoSergo/train.csv")
    timer(time)
    test = pd.read_csv("/home/saurabhg/PuertoSergo/test.csv")
    timer(time)
    y = train["target"]
    testid= test['id'].values
    trainid = train["id"].values
    train.drop(['id','target'],axis=1,inplace=True)
    test.drop(['id'],axis=1,inplace=True)
    return train,test,y,testid,trainid

In [4]:
def prep_data(train, test,y,rm_calc_Cols = True,reconfig_ps_reg =True,one_hot = True,target_encoding = False):
    time = timer()
    f_cats = [f for f in train.columns if "_cat" in f]
    if rm_calc_Cols:
        unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
        train = train.drop(unwanted, axis=1)  
        test = test.drop(unwanted, axis=1)
        timer(time)
        logger.info("calc fields dropped")
    if reconfig_ps_reg:
        train['ps_reg_A'] = train['ps_reg_03'].apply(lambda x: recon(x)[0])
        train['ps_reg_M'] = train['ps_reg_03'].apply(lambda x: recon(x)[1])
        train['ps_reg_A'].replace(19,-1, inplace=True)
        train['ps_reg_M'].replace(51,-1, inplace=True)
        test['ps_reg_A'] = test['ps_reg_03'].apply(lambda x: recon(x)[0])
        test['ps_reg_M'] = test['ps_reg_03'].apply(lambda x: recon(x)[1])
        test['ps_reg_A'].replace(19,-1, inplace=True)
        test['ps_reg_M'].replace(51,-1, inplace=True)
        timer(time)
        logger.info("Reconfigured PS Reg 03")
    if one_hot:
        d_median = train.median(axis=0)
        d_mean = train.mean(axis=0)
        d_skew = train.skew(axis=0)
        one_hot = {c: list(train[c].unique()) for c in train.columns if c not in ['id','target']}
        
        train = multi_transform(train,d_median,d_mean,one_hot)
        test = multi_transform(test,d_median,d_mean,one_hot)
        timer(time)
        logger.info("One hot encoded variables")
        
    if target_encoding:
        for f in f_cats:
            train[f + "_avg"],test[f + "_avg"] = target_encode(trn_series=train[f],
                                                                tst_series=test[f],
                                                                target=y,
                                                                min_samples_leaf=200,
                                                                smoothing=10,
                                                                noise_level=0)
        timer(time)
        logger.info("Target Encoding of categorical variables")
    return train,test     

In [5]:
#logger =reinitiate_logfile(logger,"First_log")

In [6]:
class KFold_ensemble():    
    def __init__(self,train= None,y=None,test= 1,trainid= None,testid = None,
                 n_splits =2,shuffle= True,random_state = 15,params = None,
                 stratify = True,classifier= "LGB",Cat_features = None):
        self.n_splits = n_splits
        self.testid = testid
        self.trainid = trainid
        self.shuffle = shuffle
        self.random_state = random_state
        self.classifier = classifier
        self.params = params
        self.train = train
        self.test = test
        self.Cat_features = Cat_features
        self.stratify=stratify
        self.y=y
        self.folds = self.Stratified_flag()
        self.model_param = self.Makemodel()
        #self.sub_preds = np.zeros(len(sub_df))
        
    def Modelselect(self):
            return (lightgbm().build_from_json(self.params)).lgb_params()
        
        
    def get_kfold_parameters(self,clf):
        if self.n_splits>1:
            self.imp_df = np.zeros((len(self.train.columns), self.n_splits))
            self.evals = np.zeros((clf.n_estimators, self.n_splits))
            self.oof = np.empty(len(train))
            
    
    def Stratified_flag(self):
        if self.stratify == True:
            return StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state ) 
        else:
            return KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state ) 
        
    def test_available(self):
        if not self.test.empty:
            self.sub_preds = np.zeros((len(self.test),self.n_splits))
            print(1)
        else:
            self.sub_preds =None
            
    def Makemodel(self):
        for fold_, (trn_idx, val_idx) in enumerate(self.folds.split(self.y, self.y)):
            trn_dat, trn_tgt = self.train.iloc[trn_idx], self.y.iloc[trn_idx]
            val_dat, val_tgt = self.train.iloc[val_idx], self.y.iloc[val_idx]
            logger.info (fold_)
            if self.classifier=="LGB":
                clf=(lightgbm().build_from_json(self.params)).lgb_params()
                train_params  = lgb_train().build_from_json(self.params)
                if train_params.eval_set:
                    train_params.set_eval_set(trn_dat,trn_tgt,val_dat,val_tgt)
                    if self.Cat_features:
                        train_params.Cat_features(Cat_features) 
                clf = train_params.fit(clf,trn_dat,trn_tgt)
                if fold_==0:
                    self.get_kfold_parameters(clf)
                    self.test_available()
                # Find best round for validation setA
                self.evals[:, fold_] = clf.evals_result_["valid_1"][clf.evals_result_["valid_1"].keys()[0]]
                # Keep feature importances
                self.imp_df[:, fold_] = clf.feature_importances_
                # Predict OOF and submission probas with the best round
                best_round = np.argsort(self.evals[:, fold_])[::-1][0]
                 # Update submission
                self.oof[val_idx] = clf.predict(val_dat, num_iteration=best_round)
                if not self.test.empty:
                    self.sub_preds[:, fold_] = clf.predict(self.test, num_iteration=best_round)
#                 logger.info("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
#                                                             % (fold_ + 1,
#                                                             gini_normalized(val_tgt, self.oof[val_idx]),
#                                                             clf.n_estimators,
#                                                             len(self.evals),
#                                                             best_round))
    def variable_importance(self):
        importances = sorted([(self.train.columns[i], imp) for i, imp in enumerate(self.imp_df.mean(axis=1))],
                            key=lambda x: x[1])
        logger.info("printing variable importance......")
        for f, imp in importances[::-1]:
            logger.info("%-34s : %10.4f" % (f, imp))
                
    def submission_file(self,name = "Testsubmission"):
        sub = pd.DataFrame()
        sub['id'] = self.testid
        print self.oof
        sub["target"] = scipy.stats.hmean(self.sub_preds,axis=1)
        sub.to_csv(name+".csv", index=False, float_format="%.9f")
        logger.info("CSV written to %s ......"%name)
        

In [7]:
params  ={
    "tuning_parameters":{
        "n_estimators" : 100
    },
    "train_parameters":{
        "eval_set" : True,
        "early_stopping_rounds" :25,
        "verbose":20,
        "eval_metric" : gini_lgb
    }
}


In [8]:
if __name__ == "__main__":
    logger = get_log("First_log")
    train,test,y,testid,trainid = Load_data()
    f_cats = [f for f in train.columns if "_cat" in f]
    train,test = prep_data(train,test,y,
                           rm_calc_Cols= True,
                           reconfig_ps_reg =False,
                           one_hot = False,
                           target_encoding =True )
    lgb_ensemble = KFold_ensemble(train,y,test,trainid=trainid,testid=testid,params=params)
    lgb_ensemble.variable_importance()
    lgb_ensemble.submission_file("dummy")
    
    

INFO:__info__: Time taken: 0 minutes and 3.78 seconds.
INFO:__info__: Time taken: 0 minutes and 9.28 seconds.
INFO:__info__: Time taken: 0 minutes and 0.16 seconds.
INFO:__info__:calc fields dropped
INFO:__info__: Time taken: 0 minutes and 4.47 seconds.
INFO:__info__:Target Encoding of categorical variables
INFO:__info__:0


1
Train until valid scores didn't improve in 25 rounds.
[20]	valid_0's gini_lgb: 0.267908	valid_1's gini_lgb: 0.237953
[40]	valid_0's gini_lgb: 0.273105	valid_1's gini_lgb: 0.241591
[60]	valid_0's gini_lgb: 0.277401	valid_1's gini_lgb: 0.244207
[80]	valid_0's gini_lgb: 0.278675	valid_1's gini_lgb: 0.245414
[100]	valid_0's gini_lgb: 0.279425	valid_1's gini_lgb: 0.246362

INFO:__info__:1



1
Train until valid scores didn't improve in 25 rounds.
[20]	valid_0's gini_lgb: 0.269243	valid_1's gini_lgb: 0.244828
[40]	valid_0's gini_lgb: 0.274557	valid_1's gini_lgb: 0.248061
[60]	valid_0's gini_lgb: 0.279064	valid_1's gini_lgb: 0.251659
[80]	valid_0's gini_lgb: 0.280586	valid_1's gini_lgb: 0.252492
[100]	valid_0's gini_lgb: 0.282923	valid_1's gini_lgb: 0.254003

INFO:__info__:printing variable importance......
INFO:__info__:ps_car_13                          :     0.0965
INFO:__info__:ps_reg_03                          :     0.0959
INFO:__info__:ps_ind_17_bin                      :     0.0654
INFO:__info__:ps_ind_03                          :     0.0575
INFO:__info__:ps_car_11_cat_avg                  :     0.0544
INFO:__info__:ps_ind_05_cat_avg                  :     0.0526
INFO:__info__:ps_reg_02                          :     0.0519
INFO:__info__:ps_ind_15                          :     0.0491
INFO:__info__:ps_ind_05_cat                      :     0.0365
INFO:__info__:ps_car_01_cat_avg                  :     0.0356
INFO:__info__:ps_reg_01                          :     0.0346
INFO:__info__:ps_ind_01                          :     0.0312
INFO:__info__:ps_car_14                          :     0.0243
INFO:__info__:ps_ind_07_bin                      :     0.0240
INFO:__info__:ps_car_07_cat                      :     0.0234
INFO:__info__:ps_car_


[ 0.21873235  0.20278264  0.20244026 ...,  0.20352998  0.20773393
  0.20672752]

INFO:__info__:CSV written to dummy ......





In [9]:
lgb_ensemble.sub_preds[:,0]

array([ 0.20016314,  0.20223653,  0.20535973, ...,  0.20963646,
        0.2055928 ,  0.2038792 ])

In [10]:
test.empty

False