In [1]:
import os
import pprint
import pickle
import scipy
import numpy as np
import pandas as pd
import warnings as ws
ws.filterwarnings('ignore')

#Sklearn functions
from contextlib import contextmanager
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#Gradient Boosters
from catboost import CatBoostClassifier
import xgboost as xgb

#Hyper parameter optimzation options
from hyperopt import Trials, tpe, hp, fmin, STATUS_OK
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time, ctime

In [2]:
times = ['time%s' % i for i in range(1,11)]
train_df = pd.read_csv('../../../datasets/alice/train_sessions.csv', index_col='session_id', parse_dates=times)
test_df = pd.read_csv('../../../datasets/alice/test_sessions.csv', index_col='session_id', parse_dates=times)
train_df = train_df.sort_values(by='time1')

In [38]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
class DataPreparator(BaseEstimator, TransformerMixin):
    """
    Fill Nan with zero values
    """
    def fit(self, values):
        return self
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1,11)]
        return X[sites].fillna(0).astype('int')
    

In [4]:
class ListPreparator(BaseEstimator, TransformerMixin):
    """
    Prepare a CVectorizer 2-D from data
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.values.tolist()
        return [" ".join([str(site) for site in row]) for row in X]


In [5]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        hour = X['time1'].apply(lambda ts: ts.hour)
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        day = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')
        # night = ((hour >= 0) > (hour <= 6)).astype('int')
    
        
        month = X['time1'].apply(lambda ts: ts.month)
        summer = ((month >= 6) & (month <= 8)).astype('int')
        
        weekday = X['time1'].apply(lambda ts: ts.weekday()).astype('int')
        
        year = X['time1'].apply(lambda ts: ts.year).astype('int')

        X = np.c_[morning.values, day.values, evening.values, summer.values, weekday.values, year.values]
        return X


In [6]:
class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Add attributes that needs to be scaled
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        times = ['time%s' % i for i in range(1,11)]
        session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[s]').astype('int') ** .2
        number_of_sites = X[times].isnull().sum(axis=1).apply(lambda x: 10 - x)
        time_per_site = (session_duration / number_of_sites) ** .2
        # X = np.c_[session_duration.values, time_per_site.values]
        X = np.c_[session_duration.values]
        
        return X


In [7]:

vectorizer_pipeline = Pipeline([
    ('preparator', DataPreparator()),
    ('list_preparator', ListPreparator()),
    # ('vectorizer', TfidfVectorizer(**vectorizer_params))
    ('vectorizer', CountVectorizer(ngram_range=(1,3), max_features=50000))
    # ('vectorizer', TfidfVectorizer(ngram_range=(1,3), max_features=50000, tokenizer=lambda s: s.split()))
])

attributes_pipeline = Pipeline([
    ('adder', AttributesAdder())
])

scaled_attribs_pipeline = Pipeline([
    ('adder', ScaledAttributesAdder()),
    ('scaler', StandardScaler())
])


full_pipeline = FeatureUnion(transformer_list=[
    ('vectorizer_pipeline', vectorizer_pipeline),
    ('attributes_pipeline', attributes_pipeline),
    ('scaled_attribs_pipeline', scaled_attribs_pipeline)
])

In [8]:
X_train = full_pipeline.fit_transform(train_df)
X_test = full_pipeline.transform(test_df)
y_train = train_df['target'].astype('int').values

In [9]:
X_train

<253561x50007 sparse matrix of type '<class 'numpy.float64'>'
	with 4355861 stored elements in Compressed Sparse Row format>

## HYPERPARAMETER TUNING WITH HYPEROPT

In [10]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import numpy as np
from time import time

In [11]:
time_split = TimeSeriesSplit(n_splits=5)

In [12]:
param_gridsearch = {
    'clf__learning_rate' : [0.01, 0.1, 1],
    'clf__max_depth' : [5, 10, 15],
    'clf__n_estimators' : [5, 20, 35], 
    'clf__num_leaves' : [5, 25, 50],
    'clf__boosting_type': ['gbdt', 'dart'],
    'clf__colsample_bytree' : [0.6, 0.75, 1],
    'clf__reg_lambda': [0.01, 0.1, 1],
}

In [13]:

param_random = {
    'clf__learning_rate': list(np.logspace(np.log(0.01), np.log(1), num = 500, base=3)),
    'clf__max_depth': list(range(5, 15)),
    'clf__n_estimators': list(range(5, 35)),
    'clf__num_leaves': list(range(5, 50)),
    'clf__boosting_type': ['gbdt', 'dart'],
    'clf__colsample_bytree': list(np.linspace(0.6, 1, 500)),
    'clf__reg_lambda': list(np.linspace(0, 1, 500)),
}

In [14]:

lgb_param_hyperopt= {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 51, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 5, 30, 1)),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}


In [15]:

def coarse_search(estimator, parameters, X_train, y_train, optimizer='grid_search', n_iter=None, scoring='accuracy', verbose=True):
    start = time()
    if optimize == 'grid_search':
        grid_obj = GridSearchCV(estimator=estimator,
                                param_grid=parameters,
                                cv=5,
                                refit=True,
                                return_train_score=False,
                                scoring=scoring,
                                verbose=True)
    elif optimizer == 'random_search':
        grid_obj = RandomizedSearchCV(estimator=estimator,
                                        param_distributions=parameters,
                                        cv=5,
                                        n_iter=n_iter,
                                        refit=True,
                                        return_train_score=False,
                                        scoring='accuracy',
                                        random_state=17,
                                        verbose=True)

    else: return 'Enter Search Method'
    
    local_best_estimator = grid_obj.best_estimator_
    cvs = cross_val_score(local_best_estimator, X_train, y_train, cv=5, scoring=scoring)
    results = pd.DataFrame(grid_obj.cv_results_)

    print("Results")
    print("========")
    print(f"Best Score: {grid_obj.best_score_}")
    print(f"Best parameters: {grid_obj.best_params_}")
    print(f"Cross Validation score mean: {cvs.mean()}")
    print(f"Cross Validation score std: {cvs.std()}")
    print(f"No of parameters combined: {result.shape[0]}")
    print(f"Time Elapsed: {time() - start}")

    return results, local_best_estimator

In [16]:

def hp_param_tuner(param_space, X_train, y_train, num_eval, cv=5, scoring='accuracy'):
    start = time()

    def objective_function(params):
        classifier = lgb.LGBMClassifier(**params)
        cv_score = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=cv, scoring=scoring)
        cv_score_mean = cv_score.mean()    
        print(f"LGB CV mean: {cv_score_mean}")
        return {'loss':1-cv_score_mean, 'status': STATUS_OK }
    
    trials = Trials()
    best_param = fmin(objective_function,
                        param_space,
                        algo=tpe.suggest,
                        max_evals=num_eval,
                        trials=trials,
                        rstate=np.random.RandomState(17))
    
    loss = [x['result']['loss'] for x in trials.trials]

    best_param_values = [x for x in best_param.values()]
    boosting_type = 'gbdt' if best_param_values[0] == 0 else 'dart'
    
    classifier_best = lgb.LGBMClassifier(learning_rate=best_param_values[2],
                                  num_leaves=int(best_param_values[5]),
                                  max_depth=int(best_param_values[3]),
                                  n_estimators=int(best_param_values[4]),
                                  boosting_type=boosting_type,
                                  colsample_bytree=best_param_values[1],
                                  reg_lambda=best_param_values[6],
                                 )
    classifier_best.fit(X_train, y_train)

    print("Results")
    print("========")
    print(f"Best Score: {-1 * min(loss)}")
    print(f"Best parameters: {best_param}")
    print(f"Time Elapsed: {time() - start}")
    print(f"No of parameters combined: {num_eval}")

In [22]:
lightgbm_hyperopt = hp_param_tuner(lgb_param_hyperopt, X_train, y_train, 50, TimeSeriesSplit(n_splits=5), 'roc_auc')

LGB CV mean: 0.874392591534259                        
LGB CV mean: 0.8691017234514924                                                     
LGB CV mean: 0.8743467221204583                                                     
LGB CV mean: 0.9163912284001526                                                     
LGB CV mean: 0.7405163817112621                                                     
LGB CV mean: 0.8854368951525011                                                     
LGB CV mean: 0.6726955367133411                                                     
LGB CV mean: 0.7508691515197301                                                     
LGB CV mean: 0.8803721099456199                                                     
LGB CV mean: 0.9060814625375404                                                     
LGB CV mean: 0.8080954170648564                                                      
LGB CV mean: 0.9153978834306692                                                      
LGB CV m

In [55]:
# best_params = {'boosting_type': 'gbdt', 'colsample_bytree': 0.970094563116432, 'learning_rate': 0.23393385174878922, 'max_depth': int(12.0), 'n_estimators': int(36.0), 'num_leaves': int(24.0), 'reg_lambda': 0.5681669098426798}
best_params = {'boosting_type': 'gbdt', 'colsample_bytree': 0.970094563116432, 'learning_rate': 0.23393385174878922, 'max_depth': int(12.0), 'n_estimators': int(100.0), 'num_leaves': int(24.0), 'reg_lambda': 0.5681669098426798}

In [56]:
best_lgb = lgb.LGBMClassifier(**best_params)

In [57]:
best_lgb

LGBMClassifier(colsample_bytree=0.970094563116432,
               learning_rate=0.23393385174878922, max_depth=12, num_leaves=24,
               reg_lambda=0.5681669098426798)

In [35]:
cross_val = cross_val_score(estimator=best_lgb, X=X_train, y=y_train, cv=TimeSeriesSplit(n_splits=5), scoring='roc_auc')

In [58]:
best_lgb.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.970094563116432,
               learning_rate=0.23393385174878922, max_depth=12, num_leaves=24,
               reg_lambda=0.5681669098426798)

In [37]:
cross_val.mean()

0.9194131103688307

In [59]:
best_lgb_predict_labels = best_lgb.predict_proba(X_test)[:, 1]

In [60]:
write_to_submission_file(best_lgb_predict_labels, os.path.join('~/Desktop/python/DSML/ds_workspace/kaggle/codebase/alice/output/', 'estim_tuned_hyperopt_lgb_predict_labels.csv'))

In [44]:
estimator_grid = {
    'n_estimators': [36, 100, 1000, 2000]
}

In [43]:
params = {'boosting_type': 'gbdt', 'colsample_bytree': 0.970094563116432, 'learning_rate': 0.23393385174878922, 'max_depth': int(12.0), 'num_leaves': int(24.0), 'reg_lambda': 0.5681669098426798}
grid_search_lgb = lgb.LGBMClassifier(**params)

In [46]:
lgb_gridcv = GridSearchCV(estimator=grid_search_lgb, param_grid=estimator_grid, cv=TimeSeriesSplit(n_splits=5), scoring='roc_auc', verbose=True)

In [47]:
lgb_gridcv.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 217.5min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             estimator=LGBMClassifier(colsample_bytree=0.970094563116432,
                                      learning_rate=0.23393385174878922,
                                      max_depth=12, num_leaves=24,
                                      reg_lambda=0.5681669098426798),
             param_grid={'n_estimators': [36, 100, 1000, 2000]},
             scoring='roc_auc', verbose=True)

In [48]:
print(lgb_gridcv.best_params_)
print(lgb_gridcv.best_estimator_)

{'n_estimators': 100}
LGBMClassifier(colsample_bytree=0.970094563116432,
               learning_rate=0.23393385174878922, max_depth=12, num_leaves=24,
               reg_lambda=0.5681669098426798)


In [50]:
cross_val = cross_val_score(estimator=best_lgb, X=X_train, y=y_train, cv=TimeSeriesSplit(n_splits=10), scoring='roc_auc')

In [51]:
print(cross_val.mean())

0.9044590662589881


In [52]:
lgb_gridcv.best_score_

0.9194775293550762

In [54]:
lgb_gridcv.fit(X_train, y_train)
lgb_grid_predicted_labels = lgb_gridcv.predict(X_test)[:, 1]
write_to_submission_file(lgb_grid_predicted_labels, 'tuned_lgb_grid.csv')

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 