In [1]:
import os
import time
import json
import datetime as dt
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = lambda x : "{:,.2f}".format(x)
plt.rcParams['figure.figsize'] = (12,8)

## Agenda:
    - Load in train data set with selected engineered features
    - Train a number of different models and evaluate
        - Where possible use grid search
        - For larger parameter spaces use Bayesian Optimization or sequential parameter tuning.
    - Load in Test dataset and transform features so they align with our training dataset
    - Make predictions with best models
    - Ensemble model predictions

### Load In Dataset

In [3]:
# raw train and test
train = pd.read_csv('clean_data/train.csv', dtype={'Id':str})
test = pd.read_csv('clean_data/test.csv', dtype={'Id':str})

train.shape, test.shape

((15120, 54), (565892, 55))

In [4]:
# w engineered features
poly_train = pd.read_csv('clean_data/train_poly_final.csv')
poly_train.shape

(15120, 200)

In [5]:
# Reduce in-memory size of pandas dataframe by compressing dtypes
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else: df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
poly_train = reduce_mem_usage(poly_train)
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

ytrain = train['Cover_Type']

Memory usage of dataframe is 23.07 MB
Memory usage after optimization is: 5.77 MB
Decreased by 75.0%
Memory usage of dataframe is 6.23 MB
Memory usage after optimization is: 1.01 MB
Decreased by 83.8%
Memory usage of dataframe is 237.46 MB
Memory usage after optimization is: 38.32 MB
Decreased by 83.9%


### Modeling:

    - Logistic Regression
    - LDA
    - KNN
    - SVM
    - Random Forest and ExtraTrees
    - Multi Layer Perceptron
    - XGBoost
    - LightGBM

In [8]:
# no nulls
poly_train.isnull().sum().sum()

0

In [14]:
# Config
seed=1111
NCV=4 

def grid_search(mod, x, y, param_grid, scale=False):
    cv = StratifiedKFold(NCV, shuffle=True, random_state=seed)
    grid = GridSearchCV(mod, param_grid, scoring='accuracy', n_jobs=-1, verbose=1,
                        cv=cv)
    
    if scale:
        x = scale.fit_transform(x)
    # not sure what happened but multi-processing was freezing...
    with parallel_backend('threading'):
        grid.fit(x,y)
    
    print("Best Score:", grid.best_score_)
    print(grid.best_params_)
    return grid

def bayes_search(mod, x, y, param_bounds, default_params=None, ninit=10, niter=50):
    default_params = default_params or {}
    
    # infer type of parameters and also cast categorical params to ints
    param_dtypes = dict.fromkeys(list(param_bounds.keys()))
    cat_params_map = {} # for exploring categorical/str parameter options
    for p, val in param_bounds.items():
        # check all same type
        assert all([type(inner) == type(val[0]) for inner in val])
        
        param_dtypes[p] = type(val[0])
        if not isinstance(val[0], (int, float)):
            print(val)
            # for str variables, options should be a list/tuple of possible values
            cat_params_map[p] = {i:v for i,v in enumerate(val)} 
            param_bounds[p] = (0, len(val)-1)
            
    print(param_bounds)   
    
    # Inner function will take numeric params and change them to appropriate dtypes
    # Then initiate and cross_val the estimator
    def mod_func(**params):
        for p,v in params.items():
            if param_dtypes[p] == int:
                params[p] = int(np.round(v,0))
            elif param_dtypes[p] != float: # all non numeric are categorical (includes tuples for mlp)
                params[p] = cat_params_map[p][int(np.round(v,0))]
                
        
        est = mod(**default_params, **params)
        cv = StratifiedKFold(NCV, True, seed)
        acc = np.mean(cross_val_score(est, x, y, scoring='accuracy', cv=cv, n_jobs=-1))
        return acc
    
    # run bayesopt
    start = time.time()
    BO = BayesianOptimization(mod_func, param_bounds, random_state=seed)
    BO.maximize(ninit, niter)
    print('TIME TAKEN (MIN):', (time.time() - start)/60)
    return BO

__Logistic Regression__

In [17]:
lreg_param_grid = {
    'C': np.arange(0.25,1.1,0.25),
    'penalty': ['l1', 'l2']
}

lreg_grid = grid_search(LogisticRegression(), poly_train, ytrain, lreg_param_grid, scale=MinMaxScaler())

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:  1.5min finished


Best Score: 0.7273809523809524
{'C': 1.0, 'penalty': 'l1'}


In [18]:
lreg_results = lreg_grid.best_params_
lreg_results.update({'accuracy': lreg_grid.best_score_})

with open('models/lreg_results.json', 'w') as f:
    json.dump(lreg_results, f)

__LDA__

In [19]:
lda_param_grid = {
    'n_components': [None, 25, 50, 100, 150, 199]
}
lda_grid = grid_search(LinearDiscriminantAnalysis(), poly_train, ytrain, lda_param_grid)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


  return umr_sum(a, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    7.9s finished
  return umr_sum(a, axis, dtype, out, keepdims)
  ret = umr_sum(x, axis, dtype, out, keepdims)


Best Score: 0.7117063492063492
{'n_components': None}




In [20]:
lda_results = lda_grid.best_params_
lda_results.update({'accuracy': lda_grid.best_score_})

with open('models/lda_results.json', 'w') as f:
    json.dump(lda_results, f)

__KNN__

In [24]:
knn_param_grid = {
    'n_neighbors': [2**n for n in range(1,6)],
    'weights' : ['uniform', 'distance']
}

knn_grid = grid_search(KNeighborsClassifier(), poly_train, ytrain, knn_param_grid, scale=MinMaxScaler())

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best Score: 0.8353174603174603
{'n_neighbors': 2, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


In [25]:
knn_results = knn_grid.best_params_
knn_results.update({'accuracy': knn_grid.best_score_})

with open('models/knn_results.json', 'w') as f:
    json.dump(knn_results, f)

__SVM__

In [27]:
svc_param_grid = {
    'C' : [0.25, 0.5, 0.75, 1.],
    'kernel' : ['linear','rbf', 'poly', 'sigmoid']
}
svm_grid = grid_search(SVC(degree=2, max_iter=5000), poly_train, ytrain, svc_param_grid, scale=MinMaxScaler())

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed: 15.1min finished


Best Score: 0.736574074074074
{'C': 1.0, 'kernel': 'linear'}




In [29]:
svm_results = svm_grid.best_params_
svm_results.update({'accuracy': svm_grid.best_score_})

with open('models/svm_results.json', 'w') as f:
    json.dump(svm_results, f)

The next models require a lot of parameter tuning, so switching to Bayesian Optimization

__Random Forest__

In [None]:
rf_bounds = {
    'criterion': ('gini', 'entropy'),
    'n_estimators': (10,150),
    'max_depth': (3,25),
    'max_features': (2,50)
}

rf_opt = bayes_search(RandomForestClassifier, poly_train, ytrain, rf_bounds, ninit=10, niter=100)
rf_opt.res['max']

In [None]:
rf_results = rf_opt.res['max']['max_params']
rf_results.update({rf_opt.res['max']['max_val']})

with open('models/rf_results.json', 'w') as f:
    json.dump(rf_results, f)

__Extra Trees__

In [None]:
et_bounds = {
    'criterion': ('gini', 'entropy'),
    'n_estimators': (10,150),
    'max_depth': (3,25),
    'max_features': (2,50)
}

et_opt = bayes_search(ExtraTreesClassifier, poly_train, ytrain, et_bounds, 
                   default_params={'bootstrap':True}, ninit=10, niter=100)
et_opt.res['max']

In [None]:
et_results = et_opt.res['max']['max_params']
et_results.update({et_opt.res['max']['max_val']})

with open('models/et_results.json', 'w') as f:
    json.dump(et_results, f)

__MLP__

In [None]:
mlp_bounds = {
    'hidden_layer_sizes': [(25, 15, 10), (25, 25), (50,)],
    'activation' : ['logistic', 'tanh', 'relu'],
    'learning_rate_init': (0.0001, 0.1),
    'beta_1': (0.1, 0.9),
    'beta_2': (0.1, 0.999),
    'alpha' : (0.00001, 0.01)
}

mlp_default_args: {
    'random_state': seed, 
}


mlp_opt = bayes_search(MLPClassifier, poly_train, ytrain, mlp_bounds, mlp_default_args, ninit=10, niter=100)
mlp_opt.res['max']

In [None]:
mlp_results = mlp_opt.res['max']['max_params']
mlp_results.update({mlp_opt.res['max']['max_val']})

with open('models/mlp_results.json', 'w') as f:
    json.dump(mlp_results, f)

__LGBM__

In [None]:
lgbm_bounds = {
    'boosting_type': ['gbdt', 'dart'],
    'max_depth': (3,25),
    'learning_rate': (0.001, 0.9),
    'n_estimators': (10,100),
    'min_split_gain': (0.001, 0.1),
    'subsample': (0.01,0.66),
    'colsaple_bytree': (0.01,0.5),
    'reg_alpha': (0.00, 1.00),
    'reg_lambda': (0.00, 1.00),
    
}

lgbm_default_args: {
    'random_state': seed, 
    'objective': 'multiclass'
}


lgbm_opt = bayes_search(LGBMClassifier, poly_train, ytrain, lgbm_bounds, lgbm_default_args, ninit=10, niter=100)
lgbm_opt.res['max']

In [None]:
lgbm_results = lgbm_opt.res['max']['max_params']
lgbm_results.update({lgbm_opt.res['max']['max_val']})

with open('models/lgbm_results.json', 'w') as f:
    json.dump(lgbm_results, f)

__XGBoost__

In [None]:
# NOT IMPLEMENTED
xgb_bounds = {
#     'boosting_type': ['gbdt', 'dart'],
#     'max_depth': (3,25),
#     'learning_rate': (0.001, 0.9),
#     'n_estimators': (10,100),
#     'min_split_gain': (0.001, 0.1),
#     'subsample': (0.01,0.66),
#     'colsaple_bytree': (0.01,0.5),
#     'reg_alpha': (0.00, 1.00),
#     'reg_lambda': (0.00, 1.00),
    
}

xgb_default_args: {
#     'random_state': seed, 
#     'objective': 'multiclass'
}


xgb_opt = bayes_search(LGBMClassifier, poly_train, ytrain, xgb_bounds, xgb_default_args, ninit=10, niter=100)
xgb_opt.res['max']

In [None]:
xgb_results = xgb_opt.res['max']['max_params']
xgb_results.update({xgb_opt.res['max']['max_val']})

with open('models/xgb_results.json', 'w') as f:
    json.dump(xgb_results, f)

In [None]:
# all_results

all_results = [
    lreg_results,
    lda_results,
    knn_results,
    svm_results,
    
    rf_results,
    et_results,
    mlp_results,
    lgbm_results,
    xgb_results
]



### Transform Test Set

### Submit Predictions

In [None]:
test_id = test.Id
xtest = test.iloc[:,1:]

In [None]:
def write_submission(mod, params, xtrain, ytrain, test_id, xtest):

    est = mod(**params)
    est.fit(xtrain,ytrain)
    preds = est.predict(xtest)
    
    
    name = repr(est.base_estimator.__class__).split('.')[-1].strip('>').strip("'")
    est_id = len([s for s in os.listdir('Submissions/') if s.startswith(name)])+1
    fname = "{}{}_{}".format(name, est_id, dt.date.today().strftime('%Y%m%d'))
    
    preds_df = pd.concat([pd.Series(test_id), pd.Series(preds)], axis=1)
    preds_df.to_csv('Submissions/{}.csv'.format(fname), index=False)
    
    # write model params
    with open('Submissions/{}_params.json'.format(fname), 'w') as f:
        json.dump(params, f)
        
    return 
    
    

In [None]:
write_submission(LGBMClassifier, {}, poly_train, ytrain, test_id, xtest)

### Ensembling!