In [108]:
import os
import json
import datetime as dt
import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = lambda x : "{:,.2f}".format(x)
plt.rcParams['figure.figsize'] = (12,8)

## Agenda:
    - Load in train data set with selected engineered features
    - Train a number of different models and evaluate
        - Where possible use grid search
        - For larger parameter spaces use Bayesian Optimization or sequential parameter tuning.
    - Load in Test dataset and transform features so they align with our training dataset
    - Make predictions with best models
    - Ensemble model predictions

### Load In Dataset

In [20]:
# raw train and test
train = pd.read_csv('clean_data/train.csv', dtype={'Id':str})
test = pd.read_csv('clean_data/test.csv', dtype={'Id':str})

ytrain = train['Cover_Type']
train.shape, test.shape

((15120, 54), (565892, 55))

In [11]:
# w engineered features
poly_train = pd.read_csv('clean_data/train_poly_final.csv')
poly_train.shape

(15120, 200)

### Modeling:

    - Logistic Regression
    - LDA
    - KNN
    - SVM
    - Random Forest and ExtraTrees
    - AdaBoost
    - Multi Layer Perceptron
    - XGBoost
    - LightGBM

In [85]:
# no nulls
poly_train.isnull().sum().sum()

0

In [107]:
# Config
seed=1111
NCV=4 

def grid_search(mod, x, y, param_grid, scale=False):
    cv = StratifiedKFold(NCV, shuffle=True, random_state=seed)
    grid = GridSearchCV(mod, param_grid, scoring='accuracy', n_jobs=-1, verbose=1,
                        cv=cv)
    
    if scale:
        x = scale.fit_transform(x)
    
    grid.fit(x,y)
    
    print("Best Score:", grid.best_score_)
    print(grid.best_params_)
    return grid


__Logistic Regression__

In [93]:
lreg_param_grid = {
    'C': np.arange(0.25,1.1,0.25),
    'penalty': ['l1', 'l2']
}

grid = grid_search(LogisticRegression(), poly_train, ytrain, lreg_param_grid)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  3.2min finished


Best Score: 0.7216269841269841


__LDA__

In [None]:
lda_param_grid = {
    'n_components': [None, 25, 50, 100, 150, 199]
}
grid = grid_search(LinearDiscriminantAnalysis(), poly_train, ytrain, lda_param_grid)

__KNN__

In [None]:
knn_param_grid = {
    'n_neighbors': [2**n for n in range(6)],
    'weights' : ['uniform', 'distance']
}

grid = grid_search(KNeighborsClassifier(), poly_train, ytrain, knn_param_grid)

__SVM__

In [None]:
svc_param_grid = {
    'C' = [0.25, 0.5, 0.75, 1.]
    'kernel' = ['linear','rbf', 'poly', 'sigmoid']
}
grid = grid_search(SVC(degree=2, max_iter=5000), poly_train, ytrain, svc_param_grid, scale=MinMaxScaler())

The next models require a lot of parameter tuning, so switching to Bayesian Optimization

__Random Forest__

__Extra Trees__

__MLP__

__LGBM__

__XGBoost__

__Ada Boost__

### Transform Test Set

### Submit Predictions

In [77]:
test_id = test.Id
xtest = test.iloc[:,1:]

In [78]:
def write_submission(mod, params, xtrain, ytrain, test_id, xtest):

    est = mod(**params)
    est.fit(xtrain,ytrain)
    preds = est.predict(xtest)
    
    
    name = repr(est.base_estimator.__class__).split('.')[-1].strip('>').strip("'")
    est_id = len([s for s in os.listdir('Submissions/') if s.startswith(name)])+1
    fname = "{}{}_{}".format(name, est_id, dt.date.today().strftime('%Y%m%d'))
    
    preds_df = pd.concat([pd.Series(test_id), pd.Series(preds)], axis=1)
    preds_df.to_csv('Submissions/{}.csv'.format(fname), index=False)
    
    # write model params
    with open('Submissions/{}_params.json'.format(fname), 'w') as f:
        json.dump(params, f)
        
    return 
    
    

In [79]:
write_submission(LGBMClassifier, {}, poly_train, ytrain, test_id, xtest)

ValueError: Number of features of the model must match the input. Model n_features_ is 200 and input n_features is 52 

### Ensembling!