# Table of contents <a id="0"></a>

* [Imports](#imports)
* [Block selection](#block-selection)
* [Define models](#define-models)
* [Identify important features](#identify-important-features)
* [Feature engineering](#feature-engineering)
* [Base estimators grid search](#base-estimators-grid-search)
* [Stacking intro](#stacking-intro)
* [Utility functions](#utility-functions)
* [Determine best combination](#best-combination)
* [Generate output](#generate-output)

# Imports <a id="imports"></a>
[Go back to top](#0)

In [None]:
# Detect where the notebook is running: the local checkout has a
# 'kernel-metadata.json' in the working directory, a Kaggle kernel does not.
import os

# os.path.exists replaces the IPython-only `!ls -a` shell call, which needs an
# `ls` binary and stored its result under a name shadowing the builtin `dir`.
if os.path.exists('kernel-metadata.json'):
    # Local environment
    src = 'Laptop'
    data_path = '../../data/learn-together'
else:
    # Kaggle environment
    src = 'Kaggle'
    data_path = '../input'

print('Environment set to [{env}]'.format(env=src))

In [None]:
# Kaggle exposes the input data read-only under "../input/"; anything written
# to the current directory is kept as kernel output.
# Print the full path of every file found below the Kaggle input directory.
import os
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# System imports
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

# Utilities
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from itertools import combinations

# Block selection <a id="block-selection"></a>
[Go back to top](#0)

In [None]:
# --- Cross-validation sizes ---------------------------------------------
# Folds used by the base-estimator grid search and by layer one of the stack.
grid_search_n_splits = 5
layer_one_folds = 10

# --- Column lists ---------------------------------------------------------
# Columns dropped when `drop_low_correlation_features` is switched on.
low_correlation_features = []
# Columns handed to `process_layer_one` as its `feature_columns` argument.
additional_feature_columns = []

# --- Block switches (1 = run the block, 0 = skip it) ----------------------
get_feature_importances = 0
drop_low_correlation_features = 0
run_grid_search_for_base_estimators = 1
determine_optimal_base_estimators_combination = 0
generate_output = 1

# Define models <a id="define-models"></a>
[Go back to top](#0)

In [None]:
# Level 1 (base) estimators.
# Each entry holds the estimator itself, the hyper-parameter grid to search
# and a switch telling the grid-search cell whether to tune this estimator.
base_models = [
    {
        'model': RandomForestClassifier(random_state=5),
        'parameters': {'n_estimators': [100, 150, 200, 400, 600, 800, 1000, 1100]},
        'grid_search': 1,
    },
    {
        'model': KNeighborsClassifier(),
        'parameters': {'n_neighbors': range(3, 12, 2),
                       'weights': ['uniform', 'distance']},
        'grid_search': 1,
    },
    {
        'model': LogisticRegression(random_state=5),
        'parameters': {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        'grid_search': 1,
    },
]

# Level 2 (stacking) estimator, trained on the base models' predictions
stack_model = RandomForestClassifier(n_estimators=600, random_state=5)

In [None]:
# Load the training, test and sample-submission files from `data_path`
df, df_test, df_sample_submission = (
    pd.read_csv(data_path + file_name)
    for file_name in ('/train.csv', '/test.csv', '/sample_submission.csv')
)

# Identify important features <a id="identify-important-features"></a>
[Go back to top](#0)

In [None]:
# A training column with a single distinct value cannot help the models;
# collect those columns in their original order.
unique_counts = df.nunique()
col_singular = unique_counts[unique_counts == 1].index.tolist()
print('Singular columns: {}'.format(col_singular))

# Remove them from the training and the test frame alike
for frame in (df, df_test):
    frame.drop(columns=col_singular, inplace=True)

In [None]:
if get_feature_importances:
    # Every column except the target is treated as a feature
    target = 'Cover_Type'
    features = list(df.columns)
    features.remove(target)

    X, y = df[features], df[target]

    # Score each feature against the target with SelectKBest's default scorer
    fit = SelectKBest(k=10).fit(X, y)

    # One row per feature: its name and its score; show the 20 best
    featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
    print(featureScores.nlargest(20, 'Score'))

In [None]:
if get_feature_importances:
    # Uses the X and y built by the previous cell under the same switch
    model = ExtraTreesClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    print(importances)

    # Horizontal bar chart of the ten largest importances
    feat_importances = pd.Series(importances, index=X.columns)
    top_ten = feat_importances.nlargest(10)
    top_ten.plot(kind='barh')
    plt.show()

In [None]:
# Get correlation to see if dimensionality can be reduced.
# For simplicity only the continuous features, the Wilderness_Area indicator
# columns and the target are considered; the remaining columns of df are left out.
heatmap_columns = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
    'Wilderness_Area4', 'Cover_Type',
]
df_subset = df[heatmap_columns]

corrmat = df_subset.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(10,10))
# corrmat already covers exactly the columns in top_corr_features, so it is
# plotted directly instead of recomputing the same correlation matrix.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

# Feature engineering <a id="feature-engineering"></a>
[Go back to top](#0)

In [None]:
# Optionally remove the configured low-correlation columns from both frames
if drop_low_correlation_features:
    for frame in (df, df_test):
        frame.drop(columns=low_correlation_features, inplace=True)

In [None]:
# Every column except the target is used as a feature
target = 'Cover_Type'
features = list(df.columns)
features.remove(target)

X, y = df[features], df[target]

# Hold back 20% of the rows for validation; the split is seeded
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=5,
)

# Base estimators grid search <a id="base-estimators-grid-search"></a>
[Go back to top](#0)

In [None]:
# Do grid search on each base model.
# For every entry of base_models flagged with 'grid_search', the best estimator
# found by GridSearchCV is stored under 'best_model'; entries without one fall
# back to their original estimator. optimum_base_models collects the result.
optimum_base_models = []
if run_grid_search_for_base_estimators:
    for model in base_models:
        if not model['grid_search']:
            continue
        print('Model: {model_name}'.format(model_name=model['model'].__class__.__name__))
        print('Optimizing parameters: [{params}]'.format(params=model['parameters']))
        # Seeded like the estimators in this notebook: with shuffle=True and no
        # random_state the folds, and with them the selected parameters,
        # would change from run to run.
        kfold = KFold(n_splits=grid_search_n_splits, shuffle=True, random_state=5)
        CV = GridSearchCV(model['model'],
                          param_grid=model['parameters'],
                          scoring='accuracy',
                          n_jobs=-1,
                          cv=kfold)
        CV.fit(X_train, y_train)
        model['best_model'] = CV.best_estimator_
        print('Best score and parameter combination = ')
        print(CV.best_score_)
        print(CV.best_params_)

for model in base_models:
    # Estimators that were not tuned are used as originally defined
    model.setdefault('best_model', model['model'])
    optimum_base_models.append(model['best_model'])

In [None]:
# Validation accuracy of each base model as originally defined
print('Initial scores: ')
for entry in base_models:
    estimator = entry['model']
    estimator.fit(X_train, y_train)
    acc = accuracy_score(y_val, estimator.predict(X_val))
    print('{} Accuracy: {:.2f}%'.format(estimator.__class__.__name__, acc * 100))

# Validation accuracy of the estimators collected by the grid-search cell
if run_grid_search_for_base_estimators:
    print('\nAfter grid search: \n')
    for estimator in optimum_base_models:
        acc = accuracy_score(y_val, estimator.predict(X_val))
        print('{} Accuracy: {:.2f}%'.format(estimator.__class__.__name__, acc * 100))

# Stacking intro <a id="stacking-intro"></a>
[Go back to top](#0)

## Layer 1
1. Init
  - Separate df (input) into features (X) and target (y)
2. Layer 1, loop across folds
  - Separate X and y into training and validation sets. Keep validation set aside and ignore for now
  - For the first base model, split the training data into n folds
  - Fit the base model on X_train, y_train from n-1 folds, and predict on the remaining nth fold. 
  - Add the predictions into a meta series for the base model
3. Repeat (2) n times, each time with a new nth fold, so that the full training data is covered. This yields predictions for all n folds (the full training data) in the meta series (same number of rows as X_train)
4. Loop across models
  - Repeat (1), (2) and (3) for all base models
  - Combine meta series for each base model into meta df - one column per base model, number of rows same as original training data. 
5. Optionally add an original feature from the training data into the meta dataframe
6. This dataframe is feature data for next stage

## Layer 2
7. Fit the stacking model on the meta Dataframe (output of (6)) and y_train. This is our stacked model.

## Validation
8. Fit first base model on the full input data (X, y). Predict on X_val, this will generate a meta series
9. Repeat (8) across each base model, combine output series from each base model to make meta dataframe for next layer
10. Add the same feature column (as in (5)), from X_val into the meta dataframe from (9). This is source for next layer
11. Predict using stacked model (output of (7)) on the output of (10)
12. Compare prediction from (11) with y_val to get score

## Testing
13. Repeat steps (8), (9), (10), (11), this time using df_test in place of X_val

# Utility functions <a id="utility-functions"></a>
[Go back to top](#0)

In [None]:
def fit_base_models(model_list_in, X_in, y_in):
    """Fit deep copies of the given models and return the fitted copies.

    Parameters:
        model_list_in: list of estimators exposing ``fit(X, y)``.
        X_in: feature data passed to every ``fit`` call.
        y_in: target data passed to every ``fit`` call.

    Returns:
        A new list with one fitted copy per input model, in input order.
        The models, features and targets passed in are deep-copied first,
        so the caller's objects are not the ones being fitted.
    """
    features = copy.deepcopy(X_in)
    targets = copy.deepcopy(y_in)
    fitted = []
    for estimator in copy.deepcopy(model_list_in):
        estimator.fit(features, targets)
        fitted.append(estimator)
    return fitted

In [None]:
def get_meta_preds(fitted_model_list_in, X_in):
    """Collect the predictions of every fitted model into one DataFrame.

    Parameters:
        fitted_model_list_in: list of fitted estimators exposing ``predict(X)``.
        X_in: DataFrame of features to predict on.

    Returns:
        DataFrame indexed like ``X_in`` with one column per model, labelled
        with the model's position in the list (0, 1, ...). An empty list
        yields an empty DataFrame.
    """
    meta_df = pd.DataFrame()
    # enumerate gives each model its own column even if the same object is
    # listed twice; looking the position up with list.index() would map both
    # occurrences to the first one and rescan the list for every model.
    # The inputs are only read here, so they are not deep-copied (copying
    # fitted ensembles is expensive).
    for position, model in enumerate(fitted_model_list_in):
        meta_df[position] = pd.Series(model.predict(X_in), index=X_in.index)
    return meta_df

In [None]:
def process_layer_one(base_model_list_in, stack_model_in, X_in, y_in, feature_columns):
    """Train the stacking model on out-of-fold predictions of the base models.

    The data is split into ``layer_one_folds`` folds. For every fold the base
    models are fitted on the other folds and predict the held-out fold, so each
    row of ``X_in`` receives one prediction per base model. Those predictions
    form the meta features on which a copy of the stacking model is fitted.

    Parameters:
        base_model_list_in: list of base estimators (``fit`` / ``predict``).
        stack_model_in: estimator to fit on the meta features; a deep copy is
            fitted and returned, the object passed in is left untouched.
        X_in: DataFrame with the training features.
        y_in: Series with the training targets, row-aligned with ``X_in``.
        feature_columns: currently unused; kept so existing callers keep working.

    Returns:
        The fitted copy of the stacking model.
    """
    X_local = copy.deepcopy(X_in)
    y_local = copy.deepcopy(y_in)
    model_list = copy.deepcopy(base_model_list_in)
    stack_model_local = copy.deepcopy(stack_model_in)

    # Seeded like the estimators in this notebook so the folds are reproducible
    kfold = KFold(n_splits=layer_one_folds, shuffle=True, random_state=5)

    fold_metas = []
    hold_out_positions = []
    # Loop across folds
    for train_idx, hold_out_idx in kfold.split(X_local):
        X_fold = X_local.iloc[train_idx]
        y_fold = y_local.iloc[train_idx]
        X_hold_out_fold = X_local.iloc[hold_out_idx]

        train_fold_fitted_models = fit_base_models(model_list, X_fold, y_fold)
        fold_metas.append(get_meta_preds(train_fold_fitted_models, X_hold_out_fold))
        hold_out_positions.append(hold_out_idx)

    # Combine into meta df - one column per base model, number of rows same as
    # original training data.
    meta_df = pd.concat(fold_metas)

    # The folds are shuffled, so the concatenated rows come in fold order, not
    # in the order of X_local. The estimator's fit pairs rows with targets by
    # position, hence the rows are put back into their original order first;
    # otherwise the stacking model is trained on mismatched targets.
    original_order = np.argsort(np.concatenate(hold_out_positions))
    meta_df = meta_df.iloc[original_order]

    stack_model_local.fit(meta_df, y_local)
    return stack_model_local
    

# Determine best combination of base models <a id="best-combination"></a>
[Go back to top](#0)

In [None]:
# Score a stacked model for every non-empty combination of the tuned base
# models on the validation set and print the combinations, best score first.
if determine_optimal_base_estimators_combination:
    # Must exist before the += below extends it
    model_list_combinations = []
    for i in range(1, len(optimum_base_models) + 1):
        model_list_combinations += [list(m) for m in combinations(optimum_base_models, i)]

    scores = []
    for model_list in model_list_combinations:
        # Stacking model trained on out-of-fold predictions of this combination
        candidate_stack_model = process_layer_one(model_list, stack_model,
                                                  X_train, y_train,
                                                  additional_feature_columns)

        # The stacking model expects base-model predictions, not raw features:
        # fit the combination on the training split and predict the validation
        # split to build its input.
        fitted_candidates = fit_base_models(model_list, X_train, y_train)
        val_meta = get_meta_preds(fitted_candidates, X_val)
        y_val_pred = candidate_stack_model.predict(val_meta)
        score = accuracy_score(y_val, y_val_pred)

        comparison = {'score': score,
                      'model_list': model_list,
                      'layer_two_model': candidate_stack_model,
                      'feature_columns': additional_feature_columns,
                      'total_columns': val_meta.shape[1]}
        scores.append(comparison)

    for model_score in sorted(scores, key=lambda x: x['score'], reverse=True):
        models = [model.__class__.__name__ for model in model_score['model_list']]
        print('Models: \n{models}\nScore: {score}\nTotal Columns: {columns}\n'.format(models=models, score=model_score['score'], columns=model_score['total_columns']))


In [None]:
# Train the stacking model on the training split using the collected base models
fitted_stack_model = process_layer_one(
    base_model_list_in=optimum_base_models,
    stack_model_in=stack_model,
    X_in=X_train,
    y_in=y_train,
    feature_columns=additional_feature_columns,
)

# Generate output <a id="generate-output"></a>
[Go back to top](#0)

In [None]:
# Fit the base models on the full training data (X, y), turn their predictions
# for the test set into meta features and let the stacking model classify them.
if generate_output:
    fitted_base_models = fit_base_models(optimum_base_models, X, y)
    test_meta = get_meta_preds(fitted_base_models, df_test)
    test_pred = fitted_stack_model.predict(test_meta)

    # Final output: Ids from the sample submission next to the predicted
    # classes, written to submission.csv
    submission_columns = {'Id': df_sample_submission['Id'],
                          'Cover_Type': test_pred}
    output = pd.DataFrame(submission_columns)
    output.to_csv('submission.csv', index=False)