In [None]:
# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import copy

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline

In [None]:
# Directory holding the competition CSV files.
# Local environment:
data_path = '../../data/learn-together'

# On Kaggle use this instead:
#data_path = '../input/learn-together'

# Read the training set, the test set and the sample submission file.
df = pd.read_csv('{}/train.csv'.format(data_path))
df_test = pd.read_csv('{}/test.csv'.format(data_path))
df_sample_submission = pd.read_csv('{}/sample_submission.csv'.format(data_path))

In [None]:
# Run switches read by later cells.
generate_output = True  # when True, predict on the test set and write submission.csv
grid_search = True      # when True, run the GridSearchCV hyper-parameter search

In [None]:
# Preview the first five rows of the training data.
df.head(n=5)

In [None]:
# Columns that hold a single distinct value are unlikely to help the model,
# so collect their names as candidates for removal.
unique_counts = df.nunique()
col_singular = unique_counts[unique_counts == 1].index.tolist()
print('Singular columns: {}'.format(col_singular))

In [None]:
# Check if target types are evenly spread.
# value_counts() orders classes by frequency while unique() orders them by first
# appearance, so labels and heights are both taken from one Series (sorted by
# class label) to keep every bar aligned with its own class.
class_counts = df['Cover_Type'].value_counts().sort_index()

# Set the figure size before drawing so that it applies to this plot as well.
plt.rcParams["figure.figsize"] = (5,5)
plt.bar(class_counts.index, class_counts.values, color ='green', width=0.2)
plt.xlabel('cover type')
plt.ylabel('frequency')
plt.show()

# Evenly distributed, that's great

In [None]:
# Split the frame into the feature matrix and the label vector.
target = 'Cover_Type'
features = df.columns.tolist()
features.remove(target)

X, y = df[features], df[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=5
)

In [None]:
# Correlation heatmap, used to look for features that could be dropped.
# Only a subset of the columns is plotted for simplicity (the Soil_Type columns
# are not included); Cover_Type is kept so its row shows feature/target correlation.
corr_columns = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
    'Cover_Type',
]
df_subset = df[corr_columns]

corrmat = df_subset.corr()
top_corr_features = corrmat.index

plt.figure(figsize=(10,10))
# corrmat already covers exactly the columns in top_corr_features, so plot it directly.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [None]:
# Candidates for removal, read off the heatmap. Every combination is checked
# later because Hillshade_3pm still correlates reasonably with Cover_Type.
low_correlation_cols = [
    'Aspect',         # lowest correlation with Cover_Type
    'Hillshade_9am',  # highly correlated with Aspect
    'Hillshade_3pm',  # highly correlated with Aspect
]

In [None]:
# Candidate estimators as (estimator, display name, enabled) tuples.
# Entries whose last element is False are skipped by the evaluation loops.
models = [
    (DecisionTreeClassifier(random_state=5), 'Decision Tree', True),
    (RandomForestClassifier(random_state=5), 'Random Forest', True),
    (XGBClassifier(random_state=5), 'XGBoost', True),
]

In [None]:
# Function to get initial scores list of models
def get_init_scores(features: list, model: list) -> list:
    """Cross-validate every enabled model on the training split.

    Args:
        features: Column names to select from the notebook-level ``X_train``.
        model: List of ``(estimator, name, enabled)`` tuples. Entries whose
            flag is falsy are skipped. The parameter name is singular for
            backward compatibility, but it holds the whole candidate list.

    Returns:
        One ``{'model': name, 'score': mean_accuracy}`` dict per enabled
        entry, in input order. The score is the mean of the 5-fold
        cross-validated accuracy on ``X_train[features]`` / ``y_train``.
    """
    X_func = X_train[features]
    scores = []
    # Iterate over the argument itself. Previously the loop read the
    # notebook-level ``models`` list and silently ignored what was passed in.
    for estimator, name, flag in model:
        if flag:
            score = cross_val_score(estimator, X_func, y_train, cv=5, scoring='accuracy').mean()
            scores.append({'model': name, 'score': score})
    return scores

In [None]:
# Baseline cross-validation scores using every feature column.
initial_scores = get_init_scores(features=features, model=models)
initial_scores

In [None]:
# Cases: different combinations of columns to be dropped
case_A = ['']  # baseline: the empty name matches no column, so nothing is dropped
case_B = col_singular
case_C = col_singular + ['Aspect']
case_D = col_singular + ['Aspect', 'Hillshade_9am']
case_E = col_singular + ['Aspect', 'Hillshade_3pm']
case_F = col_singular + ['Aspect', 'Hillshade_9am', 'Hillshade_3pm']

cases = [case_A, case_B, case_C, case_D, case_E, case_F]

# Set to True to cross-validate every scenario below and print the ranking.
# Off by default, matching the previously commented-out evaluation code.
evaluate_scenarios = False

# One scenario per enabled model and per case: which columns were dropped,
# which features remain, and the estimator to evaluate them with.
scenarios = []
for model, model_name, flag in models:
    if flag:
        for case in cases:
            features_subset = [col for col in features if col not in case]
            scenarios.append({
                'model_name': model_name,
                'columns_dropped': case,
                'features': features_subset,
                'model': model,
            })

if evaluate_scenarios:
    for scenario in scenarios:
        # Score with the scenario's own estimator. The leftover loop variable
        # `model` would be the last enabled model for every scenario, giving
        # identical scores across model names.
        scenario['score'] = cross_val_score(scenario['model'], X_train[scenario['features']],
                                            y_train, cv=5, scoring='accuracy').mean()

    for scenario in sorted(scenarios, key = lambda i: i['score'], reverse=True):
        print('Model: {model}\nColumns dropped: {cols}\nScore: {score:.4f}\n'.format(model=scenario['model_name']
                                                                                     , cols=scenario['columns_dropped']
                                                                                     , score=scenario['score']))

In [None]:
# Saved scenario results (model, dropped columns, cross-validation score),
# best score first. Presumably pasted from an earlier run of:
#   outputs = [{key: scenario[key] for key in keys}
#              for scenario in sorted(scenarios, key = lambda i: i['score'], reverse=True)]
keys = ['model_name', 'columns_dropped', 'score']

_model_names = ('Decision Tree', 'Random Forest', 'XGBoost')
_singular = ['Soil_Type7', 'Soil_Type15']


def _rows(drop_sets, score):
    """Build one record per model and per dropped-column set sharing `score`."""
    return [{'model_name': name, 'columns_dropped': list(cols), 'score': score}
            for name in _model_names
            for cols in drop_sets]


outputs = (
    _rows([_singular + ['Aspect', 'Hillshade_9am']], 0.7638101896377367)
    + _rows([[''], _singular], 0.7626546001058426)
    + _rows([_singular + ['Aspect']], 0.7625706219483699)
    + _rows([_singular + ['Aspect', 'Hillshade_9am', 'Hillshade_3pm']], 0.7614941575593148)
    + _rows([_singular + ['Aspect', 'Hillshade_3pm']], 0.7598421860923201)
)

In [None]:
# The case below had the best results: keep every feature except these columns.
cols_dropped = ['Soil_Type7', 'Soil_Type15', 'Aspect', 'Hillshade_9am']
features_subset = [name for name in features if name not in cols_dropped]

# Narrow the full, training, validation and test frames to the selected columns.
X, X_train, X_val = X[features_subset], X_train[features_subset], X_val[features_subset]
df_test = df_test[features_subset]

In [None]:
# Random forest gave the best initial result. Will tune it further
# Do a grid search for different combinations

grid_search_results = []
model = RandomForestClassifier(criterion='gini',random_state=5)
n_splits = 10
# Seed the shuffle so the folds, and therefore the selected parameters and
# reported score, are the same on every run.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=5)
pipeline = Pipeline(steps=[('model', model)])
# Fallback used when the search is skipped: an untuned copy of the base model.
best_model = copy.deepcopy(model)

if grid_search:
    # Parameter names are prefixed with the pipeline step name ('model__').
    parameters = {}
    parameters['model__n_estimators'] = [100, 150, 200, 400, 600, 800, 1000, 1100]
    #parameters['model__max_depth'] = range(10, 21, 1)
    print(parameters)

    CV = GridSearchCV(pipeline, param_grid=parameters, scoring = 'accuracy', n_jobs=-1, cv=kfold)
    CV.fit(X_train, y_train)

    best_model = CV.best_estimator_
    print('Best score and parameter combination = ')
    print(CV.best_score_)
    print(CV.best_params_)

## **Saved outputs from grid searches**

Format: `parameter = best value [candidate values searched]`.

* max_depth = 20 [19,20,21]
* min_samples_leaf = 1 [1,2,3]
* n_estimators = 200 [100,150,200]
* **Score: 0.87**


* max_depth = 21 [19,20,21]
* min_samples_leaf = 1 [1,2,3]
* n_estimators = 300 [200,250,300]
* **Score: 0.875**

In [None]:
# Test against holdout set
# Fit on the training split only, so the validation rows are unseen when scoring.
best_model.fit(X_train, y_train)
val_preds = best_model.predict(X_val)
score = accuracy_score(y_val, val_preds)
print('Validation accuracy: {:.4f}'.format(score))

# Refit on all labelled rows (training + validation) for the final test predictions.
best_model.fit(X, y)

In [None]:
# Final output
if generate_output:
    # Predict the cover type for every row of the test set.
    preds = best_model.predict(df_test)

    # Save test predictions to file, using the Ids from the sample submission.
    output = pd.DataFrame({'Id': df_sample_submission.Id,
                           'Cover_Type': preds})
    # A bare `output.head()` inside an if-block is not echoed by the notebook,
    # so print the preview explicitly.
    print(output.head())
    output.to_csv('submission.csv', index=False)