In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the training CSV into a DataFrame; every later cell works on `train`
train = pd.read_csv('../input/forest-cover-type-prediction/train.csv')

# display train data
train.head()

In [None]:
# drop ID column
# drop ID column by name rather than by position: `train.iloc[:, 1:]` removed
# one more (real) feature column every time this cell was re-run, whereas a
# second run of this version fails loudly with a KeyError instead.
# NOTE(review): assumes the identifier column is named 'Id' — confirm against train.head()
train = train.drop(columns='Id')
train.head()

In [None]:
# summary statistics (pandas `describe`) of the loaded data, before any filtering
train.describe()

# EDA

In [None]:
def outlier_function(df, col_name, iqr_multiplier=3):
    """Count the values of one column that fall outside an IQR-based fence.

    The fence is ``[Q1 - k * IQR, Q3 + k * IQR]`` where ``k`` is
    ``iqr_multiplier``; values strictly below the lower limit or strictly
    above the upper limit are counted as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    col_name : str
        Name of the column in ``df``.
    iqr_multiplier : float, default 3
        Width of the fence in IQR units. The default reproduces the original
        hard-coded 3 x IQR rule.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit, outlier_count)`` where the count is a
        plain Python ``int``.
    """
    values = df[col_name].to_numpy()

    # both quartiles in a single pass instead of two separate percentile calls
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    IQR = third_quartile - first_quartile

    upper_limit = third_quartile + (iqr_multiplier * IQR)
    lower_limit = first_quartile - (iqr_multiplier * IQR)

    # vectorised comparison replaces the per-value Python loop
    outside_fence = (values < lower_limit) | (values > upper_limit)
    outlier_count = int(outside_fence.sum())

    return lower_limit, upper_limit, outlier_count

Note: As `Wilderness_Area` and `Soil_Type` are one-hot encoded, we can focus on the following:


- There are 53 outliers in Horizontal_Distance_To_Hydrology
- There are 49 outliers in Vertical_Distance_To_Hydrology
- There are 3 outliers in Horizontal_Distance_To_Roadways
- There are 7 outliers in Hillshade_9am
- There are 20 outliers in Hillshade_Noon
- There are 132 outliers in Horizontal_Distance_To_Fire_Points

In [None]:
# report every column that has at least one outlier under the 3 x IQR rule
for column in train.columns:
    # evaluate the column once; the original ran outlier_function twice per column
    outlier_count = outlier_function(train, column)[2]
    if outlier_count > 0:
        print("There are {} outliers in {}".format(outlier_count, column))

In [None]:
# remove outliers from Fire Points with highest range of outliers
# The fence is computed once (the original called outlier_function twice).
# NOTE: re-running this cell recomputes the fence on the already-filtered data
# and may therefore remove additional rows.
fire_limits = outlier_function(train, 'Horizontal_Distance_To_Fire_Points')
fire_lower, fire_upper = fire_limits[0], fire_limits[1]

fire_distance = train['Horizontal_Distance_To_Fire_Points']
# strict inequalities: rows sitting exactly on a limit are dropped as well.
# .copy() makes the result an independent frame, so the feature-engineering
# cell can add columns to it without chained-assignment ambiguity.
train = train[(fire_distance > fire_lower) & (fire_distance < fire_upper)].copy()

In [None]:
# summary statistics again, now that the Fire Points outliers have been filtered out
train.describe()

# Feature Engineering & Selection

The data contains both a horizontal and a vertical distance-to-hydrology feature, which suggests adding the Euclidean distance that combines the two.

In [None]:
# Append the engineered columns in one `assign` call. Every new column is
# derived only from the raw distance / elevation columns:
#   - straight-line distance to hydrology from its horizontal and vertical parts
#   - pairwise means of elevation / distance features
train = train.assign(
    Euclidian_Distance_To_Hydrology=lambda d: (
        d['Horizontal_Distance_To_Hydrology']**2 + d['Vertical_Distance_To_Hydrology']**2
    )**0.5,
    Mean_Elevation_Vertical_Distance_Hydrology=lambda d: (
        d['Elevation'] + d['Vertical_Distance_To_Hydrology']
    )/2,
    Mean_Distance_Hydrology_Firepoints=lambda d: (
        d['Horizontal_Distance_To_Hydrology'] + d['Horizontal_Distance_To_Fire_Points']
    )/2,
    Mean_Distance_Hydrology_Roadways=lambda d: (
        d['Horizontal_Distance_To_Hydrology'] + d['Horizontal_Distance_To_Roadways']
    )/2,
    Mean_Distance_Firepoints_Roadways=lambda d: (
        d['Horizontal_Distance_To_Fire_Points'] + d['Horizontal_Distance_To_Roadways']
    )/2,
)

train

In [None]:
# inspect the dtype of every column, including the engineered ones added above
train.dtypes

# Preprocessing

In [None]:
# create cat, num and y
# create categorical features
# NOTE(review): positions 10-53 are assumed to be the one-hot
# Wilderness_Area* / Soil_Type* columns — confirm with train.columns
X_cat = train.iloc[:,10:54].values

# numerical features: the first ten columns plus the five engineered columns
# that were appended at the end of the frame (positions 55-59)
X_num = train.iloc[:, np.r_[0:10, 55:60]].values

# create y
# Select the target by name. The original `train.iloc[:, -54]` counted from
# the end of a frame that has at least 60 columns, so it picked one of the
# early feature columns instead of the label.
y = train['Cover_Type'].values

In [None]:
# standardize the numerical block: learn the per-column statistics and apply
# them in a single fit_transform step
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

# sanity-check the shapes of the three arrays before they are combined
print(f'Categorical Shape: {X_cat.shape}')
print(f'Numerical Shape: {X_num.shape}')
print(f'Label Shape: {y.shape}')

In [None]:
# place the scaled numerical columns and the categorical columns side by side
# (column-wise concatenation; numerical columns come first)
X = np.concatenate((X_num, X_cat), axis=1)
print(X.shape)

# PCA

In [None]:
from sklearn.decomposition import PCA

# fit PCA with all components kept and plot how much variance the first k
# components explain together, to judge how many components are worth keeping
pca = PCA().fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Number of Components for Cumulative Variance')  # title typo fixed
plt.show()  # render the figure without echoing the Text object's repr

# Dimensionality Reduction

## Extra-Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

etc_model = ExtraTreesClassifier(random_state = 53) # pass the model

# Feed every column except the target to X. The original `train.iloc[:, :-54]`
# dropped the last 54 columns, which on the frame used by the preprocessing
# cell leaves only the first handful of features — so the importances below
# were ranked over a small subset, not over all features.
X = train.drop(columns=['Cover_Type'])
y = train['Cover_Type'] # feed target variable to y

etc_model.fit(X,y) # train the ETC model

# extract feature importances, most important feature first
etc_feature_importances = pd.DataFrame(etc_model.feature_importances_, index=X.columns,
                                      columns=['ETC']).sort_values('ETC', ascending=False)

etc_model = None # remove trace of this ETC model
etc_feature_importances.head(10)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# fit a default random forest (fixed seed) on the X / y built in the Extra-Trees cell
rfc_model = RandomForestClassifier(random_state=53).fit(X, y)

# one-column table of the model's feature_importances_, largest value first
rfc_feature_importances = (
    pd.DataFrame({'RFC': rfc_model.feature_importances_}, index=X.columns)
    .sort_values(by='RFC', ascending=False)
)

rfc_model = None # release the reference to the fitted forest
rfc_feature_importances.head(10)

## AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# fit a default AdaBoost ensemble (fixed seed) on the X / y built in the Extra-Trees cell
adb_model = AdaBoostClassifier(random_state=53).fit(X, y)

# one-column table of the model's feature_importances_, largest value first
adb_feature_importances = (
    pd.DataFrame({'ADB': adb_model.feature_importances_}, index=X.columns)
    .sort_values(by='ADB', ascending=False)
)

adb_model = None # release the reference to the fitted ensemble
adb_feature_importances.head(10)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# fit a default gradient-boosting model (fixed seed) on the X / y built in the Extra-Trees cell
gbc_model = GradientBoostingClassifier(random_state=53).fit(X, y)

# one-column table of the model's feature_importances_, largest value first
gbc_feature_importances = (
    pd.DataFrame({'GBC': gbc_model.feature_importances_}, index=X.columns)
    .sort_values(by='GBC', ascending=False)
)

gbc_model = None # release the reference to the fitted model
gbc_feature_importances.head(10)

# Features Selection

We will add the features we found important, plus the new features we engineered.

In [None]:
# columns kept for modelling: six raw features, the five engineered features,
# and the target (kept last so positional slicing can split it off)
selected_columns = [
    'Elevation',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Aspect',
    'Slope',
    'Euclidian_Distance_To_Hydrology',
    'Mean_Elevation_Vertical_Distance_Hydrology',
    'Mean_Distance_Hydrology_Firepoints',
    'Mean_Distance_Hydrology_Roadways',
    'Mean_Distance_Firepoints_Roadways',
    'Cover_Type',
]
sample = train[selected_columns]

# Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# split the selected columns: everything but the last column is a feature,
# the 'Cover_Type' column is the target
X = sample.iloc[:, :-1]
y = sample['Cover_Type']

# rescale every feature into the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)

In [None]:
# preview the min-max scaled feature matrix produced by the previous cell
X_scaled

# Model Evaluation

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
import time

# function
def model_evaluation(clf, features=None, target=None, cv=10):
    """Fit a classifier once, cross-validate it, and print accuracy / macro-F1.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn style classifier. It is fitted in place on the
        full data to measure the training time.
    features : array-like, optional
        Feature matrix. Defaults to the notebook-level ``X_scaled``.
    target : array-like, optional
        Labels. Defaults to the notebook-level ``y``.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    None
        The mean scores (as percentages) and both timings (in minutes) are
        printed, not returned.
    """
    # fall back to the notebook globals so existing one-argument calls keep working
    features = X_scaled if features is None else features
    target = y if target is None else target

    t_start = time.perf_counter() # record time
    clf.fit(features, target) # classifier learning model
    t_end = time.perf_counter() # record time

    # One cross-validation run scores both metrics on the same fitted folds;
    # the original ran the whole k-fold procedure twice, once per metric.
    c_start = time.perf_counter() # record time
    scores = cross_validate(clf, features, target, cv=cv, scoring=('accuracy', 'f1_macro'))
    c_end = time.perf_counter() # record time

    # calculate mean of all folds' accuracy and f1 as percent
    acc_mean = np.round(scores['test_accuracy'].mean() * 100, 2)
    f1_mean = np.round(scores['test_f1_macro'].mean() * 100, 2)

    t_time = np.round((t_end - t_start) / 60, 3) # time for training
    c_time = np.round((c_end - c_start) / 60, 3) # time for evaluating scores

    print(f'The accuracy score of this classifier is: {acc_mean}%.')
    print(f'The f1 score of this classifier is: {f1_mean}%.')
    print(f'This classifier took {t_time} minutes to train and {c_time} minutes to evaluate CV and metric scores.')

## Benchmark Model: `MultinomialNB Classifier`
We will now see how the `MultinomialNB Classifier` performs on the given training data. It runs quite quickly, but has poor **precision** and **recall**.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# benchmark: cross-validated accuracy / macro-F1 of Multinomial Naive Bayes with default settings
model_evaluation(MultinomialNB())

# Models

## 1. K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# default k-nearest-neighbours classifier; n_jobs=-1 uses all available cores
model_evaluation(KNeighborsClassifier(n_jobs=-1))

## 2. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# default random forest with a fixed seed so the scores are repeatable
model_evaluation(RandomForestClassifier(n_jobs=-1, random_state=53))

## 3. Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# linear classifier trained with stochastic gradient descent, fixed seed
model_evaluation(SGDClassifier(n_jobs=-1, random_state=53))

## 4. Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# default extra-trees ensemble with a fixed seed so the scores are repeatable
model_evaluation(ExtraTreesClassifier(n_jobs=-1, random_state=53))

## 5. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# logistic regression with the 'saga' solver; max_iter is raised to 500 iterations
model_evaluation(LogisticRegression(n_jobs=-1, random_state=53, solver='saga', max_iter = 500))

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# number of trees in the forest algorithm
n_estimators = [50, 100, 300, 500, 1000]

# minimum number of samples required to split an internal node
min_samples_split = [2, 3, 5, 7, 9]

# minimum number of samples required to be at a leaf node
min_samples_leaf = [1, 2, 4, 6, 8]

# number of features to consider when looking for the best split
# 'auto' is intentionally not listed: for classifiers it meant the same as
# 'sqrt', and recent scikit-learn releases reject it, so sampled candidates
# using it would fail (silently here, because warnings are suppressed).
max_features = ['sqrt','log2',None]

# define the grid of hyperparameters to search
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# create model
best_model = ExtraTreesClassifier(random_state=42)

# create randomized search object: 20 sampled parameter settings, each scored
# with 10-fold cross-validated accuracy (200 fits in total, run in parallel)
random_cv = RandomizedSearchCV(estimator=best_model, param_distributions=hyperparameter_grid, cv=10,
                               n_iter=20, scoring='accuracy', n_jobs=-1, verbose=1, return_train_score=True, random_state=0)

# fit on all training data using random search object
random_cv.fit(X_scaled, y)
random_cv.best_estimator_

# Train Final Model

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Hold out a stratified 20% of the rows so the reported scores really come
# from data the final model has not seen. The original fitted and predicted
# on the same rows, which reports training-set performance as "testing set".
# X_scaled is used (not the raw X) to stay consistent with the tuning step.
# NOTE(review): the scaler and the hyperparameter search were fitted on all
# rows, so this hold-out is still mildly optimistic; splitting before scaling
# and tuning would remove that leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# NOTE(review): these settings are presumably the ones reported by random_cv — confirm
clf = ExtraTreesClassifier(n_estimators=1000, random_state=42, max_features='log2') # best classifier
clf = clf.fit(X_train, y_train) # train model on the training split only
predict = clf.predict(X_test) # predict unseen data

# Store the scores under names that do not shadow the imported metric
# functions; the original `f1_score = f1_score(...)` made the cell fail on re-run.
accuracy = np.round(accuracy_score(y_test, predict) * 100, 3)
final_f1 = np.round(f1_score(y_test, predict, average='macro') * 100, 3)

clf = None # clean traces

print(f'The accuracy score of our final model ETC on our testing set is {accuracy}%.')
print(f'The f1 score of our final model ETC on our testing set is {final_f1}%.')