In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from warnings import filterwarnings as filt
from scipy.stats import skew, norm 

# Silence every warning for the rest of the session (filt is warnings.filterwarnings).
filt('ignore')
# Global plotting defaults: 'fivethirtyeight' style sheet and a 12x6 default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the forest cover type dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/forest-cover-type-dataset/covtype.csv')
df.shape  # (n_rows, n_columns) of the loaded frame

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Class balance of the target: absolute count and share of each cover type.
# value_counts() already returns the counts in descending order. The frame is built
# explicitly because the name pandas gives the counts differs between versions
# ('Cover_Type' before 2.0, 'count' from 2.0 on), which broke sort_values('Cover_Type').
cover_counts = df.Cover_Type.value_counts()
cvtp = pd.DataFrame({'Cover_Type': cover_counts.to_numpy()}, index = cover_counts.index.rename(None))
cvtp['ct_per'] = cvtp.Cover_Type / df.shape[0]
# seaborn >= 0.12 treats the first positional argument as `data`, so pass the column as `x=`.
sns.countplot(x = df.Cover_Type)

In [None]:
# Display the per-class count / share table.
cvtp

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

In [None]:
# Summary statistics for every column.
df.describe()

In [None]:
# Split the columns into three groups: the 40 Soil_Type* columns, the 4 Wilderness_Area*
# columns, and everything else minus the last column.
# NOTE(review): the dropped last column is presumably the Cover_Type target — confirm column order.
soil = df[["Soil_Type%d" % i for i in range(1, 41)]] # types of soils
wilderness = df[["Wilderness_Area%d" % i for i in range(1, 5)]]
cont_feats = df.drop(columns = [*soil.columns, *wilderness.columns]).iloc[:, :-1]

In [None]:
# Value counts of each of the four wilderness-area columns, one panel per column.
fig, ax = plt.subplots(2, 2)
# seaborn >= 0.12 treats the first positional argument as `data`, so the column is passed as `x=`.
# ax.ravel() walks the 2x2 grid row by row, matching Wilderness_Area1..4 in order.
for axis, col in zip(ax.ravel(), wilderness.columns):
    sns.countplot(x = wilderness[col], ax = axis)
# Fit the layout only after the axes have labels and ticks to account for.
fig.tight_layout()

In [None]:
# Skewness of every column, sorted ascending and drawn as a horizontal bar chart.
skewness = pd.Series(skew(df), index = df.columns, name = 'skews').to_frame().sort_values('skews')
plt.figure(figsize = (12,10))
plt.barh(skewness.index, skewness['skews'])

In [None]:
# Frequency table for one soil column, then a distribution plot for another.
# NOTE(review): the counts are for Soil_Type7 but the plot is for Soil_Type15 — confirm this is intended.
print(df['Soil_Type7'].value_counts())
sns.distplot(df['Soil_Type15'])

In [None]:
# Collect the soil-type columns whose value is 1 in fewer than 1000 rows.
low_soil = []
for s in soil.columns:
    # Count the 1s directly: value_counts()[1] raises KeyError when a column contains no 1 at all.
    ones = int((soil[s] == 1).sum())
    if ones < 1000:
        # Scale the share by 100 so the number matches the printed "%" sign.
        print(f"{s} ones count : {ones}    {round(100 * ones / df.shape[0], 4)} %")
        low_soil.append(s)

In [None]:
# One row per column of cont_feats: distribution plot on the left, box plot on the right.
fig, ax = plt.subplots(cont_feats.shape[1], 2, figsize = (18,16))
for row, col in enumerate(cont_feats.columns):
    sns.distplot(df[col], ax = ax[row, 0])
    # seaborn >= 0.12 treats the first positional argument as `data`; `x=` keeps the box horizontal.
    sns.boxplot(x = df[col], ax = ax[row, 1])
# Fit the layout after plotting so the axis labels are taken into account.
fig.tight_layout()

In [None]:
# Pairwise correlation of the cont_feats columns, annotated to one decimal place.
sns.heatmap(cont_feats.corr(), fmt = '.1f', annot = True, cmap = 'icefire')

In [None]:
# Hillshade_3pm against Aspect, coloured by cover type.
sns.scatterplot(data = df, x = 'Hillshade_3pm', y = 'Aspect', hue = 'Cover_Type')

In [None]:
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import mutual_info_classif
from pdpbox.pdp import *
from sklearn.ensemble import RandomForestClassifier
import shap

In [None]:
def sample(x, y, frac = 0.005, random_state = None):
    """Draw a stratified random subsample of a feature/target pair.

    Parameters
    ----------
    x : features, row-aligned with ``y``.
    y : target labels; also used as the stratification key.
    frac : forwarded to ``train_test_split`` as ``test_size``; the "test" part of
        that split is the subsample that is returned.
    random_state : optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_sub, y_sub)`` — the subsampled features and labels.
    """
    # Imported locally so the helper does not rely on a later cell having been run.
    from sklearn.model_selection import train_test_split

    # Only the small "test" side of the split is needed; the large remainder is discarded.
    _, x_sub, _, y_sub = train_test_split(
        x, y, test_size = frac, stratify = y, random_state = random_state
    )
    return x_sub, y_sub

def permImp(x, y, frac = 0):
    """Show eli5 permutation importances for a random forest fitted on ``(x, y)``.

    Parameters
    ----------
    x : feature DataFrame (its column names label the weights).
    y : target labels.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.

    Returns
    -------
    The object returned by ``eli5.show_weights`` for the fitted ``PermutationImportance``.
    """
    if frac > 0:
        # Forward the requested fraction; calling sample(x, y) alone silently used its own default.
        x, y = sample(x, y, frac)
    model = RandomForestClassifier().fit(x, y)
    # Importance is computed on the same data the forest was fitted on.
    perm = PermutationImportance(model).fit(x, y)
    return show_weights(perm, feature_names = x.columns.tolist())

def plot_mi(score):
    """Draw a horizontal bar chart of mutual-information scores.

    ``score`` is a DataFrame with an ``mi_score`` column indexed by feature name.
    Bars are drawn in ascending score order. Returns ``None``.
    """
    ordered = score.sort_values('mi_score')
    plt.barh(ordered.index, ordered['mi_score'])
    plt.title('mutual info clf')

def mi_score(x, y, frac = 0):
    """Compute, plot and return mutual-information scores between features and target.

    Parameters
    ----------
    x : feature DataFrame.
    y : target labels.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.

    Returns
    -------
    pandas.DataFrame
        One ``mi_score`` column indexed by feature name, sorted descending.
    """
    if frac > 0:
        # Forward the requested fraction; calling sample(x, y) alone silently used its own default.
        x, y = sample(x, y, frac)
    # Every feature is treated as continuous (discrete_features = False).
    raw_scores = mutual_info_classif(x, y, discrete_features = False)
    score = pd.DataFrame(raw_scores, columns = ['mi_score'], index = x.columns).sort_values('mi_score', ascending = False)
    plot_mi(score)
    return score

def isolate(x, y, col, frac = 0):
    """Plot the pdpbox partial-dependence curve of a random forest for one feature.

    Parameters
    ----------
    x : feature DataFrame.
    y : target labels.
    col : name of the feature to isolate.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.

    Returns
    -------
    Whatever ``pdp_plot`` returns for the isolated feature.
    """
    if frac > 0:
        # Forward the requested fraction; calling sample(x, y) alone silently used its own default.
        x, y = sample(x, y, frac)
    model = RandomForestClassifier().fit(x, y)
    pdp_dist = pdp_isolate(model, model_features = x.columns, dataset = x, feature = col)
    return pdp_plot(pdp_dist, feature_name = col, ncols = 3)

def interact(x, y, cols, frac = 0):
    """Plot the pdpbox partial-dependence interaction of a random forest for a set of features.

    Parameters
    ----------
    x : feature DataFrame.
    y : target labels.
    cols : list of feature names passed to ``pdp_interact`` as ``features``.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.

    Returns
    -------
    Whatever ``pdp_interact_plot`` returns.
    """
    if frac > 0:
        # Forward the requested fraction; calling sample(x, y) alone silently used its own default.
        x, y = sample(x, y, frac)
    model = RandomForestClassifier().fit(x, y)
    pdp_dist = pdp_interact(model, model_features = x.columns, dataset = x, features = cols)
    return pdp_interact_plot(pdp_dist, feature_names = cols)

def forceplot(x, y, classes = 1, frac = 0):
    """Draw a SHAP force plot for one randomly chosen row and one class.

    Parameters
    ----------
    x : feature DataFrame.
    y : target labels, indexed like ``x``.
    classes : 1-based class number; it is decremented and used as a 0-based index
        into the explainer's ``expected_value`` and ``shap_values`` outputs.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.

    Returns
    -------
    The object returned by ``shap.force_plot``.
    """
    if frac > 0:
        # Forward the requested fraction; calling sample(x, y) alone silently used its own default.
        x, y = sample(x, y, frac)
    classes -= 1
    # Hold one random row out of training; it is the row that gets explained.
    x_samp = x.sample(n = 1)
    x = x.drop(x_samp.index)
    y = y.drop(x_samp.index)
    model = RandomForestClassifier().fit(x, y)
    explainer = shap.TreeExplainer(model)
    # NOTE(review): assumes both outputs are indexable per class in label order — confirm for the installed shap version.
    exp_values = explainer.expected_value[classes]
    shap_values = explainer.shap_values(x_samp)[classes]
    return shap.force_plot(exp_values, shap_values, feature_names = x.columns.tolist() )

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the features from the target, then hold out 5% of the rows
# (stratified by class, fixed seed).
y = df['Cover_Type']
x = df.drop(columns = 'Cover_Type')
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.05, stratify = y, random_state = 123
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

Let's see the important features.

In [None]:
# Number of training rows that a 5% subsample corresponds to.
x_train.shape[0] * 0.05

In [None]:
# Permutation importance of a random forest fitted on a subsample of the training split.
permImp(x_train, y_train, 0.05)

In [None]:
# Tall figure for the per-feature bar chart that mi_score draws via plot_mi.
plt.figure(figsize = (13,15))
# mscore keeps the scores (one row per feature, sorted descending) for later cells.
mscore = mi_score(x_train, y_train, 0.05)

From the permutation-importance and mutual-information plots, it looks like elevation is the most important feature.

In [None]:
# Features whose mutual-information score is exactly zero.
no_info_feats = mscore.loc[mscore['mi_score'] == 0].index
# Print the ones that are also rarely-active soil types.
for ms in no_info_feats:
    if ms in low_soil:
        print(ms)

# Union of both groups: zero-information features and rarely-active soil types.
useless_feats = set(np.concatenate([no_info_feats, low_soil]))
' , '.join(useless_feats)

In [None]:
# Partial-dependence plot for Elevation; the trailing ';' hides the returned object's repr.
isolate(x_train, y_train, 'Elevation',0.05);

From this partial dependence (PD) plot:

* for cover type 1, chances are very high if the elevation is between 3000 and 3250
* for cover types 2 and 7, chances are very low if the elevation is above 3000
* for cover types 3, 4 and 6, chances are low if the elevation is greater than 2000
* for cover type 5, chances are a little higher if the elevation is anywhere between 2000 and 3000

In [None]:
# Partial-dependence interaction between Elevation and distance to roadways; ';' hides the repr.
interact(x_train, y_train, ['Elevation', 'Horizontal_Distance_To_Roadways'], 0.05);

From this plot we can interpret more insights, but let's focus only on cover types 4 and 5, since they are the least frequently occurring cover types:

* cover type 4 occurs at elevations up to ~2100
* cover type 4 is a little distant from the roadways
* cover type 5 trees are usually near roadways
* cover type 5 occurs at elevations up to ~2800

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
# Reminder: the features flagged earlier as zero-mutual-information or rarely-active soil types.
useless_feats

In [None]:
def best_model(x, y, frac = 0, fold = 10):
    """Cross-validate every (model, scaler) combination and tabulate the mean scores.

    Parameters
    ----------
    x : feature DataFrame.
    y : target labels.
    frac : when greater than 0, ``x``/``y`` are first reduced with ``sample(x, y, frac)``.
    fold : number of shuffled ``KFold`` splits used for each combination.

    Returns
    -------
    pandas.DataFrame
        Mean ``f1_micro`` cross-validation score with one row per model
        ('svm', 'knn', 'random forest', 'lgbm') and one column per scaler
        ('none', 'std', 'robust', 'min-max').
    """
    if frac > 0:
        x, y = sample(x, y, frac)

    models = [SVC(), KNeighborsClassifier(), RandomForestClassifier(), LGBMClassifier()]
    mnames = ['svm', 'knn', 'random forest', 'lgbm']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    snames = ['none', 'std', 'robust', 'min-max']
    # One score list per scaler; each list collects one entry per model.
    scores = [[] for _ in scalers]

    total = len(models) * len(scalers)
    print(f'Total iterations : {total}')
    iterr = 0
    for model in models:
        for ind, scaler in enumerate(scalers):
            iterr += 1
            print(f'iteration :===> {iterr} / {total}')
            # Build a separate estimator for each combination. Rebinding `model` to the
            # pipeline here would make the next scaler wrap the previous pipeline, so the
            # scalers would stack (std -> robust -> min-max) instead of being compared.
            if scaler is not None:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            else:
                estimator = model
            kf = KFold(n_splits = fold, shuffle = True)
            score = cross_val_score(estimator, x, y, cv = kf, scoring = 'f1_micro').mean()
            scores[ind].append(score)

    # Rows of `scores` are scalers; transpose so that models become the rows.
    return pd.DataFrame(scores, columns = mnames, index = snames).T

def get_score(xt, yt, xtest, ytest, model, scaler = None, frac = 0):
    """Fit a model and report its train/test performance.

    The model is optionally wrapped in a scaler -> model pipeline and fitted on
    ``(xt, yt)``, optionally subsampled first with ``sample(xt, yt, frac)``.
    Training and testing scores and a classification report are printed, and a
    confusion-matrix heatmap for ``(xtest, ytest)`` is drawn. Returns ``None``.
    """
    if frac > 0:
        xt, yt = sample(xt, yt, frac)
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)]) if scaler else model
    estimator.fit(xt, yt)
    predictions = estimator.predict(xtest)

    # Each `end = '\n\n'` stands in for a print plus a following blank line.
    print(' Reports '.center(70, '='), end = '\n\n')
    print(f"Training acc score : {estimator.score(xt, yt)}")
    print(f"Testing acc score  : {estimator.score(xtest, ytest)}", end = '\n\n')
    print(classification_report(ytest, predictions), end = '\n\n')
    sns.heatmap(confusion_matrix(ytest, predictions), fmt = '.1f', annot = True)

def gridcv(x, y, model, params, scaler = None ,frac = 0, fold = 10):
    """Grid-search ``params`` for ``model`` with shuffled K-fold cross-validation.

    The model is wrapped in a scaler -> model pipeline when ``scaler`` is given,
    and the data is subsampled with ``sample(x, y, frac)`` when ``frac > 0``.
    Scoring is ``f1_micro``.

    Returns ``(best_estimator, best_params, report)``, where ``report`` holds the
    mean train score, mean test score and params of every candidate, sorted by
    mean test score in descending order.
    """
    if frac > 0:
        x, y = sample(x, y, frac)
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)]) if scaler else model
    search = GridSearchCV(
        estimator,
        param_grid = params,
        scoring = 'f1_micro',
        return_train_score = True,
        cv = KFold(n_splits = fold, shuffle = True),
    )
    search.fit(x, y)
    report = pd.DataFrame(search.cv_results_).sort_values('mean_test_score', ascending = False)
    summary_cols = ['mean_train_score', 'mean_test_score', 'params']
    return search.best_estimator_, search.best_params_, report[summary_cols]

def plot_cv(report):
    """Plot mean train and test scores of a grid-search report as two lines.

    ``report`` needs ``mean_train_score`` and ``mean_test_score`` columns; the
    x axis is the row position within ``report``.
    """
    positions = report.reset_index().index
    for column in ('mean_train_score', 'mean_test_score'):
        sns.lineplot(x = positions, y = report[column])
    plt.legend(['training score', 'testing score'])
    plt.title('f1_micro score on training and testing')

In [None]:
# Number of training rows used when a helper is called with frac = 0.05.
x_train.shape[0] * 0.05

In [None]:
%%time
# Baseline: SVC behind a RobustScaler, fitted on a 5% stratified subsample of the training split.
get_score(x_train, y_train, x_test, y_test, SVC(), RobustScaler(), 0.05)

It is doing pretty well; let's try to find the best model.

In [None]:
%%time
# Compare every model/scaler combination with 3-fold CV on a 5% subsample.
best_model(x_train, y_train, 0.05, 3)

Random forest did pretty well at classifying the cover types; it is pretty ironic that a forest model is good at classifying forest covers.

In [None]:
%%time
# Random forest behind a RobustScaler, fitted on the whole training split (frac defaults to 0).
get_score(x_train, y_train, x_test, y_test, RandomForestClassifier(), RobustScaler())

In [None]:
# Hyper-parameter grid for the random forest.
params = dict(
    n_estimators = [50,75,100],
    max_depth = [None, 8, 15],
    class_weight = [None, 'balanced'],
)

# Prefix every key with "model__" so the grid addresses the 'model' step of a pipeline.
pip_params = {"model__" + name: grid for name, grid in params.items()}
pip_params

In [None]:
%%time
# Grid search over pip_params for a RobustScaler -> random forest pipeline on a 5% subsample.
clf, best_param, results = gridcv(x_train, y_train, RandomForestClassifier(), pip_params, RobustScaler(), 0.05)

In [None]:
# Train vs. test score for every grid candidate, ordered from best to worst test score.
plot_cv(results)

In [None]:
# Top row of the sorted report (best mean test score) together with the selected parameters.
results.iloc[0], best_param

In [None]:
%%time
# Refit the best estimator from the grid search on the whole training split and evaluate it.
get_score(x_train, y_train, x_test, y_test, clf)

In [None]:
# Initialise SHAP's JavaScript output before drawing the force plot.
shap.initjs()
# Force plot for class 1 on one random row, using a subsample of the training split.
forceplot(x_train, y_train,1, 0.05)

From this force plot, it looks like some soil types are not really helping the model, so let's try dropping those useless features.

In [None]:
# .loc rejects a set as an indexer in pandas >= 2.0 (deprecated since 1.5), so pass a list.
# Sorting also makes the selection order reproducible between runs.
plot_mi(mscore.loc[sorted(useless_feats)])

In [None]:
%%time
# Same evaluation as before, with the useless_feats columns dropped from both train and test sets.
get_score(x_train.drop(useless_feats, axis = 1), y_train, x_test.drop(useless_feats, axis = 1), y_test, clf)

Wow, that's a nice improvement.