This is a tutorial I presented as an ML/DS instructor at GDSC Enet'Com Tunisia.

# (Extremely) Basic Exploration & Processing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use("fivethirtyeight")

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): the relative "../input/..." path presumably targets a Kaggle
# kernel layout -- adjust it when running elsewhere.
data = pd.read_csv("../input/forest-cover-type-prediction/train.csv")

In [None]:
# Preview the first rows to get a feel for the columns.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Remove the "Id" column (presumably just a row identifier) so it is not used
# as a feature.
data = data.drop(columns="Id")

In [None]:
# Separate the target column from the feature columns.
y = data["Cover_Type"]
x = data.drop(columns="Cover_Type")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=23)

In [None]:
# Columns whose name starts with one of these prefixes are treated as
# categorical; every other feature column is treated as numerical.
categorical_prefixes = ("Soil_Type", "Wilderness_Area")

categoricals = [
    name for name in x.columns if name.startswith(categorical_prefixes)
]
numericals = [
    name for name in x.columns if not name.startswith(categorical_prefixes)
]

In [None]:
from sklearn.preprocessing import RobustScaler
# Scaler used below for the numerical columns only.
ss = RobustScaler()

In [None]:
# Training features before scaling.
xtrain

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so no test information leaks
# into the scaler.
ss.fit(xtrain[numericals])
for split in (xtrain, xtest):
    split[numericals] = ss.transform(split[numericals])

In [None]:
# Training features after scaling the numerical columns.
xtrain.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline model: a decision tree with default hyperparameters, seeded so the
# fitted tree is reproducible.
tree = DecisionTreeClassifier(random_state=12).fit(xtrain, ytrain)
tree

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict on both splits so training and testing performance can be compared.
y_pred_tr, y_pred_ts = tree.predict(xtrain), tree.predict(xtest)

In [None]:
# Print one classification report per split; a large gap between the two
# indicates overfitting.
for heading, truth, predicted in (
    ("Training Results:\n", ytrain, y_pred_tr),
    ("\nTesting Results:\n", ytest, y_pred_ts),
):
    print(heading)
    print(classification_report(truth, predicted))

# Cross-Validation

![](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

(Picture from the sklearn documentation)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Plain (non-stratified) 10-fold cross-validation of the tree on the training
# split, scored by accuracy.
#
# An explicit KFold is passed on purpose: with an integer `cv` and a
# classifier, scikit-learn silently uses StratifiedKFold, which would make this
# section a stratified evaluation as well.
from sklearn.model_selection import KFold

cvscore = cross_val_score(
    tree,
    xtrain,
    ytrain,
    cv=KFold(n_splits=10),
    scoring='accuracy',
)

In [None]:
# Accuracy obtained on each of the 10 folds.
cvscore

In [None]:
# Average accuracy across the folds.
cvscore.mean()

In [None]:
# Spread of the per-fold accuracies (standard deviation).
cvscore.std()

# Stratified Cross-Validation

![](https://dataaspirant.com/wp-content/uploads/2020/12/8-Stratified-K-Fold-Cross-Validation.png)

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Stratified 10-fold splitter; rows are shuffled before splitting and the seed
# keeps the folds reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=15)

In [None]:
# Repeat the cross-validation using the explicit stratified splitter.
cvscore = cross_val_score(tree, xtrain, ytrain, cv=skf, scoring='accuracy')

In [None]:
# Accuracy obtained on each stratified fold.
cvscore

In [None]:
# Average accuracy across the stratified folds.
cvscore.mean()

In [None]:
# Spread of the per-fold accuracies (standard deviation).
cvscore.std()

In [None]:
# Bar chart of how many training rows belong to each class.
# The Series is passed as `x=` explicitly: the first positional parameter of
# countplot is `data` on current seaborn releases, so a bare positional Series
# is not interpreted as the variable to count.
sns.countplot(x=ytrain)

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the decision tree: 6 x 4 x 5 = 120 combinations.
hyperparameters = {
    "max_depth": (5, 15, 30, 50, 65, 80),
    "max_features": (0.2, 0.4, 0.6, 0.8),
    "min_samples_split": (2, 5, 8, 10, 20),
}

In [None]:
# Stratified, shuffled 10-fold splitter with a fixed seed, rebuilt here so the
# cell can be run on its own.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=15,
)

# A shorter alternative would be:
#     GridSearchCV(tree, hyperparameters, cv=10, n_jobs=-1)
# The explicit form below pins both the CV splitter and the scoring metric.
opt = GridSearchCV(
    estimator=tree,
    param_grid=hyperparameters,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
)

In [None]:
# Run the exhaustive search: every parameter combination is cross-validated on
# the training split (this can take a while).
opt.fit(xtrain, ytrain)

In [None]:
# Estimator configured with the best parameter combination found.
opt.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best combination.
opt.best_score_

In [None]:
# Parameter values of the best combination.
opt.best_params_

In [None]:
# Full search results as a table, one row per evaluated combination.
pd.DataFrame(opt.cv_results_)

In [None]:
# Look up the result row(s) for one specific parameter combination.
all_results = pd.DataFrame(opt.cv_results_)

depth_matches = all_results["param_max_depth"] == 15
features_match = all_results["param_max_features"] == 0.8
split_matches = all_results["param_min_samples_split"] == 2

all_results[depth_matches & features_match & split_matches]

# Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the randomized search: 6 x 5 x 5 = 150 combinations.
# The order of the values is kept exactly as written.
hyperparameters = {
    "max_depth": (5, 15, 30, 50, 65, 80),
    "max_features": (0.2, 0.1, 0.4, 0.6, 0.8),
    "min_samples_split": (2, 5, 8, 10, 20),
}

In [None]:
# Stratified, shuffled 10-fold splitter with a fixed seed for reproducible folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=15)

In [None]:
# Randomized search: evaluate 50 sampled parameter combinations instead of the
# whole grid, each with stratified 10-fold cross-validation scored by accuracy.
# `random_state` fixes which combinations get sampled, so the search is
# reproducible like the other seeded steps (split, tree, folds).
opt = RandomizedSearchCV(
    estimator=tree,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=skf,
    scoring="accuracy",
    n_jobs=-1,
    random_state=15,
)

In [None]:
# Run the randomized search on the training split.
opt.fit(xtrain, ytrain)

In [None]:
# Estimator configured with the best sampled parameter combination.
opt.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best sampled combination.
opt.best_score_

In [None]:
# Parameter values of the best sampled combination.
opt.best_params_

In [None]:
# Look up the result row(s) for one specific parameter combination.
# NOTE: a randomized search only evaluates a sample of the combinations, so
# this selection is empty whenever that combination was not drawn.
all_results = pd.DataFrame(opt.cv_results_)

depth_matches = all_results["param_max_depth"] == 15
features_match = all_results["param_max_features"] == 0.8
split_matches = all_results["param_min_samples_split"] == 2

all_results[depth_matches & features_match & split_matches]