# **Data Overview**

**Elevation** - Elevation in meters

**Aspect** - Aspect in degrees azimuth

**Slope** - Slope in degrees

**Horizontal_Distance_To_Hydrology** - Horz Dist to nearest surface water features

**Vertical_Distance_To_Hydrology** - Vert Dist to nearest surface water features

**Horizontal_Distance_To_Roadways** - Horz Dist to nearest roadway

**Hillshade_9am (0 to 255 index)** - Hillshade index at 9am, summer solstice

**Hillshade_Noon (0 to 255 index)** - Hillshade index at noon, summer solstice

**Hillshade_3pm (0 to 255 index)** - Hillshade index at 3pm, summer solstice

**Horizontal_Distance_To_Fire_Points** - Horz Dist to nearest wildfire ignition points

**Wilderness_Area (4 binary columns, 0 = absence or 1 = presence)** - Wilderness area designation

**Soil_Type (40 binary columns, 0 = absence or 1 = presence)** - Soil Type designation

**Cover_Type (7 types, integers 1 to 7)** - Forest Cover Type designation

# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,log_loss
from sklearn.metrics import plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import svm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import math

warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression



# **Reading Forest Cover Data**

In [None]:
# Load the forest cover dataset and report its dimensions.
# The column count includes the Cover_Type target column.
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')
n_rows, n_cols = data.shape
print("number of datapoints", n_rows)
print("number of features", n_cols)
data.head()

In [None]:
# Summary statistics for each numeric column of the dataset.
data.describe()

# **Check for missing values & variable types**

In [None]:
# Column dtypes and non-null counts, to spot missing values and non-numeric columns.
data.info()

In [None]:
# Show every row that has a missing value in at least one column.
data.loc[data.isna().any(axis="columns")]

In [None]:
# Number of missing values per column.
data.isna().sum()

# **Exploratory Data Analysis**

In [None]:
# Shift the class labels from 1..7 down to 0..6.
# The guard makes the cell idempotent: re-running it in the notebook would
# otherwise subtract 1 again and silently corrupt the labels.
if data['Cover_Type'].min() >= 1:
    data['Cover_Type'] = data['Cover_Type'] - 1
data['Cover_Type'].value_counts()

In [None]:
# Distinct class labels present in the target column.
data['Cover_Type'].unique()

In [None]:
# Bar chart of how many samples fall into each cover type (class balance).
ax = sns.countplot(data=data, x="Cover_Type")

In [None]:
# Correlation of every column with the target label.
data.corr().loc[:, "Cover_Type"]

In [None]:
# Histogram of the class labels with one unit-wide bin centred on each integer
# label. The default 10 bins spread the 7 classes over unevenly spaced bars.
_cover_labels = data["Cover_Type"]
plt.hist(
    _cover_labels,
    bins=np.arange(_cover_labels.min(), _cover_labels.max() + 2) - 0.5,
    rwidth=0.9,
)


In [None]:
# Bar plot of each column's correlation with the target label.
data.corr()["Cover_Type"].plot.bar()


In [None]:
# Build a smaller frame for correlation analysis: the first 14 columns of the
# dataset plus the target label.
df2 = data['Cover_Type']
df1 = data.iloc[:, :14].join(df2)
df1.head()

In [None]:
# Pairwise correlation matrix of the selected columns and the target.
df1.corr()

In [None]:
# Annotated heatmap of the df1 correlation matrix.
# plt.subplots returns a (figure, axes) pair; unpack it and draw on the axes
# explicitly instead of binding the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df1.corr(), vmax=0.5, square=True, annot=True, cmap='Blues', ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

In [None]:
# Side-by-side box plots of Elevation, Aspect and Slope grouped by cover type.
fig,axs=plt.subplots(ncols=3)
sns.boxplot(x='Cover_Type',y='Elevation',data=data,ax=axs[0])# author's note: highest in 1 & 7, lowest in 4 (original 1-7 labels; these appear as 0, 6 and 3 once the label-shift cell has run)
sns.boxplot(x='Cover_Type',y='Aspect',data=data,ax=axs[1])
sns.boxplot(x='Cover_Type',y='Slope',data=data,ax=axs[2])

# **RANDOM FOREST CLASSIFIER WITH HYPERPARAMETER TUNING**

In [None]:
# Separate the target label from the feature columns.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

In [None]:
# Hold out 20% of the data as the test set, then 20% of the remainder as the
# cross-validation set. Both splits are stratified on the label and seeded so
# the partitions (and every score computed from them) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
train_df, cv_df, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

In [None]:
# Grid-search n_estimators x max_depth for a sigmoid-calibrated random forest,
# scoring each combination by log loss on the held-out CV split.
alpha = [100, 200]  # candidate n_estimators values
max_depth = [5, 10]
cv_log_error_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i, "and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
        # CalibratedClassifierCV clones and refits `clf` on its own internal
        # folds, so fitting `clf` separately beforehand is wasted work.
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(train_df, y_train)
        sig_clf_probs = sig_clf.predict_proba(cv_df)
        # `eps` is not passed: it was deprecated and later removed from
        # log_loss, and 1e-15 was the old default anyway.
        cv_loss = log_loss(y_cv, sig_clf_probs, labels=sig_clf.classes_)
        cv_log_error_array.append(cv_loss)
        print("Log Loss :", cv_loss)


# `best_alpha` is a flat index over the (n_estimators, max_depth) grid in
# row-major order; decode it with the real grid width instead of a literal 2.
best_alpha = np.argmin(cv_log_error_array)
best_n_idx, best_depth_idx = divmod(int(best_alpha), len(max_depth))
best_n_estimators = alpha[best_n_idx]
best_max_depth = max_depth[best_depth_idx]

# Refit the calibrated model with the best combination and report log loss on
# the train, CV and test splits.
clf = RandomForestClassifier(n_estimators=best_n_estimators, criterion='gini', max_depth=best_max_depth, random_state=42, n_jobs=-1)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)

predict_y = sig_clf.predict_proba(train_df)
print('For values of best estimator = ', best_n_estimators, "The train log loss is:", log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(cv_df)
print('For values of best estimator = ', best_n_estimators, "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best estimator = ', best_n_estimators, "The test log loss is:", log_loss(y_test, predict_y, labels=sig_clf.classes_))

**Training and Testing the model with best hyperparameter - RF**

In [None]:
# Train the calibrated random forest with the best grid point and predict the
# test set. `best_alpha` is a flat row-major index over (alpha, max_depth), so
# it is decoded with len(max_depth) rather than a hard-coded 2.
best_n_idx, best_depth_idx = divmod(int(best_alpha), len(max_depth))
clf = RandomForestClassifier(n_estimators=alpha[best_n_idx], criterion='gini', max_depth=max_depth[best_depth_idx], random_state=42, n_jobs=-1)
# CalibratedClassifierCV clones and fits `clf` itself; no separate fit needed.
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)
pred_y = sig_clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, pred_y)

In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predictions).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=pred_y)
print(cf_matrix)

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints the integer counts in full;
# the default format abbreviates large counts to scientific notation.
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)

# **Standardization**

In [None]:
# Rebuild the raw feature matrix and target before scaling.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

In [None]:
# Standardize every feature column and keep the original column names.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further below, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(X),columns=X.columns)

# **Principal Component Analysis**

In [None]:
# Project the standardized features onto the first 13 principal components.
# random_state is fixed because, depending on the scikit-learn version,
# svd_solver='auto' may choose a randomized solver for data of this size, which
# would make the components differ between runs.
from sklearn.decomposition import PCA
pca = PCA(n_components=13, random_state=42)
principle = pca.fit_transform(x)


In [None]:
# Wrap the PCA output in a DataFrame with columns pca1..pcaN. The names are
# generated from the actual number of components so this cell keeps working if
# n_components is changed.
pca_columns = ['pca{}'.format(k) for k in range(1, principle.shape[1] + 1)]
x = pd.DataFrame(data=principle, columns=pca_columns)
x.head()

# **Train Test and Cross Validation**

In [None]:
# Split the PCA features into train / CV / test sets (64% / 16% / 20%).
# Both splits are stratified on the label and seeded so the partitions are
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)
train_df, cv_df, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

In [None]:
# Preview the first rows of the PCA feature frame.
x.head()

# **KNearest Neighbors With Hyperparameter Tuning**

In [None]:
# Tune n_neighbors for a sigmoid-calibrated KNN classifier using log loss on
# the held-out CV split.
alpha = [5, 11, 15, 21, 31, 41, 51, 99]  # candidate n_neighbors values
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = KNeighborsClassifier(n_neighbors=i)
    # CalibratedClassifierCV clones and fits `clf` on its own internal folds,
    # so a separate clf.fit beforehand is not needed.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_df, y_train)
    sig_clf_probs = sig_clf.predict_proba(cv_df)
    # `eps` is not passed: it was deprecated and later removed from log_loss,
    # and 1e-15 was the old default anyway.
    cv_loss = log_loss(y_cv, sig_clf_probs, labels=sig_clf.classes_)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# Plot CV log loss against n_neighbors and label every point with its value.
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for k, loss in zip(alpha, cv_log_error_array):
    # Pass a real string as the annotation text rather than a tuple.
    ax.annotate("({}, {:.3f})".format(k, loss), (k, loss))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the n_neighbors that minimised the CV log loss and report the
# log loss on the train, CV and test splits.
best_alpha = np.argmin(cv_log_error_array)
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)

predict_y = sig_clf.predict_proba(train_df)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(cv_df)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=sig_clf.classes_))


**Training and Testing the model with best hyperparameters - KNN**

In [None]:
# Train the calibrated KNN model with the best n_neighbors and predict the
# test set. CalibratedClassifierCV clones and fits `clf` itself, so the
# separate clf.fit call was redundant and has been dropped.
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)
pred_y = sig_clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, pred_y)

In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predictions).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=pred_y)
print(cf_matrix)

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints the integer counts in full;
# the default format abbreviates large counts to scientific notation.
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)

# **Support Vector Machine With Hyperparameter Tuning**

In [None]:
# Tune the regularization strength of a linear SVM (SGDClassifier with hinge
# loss), calibrated with a sigmoid, using log loss on the held-out CV split.
from sklearn.linear_model import SGDClassifier
# NOTE(review): SGDClassifier's `alpha` multiplies the penalty term (larger
# means stronger regularization), which is the opposite sense of SVC's `C`.
# Values of 1-1000 are very strong regularization; consider also trying
# values well below 1.
alpha = [1, 10, 100, 1000]
cv_log_error_array = []
for i in alpha:
    # The tuned parameter is `alpha`, not C, so the message says so.
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='hinge', random_state=42)
    # CalibratedClassifierCV clones and fits `clf` on its own internal folds,
    # so a separate clf.fit beforehand is not needed.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_df, y_train)
    sig_clf_probs = sig_clf.predict_proba(cv_df)
    # `eps` is not passed: it was deprecated and later removed from log_loss,
    # and 1e-15 was the old default anyway.
    cv_loss = log_loss(y_cv, sig_clf_probs, labels=sig_clf.classes_)
    cv_log_error_array.append(cv_loss)
    print("Log Loss :", cv_loss)

# Plot CV log loss against alpha and label every point with its value.
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for a, loss in zip(alpha, cv_log_error_array):
    # Pass a real string as the annotation text rather than a tuple.
    ax.annotate("({}, {:.3f})".format(a, loss), (a, loss))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()


# Refit with the alpha that minimised the CV log loss and report the log loss
# on the train, CV and test splits.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)

predict_y = sig_clf.predict_proba(train_df)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(cv_df)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=sig_clf.classes_))

**Training and testing the model with best hyperparameter - SVM**

In [None]:
# Train the calibrated linear SVM with the best alpha and predict the test
# set. CalibratedClassifierCV clones and fits `clf` itself, so the separate
# clf.fit call was redundant and has been dropped.
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_df, y_train)
pred_y = sig_clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, pred_y)

In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predictions).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=pred_y)
print(cf_matrix)

In [None]:
# Heatmap of the confusion matrix. fmt='d' prints the integer counts in full;
# the default format abbreviates large counts to scientific notation.
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)