# Task for Today
***
 Predict the type of forest cover from ecological data (features that reflect human influence are excluded as far as possible).
 Handle the imbalance between the output classes and fit a Logistic Regression model to the data.

## Set Up

In [None]:
!pip install seaborn --upgrade

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')

In [None]:
data

In [None]:
# Indexing the Class Labels from 0 rather than 1
data.Cover_Type = data.Cover_Type - 1

In [None]:
data.Cover_Type.value_counts()

## Visualize the Imbalanced Data: Class Distribution

In [None]:
# Rows per class, computed once and reused for both the slice sizes and labels.
class_counts = data.Cover_Type.value_counts()

# Fetch the Set2 colormap from seaborn and index it with the distinct labels
# to obtain one colour per class.
set2 = sns.color_palette('Set2', as_cmap=True)
cmap = set2(data.Cover_Type.unique())

plt.figure(figsize=(8, 8))
plt.pie(
    class_counts.values,
    labels=class_counts.keys(),
    colors=cmap,
    autopct='%.2f%%',
)
plt.title("Forest Cover Type Distribution")
plt.show()

## Some Helper Functions

In [None]:
def split_and_scale(df):
    """Split a labelled frame into train/test sets and standardize the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Cover_Type`` column (the target). Every other
        column is treated as a feature. The frame is not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features for each split, keeping the original column names and
        the row index of the corresponding split.
    y_train, y_test : pandas.Series
        Target labels for each split.
    """
    # Split df into X and y. Column selection and drop() build new objects,
    # so no extra defensive copies of the (large) frame are needed.
    y = df['Cover_Type']
    X = df.drop('Cover_Type', axis=1)

    # Train/test split: 70% train, fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

    # Fit the scaler on the training split only, then apply it to both splits
    sc = StandardScaler()
    sc.fit(X_train)

    # transform() returns a numpy array -> turn it back into a DataFrame.
    # Passing the original index keeps X_* row-aligned with y_*, which still
    # carry the shuffled index of the input frame.
    X_train = pd.DataFrame(sc.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate_model(model, class_balance, X_test, y_test):
    """Print and plot evaluation results for a fitted model.

    Prints the model's score on the test data as a percentage (labelled with
    ``class_balance``), shows the confusion matrix as a heatmap, and prints
    the classification report. Returns nothing.
    """
    accuracy_pct = model.score(X_test, y_test) * 100
    print("Accuracy ({}): {:.2f}%".format(class_balance, accuracy_pct))

    predictions = model.predict(X_test)

    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Confusion matrix: rows are the actual labels, columns the predictions
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, cmap='Blues', annot=True, fmt='g', cbar=False, vmin=0)
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print('Classification Report: \n ------------------------------- \n', report)

## Training with Imbalanced Data

In [None]:
imbalanced_data = data.copy()

X_train, X_test, y_train, y_test = split_and_scale(imbalanced_data)

In [None]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)

In [None]:
evaluate_model(model1, 'Imbalaced Data', X_test, y_test)

## Training with UnderSampled Data

In [None]:
undersampled_data = data.copy()

In [None]:
undersampled_data.Cover_Type.value_counts()

In [None]:
min_class_size = min(undersampled_data.Cover_Type.value_counts().values)
print("Size of smallest class: ", min_class_size)

In [None]:
# Undersample all the majority classes
class_subsets = [undersampled_data.query('Cover_Type == ' + str(i)) for i in range(7)]

In [None]:
# All Classes will be downsized using below
for i in range(7):
    class_subsets[i] = class_subsets[i].sample(min_class_size, random_state=123)

In [None]:
# Combine all Subsets (row-wise), Shuffle the data using sample fcn, Reset the index and drop the old index
undersampled_data = pd.concat(class_subsets, axis=0).sample(frac=1.0, random_state=123).reset_index(drop=True)

In [None]:
# Display the undersampled frame
undersampled_data

In [None]:
# Per-class row counts after undersampling
undersampled_data.Cover_Type.value_counts()

In [None]:
# Split and scale the undersampled data (this overwrites the X_*/y_* variables
# from the previous section) and fit a new model on it
X_train, X_test, y_train, y_test = split_and_scale(undersampled_data)

model2 = LogisticRegression()
model2.fit(X_train, y_train)

In [None]:
# Print the accuracy, plot the confusion matrix and print the classification report
evaluate_model(model2, 'Undersampled', X_test, y_test)

 ## Training with OverSampled Data

In [None]:
oversampled_data = data.copy()

max_class_size = max(oversampled_data.Cover_Type.value_counts().values)
max_class_size

In [None]:
# Make all subsets of Classes match the oversampled size
class_subsets_ov = [oversampled_data.query('Cover_Type == ' + str(i)) for i in range(7)]

for i in range(7):
    class_subsets_ov[i] = class_subsets_ov[i].sample(max_class_size, replace=True, random_state=123)

In [None]:
oversampled_data = pd.concat(class_subsets_ov, axis = 0).sample(frac=1).reset_index(drop=True)

In [None]:
# Per-class row counts after oversampling
oversampled_data.Cover_Type.value_counts()

In [None]:
# Split and scale the oversampled data (this overwrites the X_*/y_* variables
# from the previous section) and fit a new model on it
X_train, X_test, y_train, y_test = split_and_scale(oversampled_data)

model3 = LogisticRegression()
model3.fit(X_train, y_train)

In [None]:
# Print the accuracy, plot the confusion matrix and print the classification report
evaluate_model(model3, "Oversampled", X_test, y_test)