# Task for Today  

***

## Forest Cover Type Prediction  

Given *data about trees and forests*, let's try to predict the **cover type** of a given forest.  
  
We will use a logistic regression model to make our predictions, but first we have to deal with the imbalanced classes.

# Getting Started

In [None]:
!pip install seaborn --upgrade

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')

In [None]:
data

In [None]:
# Start class labels from 0 rather than 1
data['Cover_Type'] = data['Cover_Type'] - 1

# Imbalanced Data: Class Distribution

In [None]:
data['Cover_Type'].value_counts()

In [None]:
cmap = sns.color_palette('Set2', as_cmap=True)(np.arange(7))

plt.figure(figsize=(8, 8))
plt.pie(
    data['Cover_Type'].value_counts().values,
    colors=cmap,
    labels=data['Cover_Type'].value_counts().keys(),
    autopct='%.1f%%'
)
plt.title("Class Distribution")
plt.show()

# Helper Functions

In [None]:
def split_and_scale(df, train_size=0.7, random_state=123):
    """Split a frame into standardized train/test features and their targets.

    The 'Cover_Type' column is the target; every other column is a feature.
    The scaler is fitted on the training features only and then applied to
    both partitions.

    Args:
        df: DataFrame holding a 'Cover_Type' column plus the feature columns.
        train_size: Forwarded to ``train_test_split`` (default 0.7).
        random_state: Seed forwarded to ``train_test_split`` (default 123).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X frames contain
        the scaled features and carry the same row index as the matching
        y series.
    """
    # Split df into X and y (drop() already returns a new frame)
    y = df['Cover_Type']
    X = df.drop('Cover_Type', axis=1)

    # Train-test-split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Scale X using statistics from the training features only
    scaler = StandardScaler().fit(X_train)

    # transform() returns a bare array; rebuild the frames with the original
    # row index so X and y stay aligned label-wise, not just positionally.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate_model(model, class_balance, X_test, y_test):
    """Print the accuracy, plot the confusion matrix and print the report.

    Args:
        model: Fitted estimator exposing ``score`` and ``predict``.
        class_balance: Label shown in brackets in the accuracy line.
        X_test: Features to evaluate on.
        y_test: True labels for ``X_test``.
    """
    accuracy_pct = model.score(X_test, y_test) * 100
    print(f"Accuracy ({class_balance}): {accuracy_pct:.2f}%")

    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Heatmap of the confusion matrix returned by confusion_matrix
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, fmt='g', vmin=0, cbar=False, cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", report)

# Training (Imbalanced)

In [None]:
# Baseline: keep the class distribution of the data exactly as it is
imbalanced_data = data.copy()

X_train, X_test, y_train, y_test = split_and_scale(imbalanced_data)

In [None]:
# Fit a logistic regression with default settings on the scaled training features
model1 = LogisticRegression()
model1.fit(X_train, y_train)

In [None]:
# Print accuracy, confusion matrix and classification report for the held-out rows
evaluate_model(model1, "Imbalanced", X_test, y_test)

# Training (Undersampling)

In [None]:
undersampled_data = data.copy()

In [None]:
undersampled_data['Cover_Type'].value_counts()

In [None]:
min_class_size = np.min(undersampled_data['Cover_Type'].value_counts().values)

print("Size of smallest class:", min_class_size)

In [None]:
# Undersampling the majority classes
class_subsets = [undersampled_data.query("Cover_Type == " + str(i)) for i in range(7)]

for i in range(7):
    class_subsets[i] = class_subsets[i].sample(min_class_size, replace=False, random_state=123)

undersampled_data = pd.concat(class_subsets, axis=0).sample(frac=1.0, random_state=123).reset_index(drop=True)

In [None]:
# Inspect the resampled, shuffled frame
undersampled_data

In [None]:
# Each class was sampled to min_class_size rows, so the counts should match
undersampled_data['Cover_Type'].value_counts()

In [None]:
# Split and scale the undersampled frame
X_train, X_test, y_train, y_test = split_and_scale(undersampled_data)

In [None]:
# Fit a logistic regression with default settings on the undersampled training features
model2 = LogisticRegression()
model2.fit(X_train, y_train)

In [None]:
# Print accuracy, confusion matrix and classification report for the held-out rows
evaluate_model(model2, "Undersampling", X_test, y_test)

# Training (Oversampling)

In [None]:
oversampled_data = data.copy()

In [None]:
oversampled_data['Cover_Type'].value_counts()

In [None]:
max_class_size = np.max(oversampled_data['Cover_Type'].value_counts().values)

print("Size of largest class:", max_class_size)

In [None]:
# Oversampling the minority classes
class_subsets = [oversampled_data.query("Cover_Type == " + str(i)) for i in range(7)]

for i in range(7):
    class_subsets[i] = class_subsets[i].sample(max_class_size, replace=True, random_state=123)

oversampled_data = pd.concat(class_subsets, axis=0).sample(frac=1.0, random_state=123).reset_index(drop=True)

In [None]:
oversampled_data

In [None]:
oversampled_data['Cover_Type'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = split_and_scale(oversampled_data)

In [None]:
# Fit a logistic regression with default settings on the oversampled training features
model3 = LogisticRegression()
model3.fit(X_train, y_train)

In [None]:
# Print accuracy, confusion matrix and classification report for the held-out rows
evaluate_model(model3, "Oversampling", X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/y70bUYIPe2A