![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# Overview


The study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado. Each observation is a 30m x 30m patch. You are asked to predict an integer classification for the forest cover type. The seven types are:

* 1 - Spruce/Fir
* 2 - Lodgepole Pine
* 3 - Ponderosa Pine
* 4 - Cottonwood/Willow
* 5 - Aspen
* 6 - Douglas-fir
* 7 - Krummholz

See: https://www.kaggle.com/c/forest-cover-type-prediction/data

### Files


* `train.csv` - the training data with the target `Cover_Type` column
* `test.csv` - the test set; you will be predicting the `Cover_Type` for each row in this file (the target integer class)
* `sample_submission.csv` - a sample submission file in the correct format



# Setup

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning

# The second positional argument of `warnings.filterwarnings` is a regular
# expression matched against the warning *message*, not the warning class.
# Passing a class name there silences nothing, so the sklearn warnings are
# selected through `category` instead.
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# pandas' SettingWithCopyWarning is matched on the start of its message text
# (which begins with a newline), avoiding a version-specific pandas import.
warnings.filterwarnings(
    'ignore',
    message=r'\s*A value is trying to be set on a copy of a slice')

In [None]:
import os
import random
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Markdown, Latex

In [None]:
# seaborn
# The seaborn theme is applied first: `sns.set` rewrites the matplotlib
# rcParams (font sizes included), so calling it after `plt.rc` would discard
# the sizes configured below.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

# matplotlib (overrides on top of the seaborn theme)
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
class Cfg:
    """Notebook-wide settings: seed, file locations, sampling sizes, column names."""

    # seed shared by every random operation
    RANDOM_STATE = 2021

    # input files and the name of the submission file that gets written
    TRAIN_DATA = '../input/tabular-playground-series-dec-2021/train.csv'
    TEST_DATA = '../input/tabular-playground-series-dec-2021/test.csv'
    SUBMISSION = '../input/tabular-playground-series-dec-2021/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # fraction held out for validation and fraction of rows drawn when sampling
    TEST_SIZE = 0.6
    SAMPLE_FRAC = 0.3

    # index column and target column of the csv files
    INDEX = 'Id'
    TARGET = 'Cover_Type'

    @staticmethod
    def set_seed():
        """Seed the global `random` and `numpy.random` generators with RANDOM_STATE."""
        for seed_fn in (random.seed, np.random.seed):
            seed_fn(Cfg.RANDOM_STATE)


# make the run reproducible from the start
Cfg.set_seed()

# Read data

In [None]:
def read_data(
    train_file:str=Cfg.TRAIN_DATA,
    test_file:str=Cfg.TEST_DATA
) -> (pd.DataFrame, pd.DataFrame):
    """Load the train and test csv files.

    Each file is read with pandas, indexed by the `Cfg.INDEX` column and
    cast to 32-bit integers.

    Returns the pair `(train_df, test_df)`.
    """
    def _load(path):
        # read -> index by id -> cast every column to int32
        frame = pd.read_csv(path)
        frame = frame.set_index(Cfg.INDEX)
        return frame.astype(np.int32)

    return _load(train_file), _load(test_file)

In [None]:
%%time
# load both csv files into DataFrames indexed by `Cfg.INDEX`
train_data, test_data = read_data()

In [None]:
# show the training frame as the cell output
train_data

In [None]:
# show the test frame as the cell output
test_data

In [None]:
# row counts of both data sets, column count of the training data
n_train_rows, n_test_rows = len(train_data), len(test_data)
n_train_cols = train_data.shape[1]

print('Train data: {} rows'.format(n_train_rows))
print('Test data: {} rows'.format(n_test_rows))

print('Train data: {} columns'.format(n_train_cols))

In [None]:
# numerical feature columns
Cfg.NUM_FEATURES = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# binary feature columns: Wilderness_Area1..4 followed by Soil_Type1..40
Cfg.BINARY_FEATURES = (
    [f'Wilderness_Area{i}' for i in range(1, 5)]
    + [f'Soil_Type{i}' for i in range(1, 41)]
)

# every model input column, numerical ones first
Cfg.FEATURES = [*Cfg.NUM_FEATURES, *Cfg.BINARY_FEATURES]

In [None]:
# number of columns in each feature group
num_count = len(Cfg.NUM_FEATURES)
binary_count = len(Cfg.BINARY_FEATURES)

print(f'Numerical Features: {num_count}')
print(f'Categorical Features: {binary_count}')

### Notice

* The training data contains 4,000,000 rows.

* The test data contains 1,000,000 rows.

* There are 54 features

    * 10 numerical features
    * 44 categorical features (All binary - 1/0).


* The target `Cover_Type` is a multi-class variable (each row has exactly one of the seven cover types).

# Missing values

In [None]:
# total number of missing cells per data set
frames = {'train': train_data, 'test': test_data}

pd.DataFrame({
    'data_set': list(frames),
    'missing_values': [frame.isna().sum().sum() for frame in frames.values()]
}).set_index('data_set')

### Notice

* There are no missing values in either data set.

# Exploratory data analysis (EDA)

In [None]:
def get_sample_data(
    data,
    split_target=True,
    features=Cfg.FEATURES,
    target=Cfg.TARGET,
    frac=Cfg.SAMPLE_FRAC,
    random_state=Cfg.RANDOM_STATE):
    """Draw a reproducible random subset of the rows of `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to sample from.
    split_target : bool
        If True, return the feature columns and the target column separately;
        otherwise return the sampled rows with all of their columns.
    features : list of str
        Feature columns returned when `split_target` is True.
    target : str
        Target column returned when `split_target` is True.
    frac : float
        Fraction of the rows to draw.
    random_state : int
        Seed passed to `DataFrame.sample`.

    Returns
    -------
    (X_data, y_data) when `split_target` is True, else the sampled DataFrame.
    """
    # Use the sampled rows directly. Looking the sampled index *labels* up
    # positionally with `iloc` is only correct when the index happens to be
    # 0..n-1, and the unsplit branch has to return rows of `data`, not of the
    # global training frame.
    sample = data.sample(frac=frac, random_state=random_state)

    if split_target:
        return sample[features], sample[target]

    return sample

## Target variable

In [None]:
def plot_count(
    data:pd.DataFrame,
    feature:str,
    title='Countplot',
    ax=None):
    """Draw a bar chart with the number of rows per value of `feature`.

    Parameters
    ----------
    data : frame that contains the column `feature`
    feature : name of the column to count
    title : title of the plot
    ax : axes to draw on; a new 5x5 figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    # count the frame that was passed in, not the global training data
    sns.countplot(
        data=data,
        x=feature,
        palette='Blues_r',
        ax=ax)

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
# distribution of the target classes in the training data
fig, ax = plt.subplots(figsize=(7, 5))

plot_count(train_data, Cfg.TARGET, ax=ax, title='Target countplot')
plt.show()

In [None]:
# number of rows per target class; group by the configured target column
# instead of repeating its name as a literal
train_data[[Cfg.TARGET]].reset_index().groupby(by=Cfg.TARGET).count()

### Notes

* The target `Cover_Type` has seven different classes.

## Features

## Numerical features

In [None]:
def plot_pdf(
    data:pd.DataFrame,
    feature:str,
    title='Histplot',
    bins=70,
    ax=None):
    """Plot a histogram of `feature`, coloured by target class.

    The bars show row counts (no density estimate is drawn); a dotted red
    vertical line marks the mean of the feature.

    Parameters
    ----------
    data : frame containing `feature` and the `Cfg.TARGET` column
    feature : name of the column to plot
    title : title of the plot
    bins : number of histogram bins
    ax : axes to draw on; a new figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # histogram, one colour per target class
    sns.histplot(
        data=data[[feature, Cfg.TARGET]],
        x=feature,
        hue=Cfg.TARGET,
        bins=bins,
        palette='Blues_r',
        legend=True,
        kde=False,
        ax=ax
    )

    # the x-axis transform lets 0..1 span the full height of the axes
    mean = data[feature].mean()
    ax.vlines(
        mean, 0, 1,
        transform=ax.get_xaxis_transform(),
        color='red', ls=':')

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
def plot_boxplot(
    data:pd.DataFrame,
    feature:str,
    title='Boxplot',
    ax=None):
    """Draw one box of `feature` per target class.

    Parameters
    ----------
    data : frame containing `feature` and the `Cfg.TARGET` column
    feature : name of the column shown on the y-axis
    title : title of the plot
    ax : axes to draw on; a new figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # hand the axes to seaborn explicitly; without `ax=` the plot goes to
    # whatever the current axes happen to be and the argument is ignored
    sns.boxplot(
        x=Cfg.TARGET,
        y=feature,
        palette='Blues_r',
        data=data,
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Target {}'.format(Cfg.TARGET))
    ax.set_ylabel('Feature {}'.format(feature))

    return ax

In [None]:
# summary statistics of the numerical features, plus the variance
stat_data = train_data[Cfg.NUM_FEATURES].describe().drop('count')
stat_data.loc['var'] = stat_data.loc['std'] ** 2

# `color` must be a valid colour name ('Bules' is not one, so no bars showed)
stat_data.T.style.bar(
    subset=['mean'],
    color='steelblue'
).background_gradient(subset=['50%'], cmap='Blues')

In [None]:
X_data = get_sample_data(train_data, split_target=False)
summary_keys = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

for feature in Cfg.NUM_FEATURES:
    display(Markdown('### Feature `{}`'.format(feature)))

    # the summary statistics are computed on the full training data ...
    info = np.round(train_data[feature].describe(), 4)
    summary_md = '\n'.join(
        '* {}: {}'.format(key, info[key]) for key in summary_keys)
    display(Markdown(summary_md))

    # ... while the two plots use the sampled rows
    fig, (pdf_ax, box_ax) = plt.subplots(1, 2, figsize=(13, 5))

    plot_pdf(X_data, feature, ax=pdf_ax)
    plot_boxplot(X_data, feature, ax=box_ax)

    plt.show()

In [None]:
# pairwise scatter plots on a 0.1% sample of the training rows
pairplot_columns = [*Cfg.NUM_FEATURES, Cfg.TARGET]
data = train_data[pairplot_columns].sample(frac=0.001)

grid = sns.pairplot(
    data,
    hue=Cfg.TARGET,
    palette='Blues_r',
    corner=True)

# some slots of the axes grid can be None; skip those
for ax in grid.axes.flat:
    if ax is None:
        continue
    ax.set_xlabel(ax.get_xlabel(), rotation=45)
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
corr_matrix = train_data[Cfg.NUM_FEATURES].corr()

# Boolean mask of the upper triangle (diagonal included). Building it from
# ones rather than from the correlation values keeps a cell hidden even when
# its correlation is exactly 0.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize = (15, 15))
sns.heatmap(
    corr_matrix,
    annot = True,
    cmap = 'Blues_r',
    mask = upper_triangle,
    linewidths = 0.1,
    linecolor = 'white',
    cbar = True
)

plt.tight_layout()
plt.show()

## Binary features

In [None]:
def plot_stacked_bar(
    data:pd.DataFrame,
    feature:str,
    title='Feature by Target',
    ax=None):
    """Draw, per target class, a stacked bar of the row counts of each `feature` value.

    Parameters
    ----------
    data : frame containing `feature` and the `Cfg.TARGET` column
    feature : name of the column whose values are stacked
    title : title of the plot
    ax : axes to draw on; a new 15x5 figure is created when omitted

    Returns the axes that were drawn on. The figure is not shown here; the
    caller decides when to call `plt.show()`, as with the other plot helpers.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(15, 5))

    # rows: target classes, columns: feature values
    pd.crosstab(
        index=data[Cfg.TARGET],
        columns=data[feature]
    ).plot(
        kind="bar",
        color=['steelblue', 'darkblue'],
        stacked=True,
        rot=0,
        ax=ax)

    # set the title before anything is rendered, otherwise it never appears
    ax.set_title(title)
    return ax

In [None]:
# describe the binary columns as categories (cast to object first)
binary_summary = train_data[Cfg.BINARY_FEATURES].astype(object).describe()
binary_summary.drop('count').T

In [None]:
# one stacked bar chart per binary feature
for feature in Cfg.BINARY_FEATURES:
    heading = '### Feature `{}`'.format(feature)
    display(Markdown(heading))

    fig, ax = plt.subplots(figsize=(8, 5))
    plot_stacked_bar(train_data, feature, ax=ax)

    plt.show()

### Note

* The two features `Soil_Type15` and `Soil_Type7` each have only a single value and can be removed without loss of information.

# Feature importance 

In [None]:
def plot_feature_importances(feature_imp, feature_names, num=20, ax=None):
    """Draw a horizontal bar chart of the `num` highest feature importances.

    Parameters
    ----------
    feature_imp : importance value of each feature
    feature_names : feature names, in the same order as `feature_imp`
    num : number of top features to show
    ax : axes to draw on; a new figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # keep only the `num` most important features, highest first
    df = pd.DataFrame({
        'feature': feature_names,
        'value': feature_imp
    }).sort_values('value', ascending=False).head(num)

    sns.barplot(
        x='value',
        y='feature',
        palette='Blues_r',
        data=df,
        ax=ax
    )

    ax.set_title("Importance of each feature")
    ax.set_xlabel("Score")
    ax.set_ylabel("Features")

    return ax

In [None]:
from sklearn.ensemble import RandomForestClassifier

# fit a random forest on a 1% sample and read its feature importances
X_data, y_data = get_sample_data(train_data, frac=0.01)

rf = RandomForestClassifier(random_state=Cfg.RANDOM_STATE).fit(X_data, y_data)
feature_imp = rf.feature_importances_

In [None]:
# bar chart of the 30 most important features
fig, ax = plt.subplots(figsize=(15, 10))
plot_feature_importances(
    feature_imp, feature_names=Cfg.FEATURES, num=30, ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# importance table, highest score first, indexed by feature name
importance_table = pd.DataFrame({
    'feature': Cfg.FEATURES,
    'score': feature_imp
})
importance_table = importance_table.sort_values(by='score', ascending=False)
feature_importance_score = importance_table.set_index('feature')

feature_importance_score.head(15)

# Modeling


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix

In [None]:
def split_data(data, frac=Cfg.SAMPLE_FRAC):
    """Sample `data` and split the sample into train and validation parts.

    Returns `(X_train, X_valid, y_train, y_valid)`; `Cfg.TEST_SIZE` of the
    sampled rows go to the validation part.
    """
    features, target = get_sample_data(data, frac=frac)

    # split data into train and validation data sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        features,
        target,
        test_size=Cfg.TEST_SIZE,
        random_state=Cfg.RANDOM_STATE
    )
    return X_train, X_valid, y_train, y_valid

In [None]:
from sklearn import metrics

def plot_confusion_matrix(y_true, y_pred, ax=None):
    """Draw the confusion matrix, normalised per true class, as a heatmap.

    Parameters
    ----------
    y_true : true class labels
    y_pred : predicted class labels
    ax : axes to draw on; a new 8x8 figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))

    # class labels 1..7
    labels = range(1, 8)
    cm = np.round(confusion_matrix(
        y_true,
        y_pred,
        labels=labels,
        normalize='true'), 2)

    sns.heatmap(
        cm,
        cmap=plt.cm.Blues,
        annot=True,
        xticklabels=labels,
        yticklabels=labels,
        ax=ax)

    # rows of the matrix are the true classes, columns the predictions
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    return ax

In [None]:
def plot_model_proba(proba, ax=None):
    """Plot a histogram of the predicted class probabilities.

    Parameters
    ----------
    proba : predicted probabilities, as returned by `predict_proba`
    ax : axes to draw on; a new 8x5 figure is created when omitted

    Returns the axes that were drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    sns.histplot(
        data=proba,
        palette='Blues_r',
        stat='probability',
        legend=True,
        bins=100,
        kde=False,
        ax=ax
    )

    # axis labels (spelling corrected)
    ax.set_xlabel('Prediction probability')
    ax.set_ylabel('Probability')

    return ax

In [None]:
def plot_result(y_true, y_pred, y_proba):
    """Show the probability histogram and the confusion matrix side by side."""
    fig, (proba_ax, matrix_ax) = plt.subplots(1, 2, figsize=(13, 5))

    # left: predicted probabilities, right: confusion matrix
    plot_model_proba(y_proba, ax=proba_ax)
    plot_confusion_matrix(y_true, y_pred, ax=matrix_ax)

    plt.tight_layout()
    plt.show()

In [None]:
# target encoder
# Fit on the complete training target so every class is known to the encoder.
# The variable `data` is only the small random sample drawn for the pairplot
# and is not guaranteed to contain all classes.
target_encoder = LabelEncoder()
target_encoder.fit(train_data[Cfg.TARGET])

target_encoder.classes_

In [None]:
def model_result(model, data, target_encoder=target_encoder, frac=Cfg.SAMPLE_FRAC):
    """Fit `model` on a sample of `data` and report its validation results.

    The sample is split with `split_data`, the model is trained on the encoded
    training target, and the validation predictions are plotted and printed
    as a classification report together with the split sizes.
    """
    X_train, X_valid, y_train, y_valid = split_data(data, frac=frac)

    # train on the encoded target
    encoded_target = target_encoder.transform(y_train)
    model.fit(X_train, encoded_target)

    # predictions are decoded back to the original class labels
    encoded_pred = model.predict(X_valid)
    y_pred = target_encoder.inverse_transform(encoded_pred)
    y_proba = model.predict_proba(X_valid)

    # plots and text report
    plot_result(y_valid, y_pred, y_proba)

    print(classification_report(y_valid, y_pred))

    # size of both parts of the split
    print(f'train size: {X_train.shape[0]} rows')
    print(f'valid size: {X_valid.shape[0]} rows')

In [None]:
def create_preprocessor():
    """Build the preprocessing step shared by all models.

    The numerical features are standardised; all remaining columns are
    passed through unchanged.
    """
    scaling_step = ('num', make_pipeline(StandardScaler()), Cfg.NUM_FEATURES)

    return ColumnTransformer(
        transformers=[scaling_step],
        remainder='passthrough')

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier

## LogisticRegression

In [None]:
%%time

# seed the stochastic 'saga' solver so re-running the cell gives the same fit
model = LogisticRegression(solver='saga', random_state=Cfg.RANDOM_STATE)
log_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(log_model, train_data)

## LinearDiscriminant

In [None]:
%%time

# linear discriminant analysis behind the shared preprocessing step
preprocessing = create_preprocessor()
model = LinearDiscriminantAnalysis(solver='lsqr')
lda_model = make_pipeline(preprocessing, model)

model_result(lda_model, train_data)

## DecisionTreeClassifier

In [None]:
%%time

# seed the tree so re-running the cell gives the same fit
model = DecisionTreeClassifier(max_depth=20, random_state=Cfg.RANDOM_STATE)
dt_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(dt_model, train_data)

## RandomForest

In [None]:
%%time

# seed the forest so re-running the cell gives the same fit
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=Cfg.RANDOM_STATE)
rf_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(rf_model, train_data)

### AdaBoost

In [None]:
%%time

# seed the booster so re-running the cell gives the same fit
model = AdaBoostClassifier(n_estimators=100, random_state=Cfg.RANDOM_STATE)
ada_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(ada_model, train_data)

## SGDClassifier

In [None]:
%%time

# seed the shuffling of SGD so re-running the cell gives the same fit
model = SGDClassifier(loss='log', random_state=Cfg.RANDOM_STATE)
sgd_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(sgd_model, train_data)

### XGB

In [None]:
%%time

# 'rmse' is a regression metric; a multiclass classifier is evaluated with
# the multiclass log loss instead
model = XGBClassifier(
    n_estimators=100,
    eval_metric='mlogloss',
    random_state=Cfg.RANDOM_STATE)

xgb_model = make_pipeline(
    create_preprocessor(),
    model
)

model_result(xgb_model, train_data)

## LGBMClassifier

In [None]:
%%time

# LightGBM classifier behind the shared preprocessing step
lgbm_params = {
    'learning_rate': 0.05,
    'n_estimators': 100,
    'reg_lambda': 1,
}
model = lgb.LGBMClassifier(**lgbm_params)

lgbm_model = make_pipeline(create_preprocessor(), model)

model_result(lgbm_model, train_data)

## StackingClassifier

In [None]:
# Base estimators shared by the stacking and the voting ensemble. Every
# estimator with a random component is seeded with `Cfg.RANDOM_STATE`, and
# XGBoost is evaluated with the multiclass log loss ('rmse' is a regression
# metric).
estimators = [
    ('dt',  DecisionTreeClassifier(
        max_depth=10,
        random_state=Cfg.RANDOM_STATE)),
    ('log', LogisticRegression(
        solver='saga',
        random_state=Cfg.RANDOM_STATE)),
    ('lda', LinearDiscriminantAnalysis(solver='lsqr')),
    ('rf', RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=Cfg.RANDOM_STATE)),
    ('ada', AdaBoostClassifier(
        n_estimators=100,
        random_state=Cfg.RANDOM_STATE)),
    ('sgd', SGDClassifier(
        loss='log',
        random_state=Cfg.RANDOM_STATE)),
    ('lgbm', lgb.LGBMClassifier(
        learning_rate=0.05,
        n_estimators=150,
        reg_lambda = 1,
        random_state=Cfg.RANDOM_STATE)),
    ('xgb', XGBClassifier(
        n_estimators=100,
        eval_metric='mlogloss',
        random_state=Cfg.RANDOM_STATE))
]

# voting weights, in the same order as `estimators`
weights = [2, 1, 1, 2, 1, 1, 4, 5]

In [None]:
%%time

# stack the base estimators' class probabilities under a logistic regression
stacking_settings = dict(
    estimators=estimators,
    final_estimator=LogisticRegression(solver='saga'),
    cv=3,
    n_jobs=-1,
    stack_method='predict_proba',
    verbose=0,
)
model = StackingClassifier(**stacking_settings)

stacking_model = make_pipeline(create_preprocessor(), model)

model_result(stacking_model, train_data, frac=0.1)

## VotingClassifier

In [None]:
%%time

# weighted soft voting over the base estimators
voting_settings = dict(
    estimators=estimators,
    voting='soft',
    n_jobs=-1,
    weights=weights,
)
model = VotingClassifier(**voting_settings)

voting_model = make_pipeline(create_preprocessor(), model)

model_result(voting_model, train_data, frac=0.1)

### Result

In [None]:
models = {
    'log': log_model,
    'lda': lda_model,
    'dt': dt_model,
    'rf': rf_model,
    'ada': ada_model,
    'sgd': sgd_model,
    'xgb': xgb_model,
    'lgbm': lgbm_model, 
    'stack': stacking_model,
    'voting': voting_model
}

X_train, X_valid, y_train, y_valid = split_data(train_data, frac=0.2)

scores = []
for (k, m) in models.items():
    y_pred = target_encoder.inverse_transform(m.predict(X_valid))
    score = accuracy_score(y_pred, y_valid)
    scores.append(score)


model_result = pd.DataFrame({
    'model': [k for (k, m) in models.items()],
    'accuracy': scores
}).sort_values(by='accuracy', ascending=False).set_index('model') 

model_result

In [None]:
idx = model_result['accuracy'].argmax()
model_name = model_result.iloc[idx].name
best_model = models[model_name]

print(f'Best model: {model_name}')

# Submission

In [None]:
# Select the training feature columns explicitly so the model receives the
# same columns, in the same order, as during fitting.
y_pred_submission = target_encoder.inverse_transform(
    best_model.predict(test_data[Cfg.FEATURES]))

# one predicted class per test row, indexed by the row id
submission_data = pd.DataFrame({
    Cfg.INDEX: test_data.index,
    Cfg.TARGET: y_pred_submission,
}).set_index(Cfg.INDEX)

submission_data

In [None]:
# save submission file: write the predictions (index included) to the csv
# path configured in `Cfg.SUBMISSION_FILE`
submission_data.to_csv(Cfg.SUBMISSION_FILE)

<h4>If you find this notebook useful, support with an upvote.</h4>