# Description

For this competition, you will be predicting a categorical target based on a number of feature columns given in the data. The data is synthetically generated by a GAN that was trained on the data from the [Forest Cover Type Prediction](https://www.kaggle.com/c/forest-cover-type-prediction/overview) competition. This dataset is (a) much larger, and (b) may or may not have the same relationship to the target as the original data.

Please refer to this [data page](https://www.kaggle.com/c/forest-cover-type-prediction/data) for a detailed explanation of the features.

The target is `Cover_Type`

# Data overview

In [None]:
import datatable as dt # for quicker loading of dataframes
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Parse the training CSV with datatable, then convert the result to a pandas DataFrame.
train_csv_path = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
train_df = dt.fread(train_csv_path).to_pandas()

In [None]:
# Print the column names, dtypes and memory footprint of the loaded frame.
train_df.info()

In [None]:
# Display the frame (the notebook renders the last expression of a cell).
train_df

Id column is not needed, so it will be dropped

In [None]:
# Remove the 'Id' column from the frame in place.
train_df.drop(['Id'], axis='columns', inplace=True)

Overview of data

In [None]:
# Summary statistics, one row per column, rendered as a styled table.
summary_stats = train_df.describe().T
styled_summary = summary_stats.style.background_gradient(cmap='Blues')
# Overlay in-cell bars on the mean, std and max columns, each in its own colour.
styled_summary = styled_summary.bar(subset=["mean"], color='lightgreen')
styled_summary = styled_summary.bar(subset=["std"], color='#ee1f5f')
styled_summary = styled_summary.bar(subset=["max"], color='#FFA07A')
styled_summary

In [None]:
# Box plot of every int32 column, drawn on a wide figure.
plt.figure(figsize=(20, 10))
# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)
# Only columns whose dtype is int32 are selected for the plot.
ax = sns.boxplot(data=train_df.select_dtypes(include=['int32']))

Check for null/missing values

In [None]:
# Count the columns that contain at least one missing value.
columns_with_missing = train_df.isnull().any(axis=0)
columns_with_missing.sum()

Check for duplicates

In [None]:
# Number of rows that exactly repeat an earlier row (0 means no duplicates).
# `duplicated()` already yields one boolean per row, so summing it gives the
# count directly; reducing with `.any()` first would only leave a 0/1 flag.
train_df.duplicated().sum()

Target balance

In [None]:
# Bar chart of how many rows fall into each Cover_Type class.
sns.countplot(x=train_df['Cover_Type']);

In [None]:
# Row count per Cover_Type class, most frequent first.
train_df['Cover_Type'].value_counts()

**Target is not well balanced**

# Data correlations and distributions

In [None]:
# Draw a reproducible random subset of 100,000 rows (fixed seed).
train_sample = train_df.sample(100000, random_state=42)
train_sample.shape

In [None]:
# Pairwise Pearson correlation between all columns of the sample.
corr_all = train_sample.corr(method='pearson')

In [None]:
sns.set(style="white", font_scale=1)
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# that half only repeats the lower one.
# The builtin `bool` is used as dtype because the `np.bool` alias has been
# removed from NumPy and raises AttributeError on current releases.
mask = np.zeros_like(corr_all, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(24, 18))
f.suptitle("Correlation Matrix", fontsize = 10)
cmap = sns.diverging_palette(220, 10, as_cmap=True) # Generate a custom diverging colormap
# center=0 puts the neutral colour of the diverging palette at zero correlation.
sns.heatmap(corr_all, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

Some observations:
* Soil types 7 and 15 are zero for every sample, so they carry no information
* Wilderness Area 1 and 3 are negatively correlated

In [None]:
# Column totals over the full training set for Soil_Type7 and Soil_Type15.
train_df['Soil_Type7'].sum(), train_df['Soil_Type15'].sum()

In [None]:
# Remove the two soil-type columns from the frame in place and show the new shape.
soil_columns_to_drop = ['Soil_Type7', 'Soil_Type15']
train_df.drop(columns=soil_columns_to_drop, inplace=True)
train_df.shape

In [None]:
# Re-draw the 100,000-row subset (same seed) from the current state of train_df.
train_sample = train_df.sample(100000, random_state=42)
train_sample.shape

In [None]:
# Correlation of every column in the sample with the target, shown as a bar chart.
target_correlation = train_sample.corrwith(train_sample['Cover_Type'])
target_correlation.plot.bar(
    figsize=(20, 5),
    title='Correlation with Target Variable',
    fontsize=10,
    rot=90,
    grid=True,
);

Data distribution (on a subset of 100k samples)

In [None]:
# One density plot per subplot for the leading columns of the sample.
fig, axes = plt.subplots(5, 2, figsize=(10, 7))
axes = axes.flatten()
# zip stops when the axes run out, so only the first len(axes) columns are drawn.
for ax, column_name in zip(axes, train_sample.columns):
    sns.kdeplot(data=train_sample, x=column_name, fill=True, ax=ax)
    # Strip ticks and axis labels; the column name is shown as the title instead.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(column_name, loc='right', fontsize=12)
fig.tight_layout()
plt.show()

Data distribution considering the target:

In [None]:
# Same density grid as before, with one curve per Cover_Type class.
fig, axes = plt.subplots(5, 2, figsize=(20, 15))
axes = axes.flatten()
for position, (ax, column_name) in enumerate(zip(axes, train_sample.columns)):
    # Only the first subplot carries the legend.
    show_legend = position == 0
    sns.kdeplot(data=train_sample, x=column_name, fill=True, ax=ax,
                hue='Cover_Type', legend=show_legend)
    # Strip ticks and axis labels; the column name is shown as the title instead.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(column_name, loc='right', fontsize=12)
fig.tight_layout()
plt.show()

## Analysis for boolean values

In [None]:
# Keep only the columns whose dtype is bool.
boolean_df = train_df.select_dtypes(include='bool')

In [None]:
# Number of True values in each boolean column.
boolean_sum = boolean_df.sum(axis=0)
boolean_sum

In [None]:
# Boolean feature columns with the target column attached (joined on the row index).
target_column = train_df['Cover_Type']
booleans_with_target = boolean_df.join(target_column)
booleans_with_target

Group each boolean column with the cover type to calculate the percentage representation for each one

In [None]:
# Row count for every (Wilderness_Area1 value, Cover_Type) combination.
wa_1_groups = booleans_with_target.groupby(['Wilderness_Area1', 'Cover_Type'])
group_wa_1 = wa_1_groups['Cover_Type'].count()
group_wa_1

In [None]:
# Move the Wilderness_Area1 index level into a column; Cover_Type stays as the index.
group_wa_1_df = group_wa_1.reset_index(level='Wilderness_Area1')
group_wa_1_df

In [None]:
# Total number of rows per Cover_Type; displayed ordered by class label.
cover_counts_by_type = train_df['Cover_Type'].value_counts()
cover_counts_by_type.sort_index()

In [None]:
def apply_cover_count_to_row(row):
    """Return the row's group count as a fraction of its cover type's total.

    `row` is one row of a grouped-count table: `row.name` is the cover type
    and the second entry of the row is the number of samples in that group.
    The total for the cover type is looked up in the module-level
    `cover_counts_by_type` Series.
    """
    # `.iloc` makes the positional access explicit; `row[1]` on a row indexed by
    # column labels relies on a deprecated positional fallback.
    return row.iloc[1] / cover_counts_by_type[row.name]

# Apply the helper to every row of the grouped table (axis=1 passes one row at a time).
group_wa_1_df.apply(apply_cover_count_to_row, axis=1)

We can observe that, for each cover type, the fractions of the true/false pair add up to 1, e.g. 0.701415 + 0.298585

In [None]:
def apply_cover_count_percent(row):
    """Return one minus the row's group count divided by its cover type's total.

    `row` is one row of a grouped-count table: `row.name` is the cover type
    and the second entry of the row is the number of samples in that group.
    The total for the cover type is looked up in the module-level
    `cover_counts_by_type` Series. Applied to the rows where the feature is
    False, the result is the share of the cover type where it is True.
    """
    # `.iloc` makes the positional access explicit; `row[1]` on a row indexed by
    # column labels relies on a deprecated positional fallback.
    return 1 - (row.iloc[1] / cover_counts_by_type[row.name])

# Keep the rows where Wilderness_Area1 is False and convert each count into
# the complementary share of its cover type.
wa_1_is_false = group_wa_1_df['Wilderness_Area1'].eq(False)
group_wa_1_df.loc[wa_1_is_false].apply(apply_cover_count_percent, axis=1)

This is interpreted as follows: Wilderness Area 1 is present in 29.85% of Cover Type 1, 26.67% of Cover Type 2, and so on

Applying it to all boolean columns, we get a full table with these percentages

In [None]:
# Start from an empty table that has one column per boolean feature and no rows.
percents_df = pd.DataFrame(columns=boolean_df.columns)
percents_df

In [None]:
# Share of each cover type's rows in which every boolean feature is True.
# The mean of a boolean column is exactly the fraction of True values, so a
# single grouped mean yields the whole table (rows: cover types, columns:
# boolean features).
# Deriving the share as "1 - False count / total" per column would leave a
# missing entry wherever a cover type has no False rows for a feature, even
# though the share there is 1.0; the grouped mean reports 1.0 in that case
# and always has one row per cover type.
percents_df = booleans_with_target.groupby('Cover_Type')[list(boolean_df.columns)].mean()

In [None]:
# Display the table of per-cover-type shares.
percents_df

In [None]:
# Put 0 in place of any missing entries (the heatmap itself would also
# render with NaNs left in).
percents_df = percents_df.fillna(0)
percents_df

In [None]:
# Heatmap of the share table: one row per cover type, one column per boolean feature.
sns.set(style="white", font_scale=1)
f, ax = plt.subplots(figsize=(20, 6))
f.suptitle("Percentage map by color", fontsize = 10)
cmap = sns.diverging_palette(220, 10, as_cmap=True) # Generate a custom diverging colormap
# vmax=1 caps the colour scale at a share of 1; center=0 puts the palette's neutral colour at 0.
sns.heatmap(percents_df, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});