# 1 Introduction

This EDA explores the data available for the Tabular Playground Series - December 2021 competition. Simple data exploration is performed, as well as preliminary modeling.

In [None]:
import pandas as pd
import numpy as np
import gc

# Load the competition train/test CSVs into module-level frames used by every
# later cell. The paths are relative, so they resolve against the notebook's
# working directory (presumably the Kaggle kernel layout - confirm when running elsewhere).
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")


In [None]:
def cat_column_info(column):
    """Print a train/test summary of one column treated as categorical.

    Reads the module-level ``train`` and ``test`` frames and reports, for each:
    dtype, number of distinct values, the distinct values themselves (only when
    ``train`` has fewer than 10 of them), NaN count, maximum and minimum.

    Args:
        column: name of a column present in both ``train`` and ``test``.
    """
    train_col = train[column]
    test_col = test[column]
    num_categories = train_col.nunique()

    report = [
        "------> {} <------".format(column),
        "--: train - type {}".format(train_col.dtype),
        "--: test  - type {}".format(test_col.dtype),
        "--: train - # categories {}".format(num_categories),
        "--: test  - # categories {}".format(test_col.nunique()),
    ]

    if num_categories < 10:
        # The dtype of the *train* column decides, for both frames, whether the
        # distinct values are sorted before being shown.
        if train_col.dtype == "int64":
            train_values = np.sort(train_col.unique())
            test_values = np.sort(test_col.unique())
        else:
            train_values = train_col.unique()
            test_values = test_col.unique()
        report.append("--: train - values {}".format(train_values))
        report.append("--: test  - values {}".format(test_values))

    report.append("--: train - NaN count {}".format(train_col.isnull().values.sum()))
    report.append("--: test  - NaN count {}".format(test_col.isnull().values.sum()))
    report.append("--: train - max value {}".format(train_col.max()))
    report.append("--: test  - max value {}".format(test_col.max()))
    report.append("--: train - min value {}".format(train_col.min()))
    report.append("--: test  - min value {}".format(test_col.min()))
    # Trailing empty entry reproduces the blank separator line after each column.
    report.append("")

    print("\n".join(report))

def cont_column_info(column):
    """Print a train/test summary of one column treated as continuous.

    Reads the module-level ``train`` and ``test`` frames and reports, for each:
    dtype, minimum, maximum and NaN count, followed by a blank line.

    Args:
        column: name of a column present in both ``train`` and ``test``.
    """
    train_col = train[column]
    test_col = test[column]

    report = [
        "------> {} <------".format(column),
        "--: train - type {}".format(train_col.dtype),
        "--: test  - type {}".format(test_col.dtype),
        "--: train - min {}".format(train_col.min()),
        "--: test  - min {}".format(test_col.min()),
        "--: train - max {}".format(train_col.max()),
        "--: test  - max {}".format(test_col.max()),
        "--: train - NaN count {}".format(train_col.isnull().values.sum()),
        "--: test  - NaN count {}".format(test_col.isnull().values.sum()),
        "",  # blank separator line after each column
    ]
    print("\n".join(report))
    
# Report the shape tuple of each frame, one per line, then a blank separator line.
print(
    ": Train shape {}".format(train.shape),
    ": Test shape {}".format(test.shape),
    "",
    sep="\n",
)

In [None]:
# Feature columns, in the same order as the original hand-written list:
# ten terrain measurements, then Wilderness_Area1-4, then Soil_Type1-40.
features = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
features += ['Wilderness_Area{}'.format(number) for number in range(1, 5)]
features += ['Soil_Type{}'.format(number) for number in range(1, 41)]

# Print the per-column train/test summary for every feature.
for feature in features:
    cat_column_info(feature)

## 1.1 Training and Testing Files

Our input data consists of:

* `train.csv` - 523 MB in size, containing 56 columns and 4,000,000 rows
* `test.csv` - 129 MB in size, containing 55 columns and 1,000,000 rows

The main observation is that while it will fit in memory, model training may exert pressure on the Kaggle 16 GB CPU memory and GPU memory limitations. We should definitely explore what column formats are at play, and whether running functions to [reduce memory usage](https://www.kaggle.com/gemartin/load-data-reduce-memory-usage) on Pandas dataframes can ease pressure on memory. 

Instead of running the reduce memory usage function, we can dig a little deeper into the feature columns (see output from hidden cell above). We can see that `Soil_Type` features and `Wilderness_Area` features are binary. This means they can be recast to `int8` without problems. The `Cover_Type` feature can also be recast to `int8`. Furthermore, the max and minimum values for all remaining features suggest they can be recast to `int16`. This should reduce dataframe usage, and is simple to implement (see output from hidden cell below).

In [None]:
# Shrink the frames: Soil_Type*/Wilderness_Area* columns become int8, every
# other feature becomes int16. astype() does not range-check, so this relies on
# the min/max values printed by the column summaries above.
for feature in features:
    is_flag_column = feature.startswith(("Soil_Type", "Wilderness_Area"))
    narrow_dtype = np.int8 if is_flag_column else np.int16
    train[feature] = train[feature].astype(narrow_dtype)
    test[feature] = test[feature].astype(narrow_dtype)

# The target only exists in the training frame.
train["Cover_Type"] = train["Cover_Type"].astype(np.int8)

# 2 Features

## 2.1 `id` Column

The `id` column is an `int64` integer column that contains unique record identifiers ranging from 0 to 3,999,999. As in most Tabular Playground Series competitions, it simply identifies the record and is unlikely to be of use for modelling purposes.

## 2.2 `Cover_Type` Column

The `Cover_Type` column contains the class targets we are attempting to predict. This is a multi-class classification problem. We should look first to see what class breakdown we have.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
sns_params = {"palette": "bwr_r"}

# Number of training rows per Cover_Type. The result is kept as a Series and
# read through .index / .values because the name pandas gives the counts differs
# between versions ("Cover_Type" before 2.0, "count" from 2.0 onwards), so
# attribute access such as counts.Cover_Type is not portable.
counts = train["Cover_Type"].value_counts()

fig, ax = plt.subplots(figsize=[20, 5])
_ = sns.barplot(x=counts.index, y=counts.values, ax=ax, **sns_params)

# Annotate each bar with its record count, centred on top of the bar.
for p in ax.patches:
    ax.text(x=p.get_x()+(p.get_width()/2), y=p.get_height(), s="{:,d}".format(round(p.get_height())), ha="center")
_ = ax.set_title("Class Balance", fontsize=15)
_ = ax.set_ylabel("Number of Records", fontsize=15)
_ = ax.set_xlabel("Cover_Type", fontsize=15)

del(counts)
_ = gc.collect()

`Cover_Type` `2` is the most highly represented, followed by `Cover_Type` `1`. Rare `Cover_Type` values include `4`, `5` and `6`. With only a single example of `5`, we may be able to safely ignore it, or discover standout features that make it easy to identify, however it will likely have minimal impact on the classifier. More likely, `Cover_Type` `1` and `2` will be easy to identify, and the majority of the competition will focus on the other rare types.

## 2.3 Null Values

We should also check to see if we are missing any values in the columns.

In [None]:
# Per-row number of missing values, stored as a new column on the training frame
train["null_count"] = train.isnull().sum(axis=1)

# How many rows have 0, 1, 2, ... missing values (only counts below 6 are plotted)
counts = train.groupby("null_count")["Cover_Type"].count().to_dict()
null_data = {"{} Null Value(s)".format(k) : v for k, v in counts.items() if k < 6}

# Pie chart of the grouped counts; `colors` is reused by the test-data cell below
pie, ax = plt.subplots(figsize=[5, 5])
colors = sns.color_palette("bwr_r")[0:5]
ax.pie(
    x=null_data.values(),
    labels=null_data.keys(),
    explode=[0.05]*len(null_data.keys()),
    autopct="%.2f%%",
    pctdistance=0.5,
    colors=colors,
)
_ = ax.set_title("Percentage of Null Values Per Row (Train Data)", fontsize=14)

del(counts)
del(null_data)
_ = gc.collect()

In [None]:
# Per-row number of missing values, stored as a new column on the test frame
test["null_count"] = test.isnull().sum(axis=1)

# How many rows have 0, 1, 2, ... missing values (only counts below 6 are plotted)
counts = test.groupby("null_count")["null_count"].count().to_dict()
null_data = {"{} Null Value(s)".format(k) : v for k, v in counts.items() if k < 6}

# Pie chart of the grouped counts; `colors` comes from the train-data cell above
pie, ax = plt.subplots(figsize=[5, 5])
ax.pie(
    x=null_data.values(),
    labels=null_data.keys(),
    explode=[0.05]*len(null_data.keys()),
    autopct="%.2f%%",
    pctdistance=0.5,
    colors=colors,
)
_ = ax.set_title("Percentage of Null Values Per Row (Test Data)", fontsize=14)

del(counts)
del(null_data)
_ = gc.collect()

With this competition, we're not seeing any missing values. This means we don't have to worry about imputing or creating new features based on null values.

## 2.4 Feature Value Counts

Looking at the column `dtypes` as loaded from the CSV files (before the recast above), every column is an `int64`. This would suggest we may have categorical features. Let's take a look at the number of unique values in each feature to see if we have anything interesting.

In [None]:
# Number of distinct values observed in the training frame for every feature
feature_counts = {name: train[name].nunique() for name in features}

counts = pd.DataFrame.from_dict(feature_counts, orient="index", columns=["counts"])
fig, ax = plt.subplots(figsize=[20, 10])
_ = sns.barplot(x=counts.index, y=counts["counts"], ax=ax, **sns_params)

# Annotate each bar with its value, centred on top of the bar
for p in ax.patches:
    ax.text(x=p.get_x()+(p.get_width()/2), y=p.get_height(), s="{:,d}".format(round(p.get_height())), ha="center")
_ = ax.set_title("Number of Unique Values Per Feature", fontsize=15)
_ = ax.set_ylabel("Number of Unique Values", fontsize=15)
_ = ax.set_xlabel("Feature", fontsize=15)

# Slant the feature names so that the tick labels do not overlap
_ = ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)

del(counts)
_ = gc.collect()

A few interesting observations here:
    
* The following may all be continuous in nature, given that there are many different discrete values:
  * `Elevation`
  * `Aspect`
  * `Slope`
  * `Horizontal_Distance_To_Hydrology`
  * `Vertical_Distance_To_Hydrology`
  * `Horizontal_Distance_To_Roadways`
  * `Hillshade_9am`
  * `Hillshade_Noon`
  * `Hillshade_3pm`
  * `Horizontal_Distance_To_Fire_Points`
* The following may be binary in nature, given there are only 2 discrete values:
  * `Wilderness_Area1`
  * `Wilderness_Area2`
  * `Wilderness_Area3`
  * `Wilderness_Area4`
  * `Soil_Type1`
  * `Soil_Type2`
  * `Soil_Type3`
  * `Soil_Type4`
  * `Soil_Type5`
  * `Soil_Type6`
  * `Soil_Type8`
  * `Soil_Type9`
  * `Soil_Type10`
  * `Soil_Type11`
  * `Soil_Type12`
  * `Soil_Type13`
  * `Soil_Type14`
  * `Soil_Type16`
  * `Soil_Type17`
  * `Soil_Type18`
  * `Soil_Type19`
  * `Soil_Type20`
  * `Soil_Type21`
  * `Soil_Type22`
  * `Soil_Type23`
  * `Soil_Type24`
  * `Soil_Type25`
  * `Soil_Type26`
  * `Soil_Type27`
  * `Soil_Type28`
  * `Soil_Type29`
  * `Soil_Type30`
  * `Soil_Type31`
  * `Soil_Type32`
  * `Soil_Type33`
  * `Soil_Type34`
  * `Soil_Type35`
  * `Soil_Type36`
  * `Soil_Type37`
  * `Soil_Type38`
  * `Soil_Type39`
  * `Soil_Type40`
* The following columns can be dropped altogether as they only ever have a single value, and no nulls are present:
  * `Soil_Type7`
  * `Soil_Type15`

## 2.5 Binary Column Exploration

Let's take a look to see if there is any affinity for binary values to be associated with any particular class.

In [None]:
# Binary indicator columns to plot: Wilderness_Area1-4 followed by every
# Soil_Type column except 7 and 15 (the two single-valued columns).
cat_features = ['Wilderness_Area{}'.format(number) for number in range(1, 5)]
cat_features += ['Soil_Type{}'.format(number) for number in range(1, 41) if number not in (7, 15)]

# Derive the grid height from the feature list instead of hard-coding it, so
# the loop can never index past the end of cat_features.
n_cols = 3
n_rows = -(-len(cat_features) // n_cols)  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*4, 11*3), squeeze=False, sharey=True)
panels = axs.ravel()

for feature_name, panel in zip(cat_features, panels):
    # reset_index(name=...) names the count column explicitly. Renaming column
    # 0 only works on pandas < 2.0; newer versions call that column "count".
    x = (
        train[[feature_name, "Cover_Type"]]
        .value_counts()
        .sort_index()
        .reset_index(name="# of Samples")
    )
    g = sns.barplot(x=feature_name, y="# of Samples", hue="Cover_Type", data=x, ax=panel)
    g.legend_.remove()
    # Shade the region around the "1" category
    g.axvspan(0.6, 1.6, facecolor='0.1', alpha=0.3)
    # Label this panel; plt.xlabel would label the current axes, not this one
    g.set_xlabel(feature_name)
    del(x)

# Hide any panels left over when the feature count is not a multiple of n_cols
for panel in panels[len(cat_features):]:
    panel.set_visible(False)

# One shared legend for the whole figure, taken from the last panel drawn
handles, labels = g.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
plt.tight_layout()
plt.show()

_ = gc.collect()

Here we can see quite clearly that most soil types with a value of `0` will have a corresponding `Cover_Type` of `1` or `2`. This isn't unexpected, given that the class balance is very tipped towards those two types of covers. In other words, we don't see any magic binary feature, but we weren't really expecting to see one. Perhaps looking at second order properties will give us more information.

## 2.6 `Soil_Type` Features

Let's look a little closer at some of the `Soil_Type` features. More specifically, let's look at what happens when 1 or more of them are set.

In [None]:
# Number of Soil_Type flags set per row, added to both frames as a new feature
soil_features = [name for name in features if name.startswith("Soil_Type")]
train["soil_type_count"] = train[soil_features].sum(axis=1)
test["soil_type_count"] = test[soil_features].sum(axis=1)

# Rows per (soil_type_count, Cover_Type) pair. reset_index(name=...) names the
# count column explicitly; renaming column 0 only works on pandas < 2.0, where
# newer versions call that column "count".
x = (
    train[["soil_type_count", "Cover_Type"]]
    .value_counts()
    .sort_index()
    .reset_index(name="# of Samples")
)
fig, ax = plt.subplots(figsize=(20, 10))
g = sns.barplot(x="soil_type_count", y="# of Samples", hue="Cover_Type", data=x, ax=ax)

# Shade alternate count groups to make neighbouring groups easier to tell apart
for span_start, span_end in ((0.6, 1.6), (2.6, 3.6), (4.6, 5.6), (6.6, 7.6)):
    g.axvspan(span_start, span_end, facecolor='0.1', alpha=0.2)

# Replace the per-axes legend with a figure-level one
g.legend_.remove()
handles, labels = g.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
_ = ax.set_title("Sum of Soil_Type Features vs Cover_Type", fontsize=15)
_ = ax.set_ylabel("# of Samples", fontsize=15)
_ = ax.set_xlabel("Sum of Soil_Type Binary Features = 1", fontsize=15)

del(x)
_ = gc.collect()

A few interesting things to point out:

* `Cover_Type` values of `1` and `2` are predominant regardless of how many binary features are set to `1`. However, in the majority of cases where none are set, the most likely candidates are a `Cover_Type` of `1`, `2`, or `3`.
* When there are between `1` and `3` of the binary features set, we see cover types `5` and `6` begin to appear.

We would very likely gain some information by creating a column that contains the sum of the `Soil_Type` features.

## 2.7 `Wilderness_Area` Features

Let's look a little closer at some of the `Wilderness_Area` features. More specifically, let's look at what happens when 1 or more of them are set.

In [None]:
# Number of Wilderness_Area flags set per row, added to both frames as a new feature
wilderness_features = [name for name in features if name.startswith("Wilderness_Area")]
train["wilderness_area_count"] = train[wilderness_features].sum(axis=1)
test["wilderness_area_count"] = test[wilderness_features].sum(axis=1)

# Rows per (wilderness_area_count, Cover_Type) pair. reset_index(name=...) names
# the count column explicitly; renaming column 0 only works on pandas < 2.0,
# where newer versions call that column "count".
x = (
    train[["wilderness_area_count", "Cover_Type"]]
    .value_counts()
    .sort_index()
    .reset_index(name="# of Samples")
)
fig, ax = plt.subplots(figsize=(20, 10))
g = sns.barplot(x="wilderness_area_count", y="# of Samples", hue="Cover_Type", data=x, ax=ax)

# Shade alternate count groups to make neighbouring groups easier to tell apart
for span_start, span_end in ((0.6, 1.6), (2.6, 3.6)):
    g.axvspan(span_start, span_end, facecolor='0.1', alpha=0.2)

# Replace the per-axes legend with a figure-level one
g.legend_.remove()
handles, labels = g.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
_ = ax.set_title("Sum of Wilderness_Area Features vs Cover_Type", fontsize=15)
_ = ax.set_ylabel("# of Samples", fontsize=15)
_ = ax.set_xlabel("Sum of Wilderness_Area Binary Features = 1", fontsize=15)

del(x)
_ = gc.collect()

Similar to the `Soil_Type` feature sums, the `Wilderness_Area` sum shows that we have `Cover_Type` values of `6` and `7` showing when there is at least 1 `Wilderness_Area` set. 

## 2.8 Spearman Correlation

We should also check to see what variables are correlated to one another. We'll check the Spearman correlation first, since it does not make assumptions about distribution types or linearity. With Spearman correlation, we have values that range from -1 to +1. Values near either extreme indicate a strong negative or positive correlation, while those around 0 mean no correlation exists. In the case of the heatmap below, any values near 0 will appear white, while negative or positive correlations will appear darker blue or darker red accordingly.

In [None]:
# Correlate every feature plus the target with each other (rank-based)
columns_to_check = features + ["Cover_Type"]
correlation_matrix = train[columns_to_check].corr(method="spearman")

from matplotlib.colors import SymLogNorm

f, ax = plt.subplots(figsize=(20, 20))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Hide the diagonal and upper triangle: the matrix is symmetric
upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool))
# Symmetric log scaling makes weak correlations visible next to strong ones
colour_scale = SymLogNorm(linthresh=0.03, linscale=0.03, vmin=-1.0, vmax=1.0, base=10)
_ = sns.heatmap(
    correlation_matrix,
    mask=upper_triangle,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.1,
    norm=colour_scale,
    cbar=False,
)
_ = ax.set_title("Spearman Correlation Between Features", fontsize=15)

A few observations:
    
* As expected, `Soil_Type7` and `Soil_Type15` both appear in white. Since they have a single value, their correlation with every other variable is undefined (`NaN`), so those cells are left blank.
* The `Elevation` feature appears to be strongly correlated to `Cover_Type`.
* Other correlations to `Cover_Type` include:
   * `Horizontal_Distance_To_Roadways`
   * `Horizontal_Distance_To_Fire_Points`
   * `Wilderness_Area1`
   * `Wilderness_Area2`

## 2.9 Hydrology Map

Some features may provide `Cover_Type` separation when combined. Of these, the `Distance_To_Hydrology` provides an intuitive 2D mapping for us to examine the separation of `Cover_Type` values.

In [None]:
# Shift both hydrology distances by the magnitude of their minimum so they can
# be used as array indices.
min_horiz_distance_to_hydrology = abs(train["Horizontal_Distance_To_Hydrology"].min())
min_vert_distance_to_hydrology = abs(train["Vertical_Distance_To_Hydrology"].min())

train["horiz_distance"] = train["Horizontal_Distance_To_Hydrology"] + min_horiz_distance_to_hydrology
train["vert_distance"] = train["Vertical_Distance_To_Hydrology"] + min_vert_distance_to_hydrology

train["horiz_distance"] = train["horiz_distance"].astype(np.int16)
train["vert_distance"] = train["vert_distance"].astype(np.int16)

max_horiz_distance_to_hydrology = train["horiz_distance"].max()
max_vert_distance_to_hydrology = train["vert_distance"].max()

# One (vertical x horizontal) count grid per panel. Grids 1-7 hold one
# Cover_Type each; grid 8 stays all-zero and only fills the last panel of the
# 4x2 figure drawn in the next cell.
heatmap_shape = (int(max_vert_distance_to_hydrology) + 1, int(max_horiz_distance_to_hydrology) + 1)
(
    heatmap_array_1, heatmap_array_2, heatmap_array_3, heatmap_array_4,
    heatmap_array_5, heatmap_array_6, heatmap_array_7, heatmap_array_8,
) = (np.zeros(shape=heatmap_shape) for _ in range(8))

heatmaps_by_cover_type = {
    1: heatmap_array_1, 2: heatmap_array_2, 3: heatmap_array_3, 4: heatmap_array_4,
    5: heatmap_array_5, 6: heatmap_array_6, 7: heatmap_array_7,
}

# Vectorised replacement for a Python-level iterrows() loop over every training
# row. np.add.at accumulates unbuffered, so a coordinate that occurs many times
# is counted once per occurrence (plain fancy-index += would count it once).
vert_index = train["vert_distance"].to_numpy()
horiz_index = train["horiz_distance"].to_numpy()
cover_type = train["Cover_Type"].to_numpy()

for cover_value, heatmap in heatmaps_by_cover_type.items():
    in_class = cover_type == cover_value
    np.add.at(heatmap, (vert_index[in_class], horiz_index[in_class]), 1)

del(vert_index, horiz_index, cover_type, heatmaps_by_cover_type)

In [None]:
class_heatmaps = [
    heatmap_array_1, heatmap_array_2, heatmap_array_3, heatmap_array_4,
    heatmap_array_5, heatmap_array_6, heatmap_array_7,
]

# Rescale each class grid in place so its busiest cell maps to 255. A grid with
# no samples is left as zeros: dividing by a zero maximum would turn the whole
# grid into NaN.
for heatmap in class_heatmaps:
    peak = heatmap.max()
    if peak > 0:
        heatmap *= 255.0/peak

# 4x2 grid filled row by row: Cover_Type 1-7, then the empty eighth array with
# a blank title.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 30))
panel_arrays = class_heatmaps + [heatmap_array_8]
panel_titles = ["Cover_Type = {}".format(cover_value) for cover_value in range(1, 8)] + [""]

for panel, heatmap, title in zip(axs.ravel(), panel_arrays, panel_titles):
    panel.imshow(heatmap, cmap='hot', interpolation='nearest')
    _ = panel.set_title(title, fontweight="bold", size=15)
    _ = panel.grid(False)

A few observations:
    
* Both `Cover_Type` of `1` and `2` look very similar in distribution.
* As discussed, `Cover_Type` of `4`, `5`, and `6` are difficult to see, simply due to rarity of those classes.
* When comparing `Cover_Type` of `7` to `1`, we can see that `7` is more spread out vertically, and horizontally. This suggests that separation between `1` & `2` and `7` is possible to some degree.

In order to make use of this relationship however, we need to transform it a little into a single continuous feature. We'll calculate the Euclidean distance of each point from the origin of (0, 0) as well as the Manhattan distance. Then we'll look to see if we can separate them a little more.

In [None]:
from scipy.spatial import distance

origin = (0, 0)

def euclidean_horiz_and_vert_distance(row):
    """Return the Euclidean distance from ``origin`` for a single row.

    Args:
        row: mapping-like object with ``"horiz_distance"`` and
            ``"vert_distance"`` entries (e.g. one row of ``train``).

    Returns:
        The straight-line distance between ``origin`` and the point
        ``(horiz_distance, vert_distance)``.
    """
    return distance.euclidean(origin, (row["horiz_distance"], row["vert_distance"]))

# Column-wise equivalent of applying the helper above to every row: with the
# origin at (0, 0) the distance is hypot(horiz, vert). Casting to float64 first
# keeps the result in double precision regardless of the int16 inputs, and
# avoids a Python-level call per training row.
train["hydrology_euclidean_distance"] = np.hypot(
    train["horiz_distance"].astype(np.float64),
    train["vert_distance"].astype(np.float64),
)

In [None]:
# Distribution of the Euclidean hydrology distance within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="hydrology_euclidean_distance", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Hydrology - Euclidean Distance", fontsize=15)
_ = ax.set_title("Hydrology Euclidean Distance vs Cover_Type", fontsize=15)

General observation is that the `Cover_Type` of `4` has a very different hydrology distribution than the other types. This again may help us separate out that particular class. Let's look at the Manhattan distance as well.

In [None]:
def manhattan_horiz_and_vert_distance(row):
    """Return the Manhattan (city-block) distance from ``origin`` for a single row.

    Relies on ``origin`` and ``distance`` defined in the Euclidean-distance cell.

    Args:
        row: mapping-like object with ``"horiz_distance"`` and
            ``"vert_distance"`` entries (e.g. one row of ``train``).

    Returns:
        The sum of the absolute coordinate differences between ``origin`` and
        the point ``(horiz_distance, vert_distance)``.
    """
    return distance.cityblock(origin, (row["horiz_distance"], row["vert_distance"]))

# Column-wise equivalent of applying the helper above to every row: with the
# origin at (0, 0) the distance is |horiz| + |vert|. Computed in float64 so the
# int16 inputs cannot overflow, and without a Python-level call per training row.
train["hydrology_manhattan_distance"] = (
    train["horiz_distance"].astype(np.float64).abs()
    + train["vert_distance"].astype(np.float64).abs()
)

In [None]:
# Distribution of the Manhattan hydrology distance within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="hydrology_manhattan_distance", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Hydrology - Manhattan Distance", fontsize=15)
_ = ax.set_title("Hydrology Manhattan Distance vs Cover_Type", fontsize=15)

Looks like Manhattan distance may give slightly better separation to `Cover_Type` of `4`.

## 2.10 Shade Delta

One interesting feature is the `Hillshade` profile. There are 3 times of day that the shade is given:

* 9:00 am
* 12:00 pm
* 3:00 pm

While the shade itself may not be informative, we should look at the change in the amount of shade over the course of the day. 

In [None]:
# Add three hillshade differences to both frames (train first, then test):
# morning to noon, noon to afternoon, and morning to afternoon.
for frame in (train, test):
    frame["shade_delta_1"] = frame["Hillshade_9am"] - frame["Hillshade_Noon"]
    frame["shade_delta_2"] = frame["Hillshade_Noon"] - frame["Hillshade_3pm"]
    frame["shade_delta_total"] = frame["Hillshade_9am"] - frame["Hillshade_3pm"]

In [None]:
# Distribution of the 9am-to-noon hillshade difference within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="shade_delta_1", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Shade Delta", fontsize=15)
_ = ax.set_title("Change in Shade from 9:00am to 12:00pm", fontsize=15)

The change in shade looks similar across various `Cover_Type` classes. Although in the case of `Cover_Type` `4`, there is a shorter tail to the extreme ends of the shade change spectrum. 

In [None]:
# Distribution of the noon-to-3pm hillshade difference within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="shade_delta_2", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Shade Delta", fontsize=15)
_ = ax.set_title("Change in Shade from 12:00pm to 3:00pm", fontsize=15)

Similar to the first shade changes we saw, in this case, we see the same lack of tail for `Cover_Type` `4`. 

In [None]:
# Distribution of the 9am-to-3pm hillshade difference within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="shade_delta_total", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Shade Delta", fontsize=15)
_ = ax.set_title("Change in Shade from 9:00am to 3:00pm", fontsize=15)

Overall, `Cover_Type` `4` presents a slightly different shade change distribution when compared to the shade changes in the other classes. This may provide a small amount of signal to our classifier. 

## 2.11 `Elevation`

Given that the correlation plot showed that `Elevation` is highly correlated with the `Cover_Type`, we should look to see what the relationship looks like. 

In [None]:
# Distribution of Elevation within each Cover_Type
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.violinplot(data=train, x="Cover_Type", y="Elevation", ax=ax)
_ = ax.set_xlabel("Cover_Type", fontsize=15)
_ = ax.set_ylabel("Elevation", fontsize=15)
_ = ax.set_title("Elevation vs Cover_Type", fontsize=15)

From this we can clearly see there is good separation between `Cover_Type` classes based purely on `Elevation`.  

# 3 Simple Models

Given we know a little about the distribution of data, we should establish a set of baseline models to understand what kind of performance we can get from models.

## 3.1 LightGBM

We'll start with a simple LightGBM model and see how our features work out from there.

In [None]:
# Cover_Type 5 has only one example, so remove it before modelling.
# NOTE: this rebinds `train`; the unfiltered frame is no longer available afterwards.
train = train.loc[train["Cover_Type"].ne(5)]

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

target = train["Cover_Type"]
cv_rounds = 3

k_fold = StratifiedKFold(
    n_splits=cv_rounds,
    random_state=2021,
    shuffle=True,
)

# Select the feature columns once instead of re-slicing the frame for the
# splitter and for both halves of every fold.
feature_frame = train[features]

train_preds = np.zeros(len(train.index), )
train_probas = np.zeros(len(train.index), )  # allocated but never filled in this cell

for fold, (train_index, test_index) in enumerate(k_fold.split(feature_frame, target)):
    x_train = feature_frame.iloc[train_index]
    y_train = target.iloc[train_index]

    x_valid = feature_frame.iloc[test_index]
    y_valid = target.iloc[test_index]

    model = LGBMClassifier(
        random_state=2021,
        n_estimators=2000,
        verbose=-1,
        metric="softmax",
    )
    # NOTE(review): the validation fold also drives early stopping, so the
    # fold scores reported below are likely slightly optimistic.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(50)],
    )

    # Out-of-fold predictions: each row is scored by a model that did not train on it
    train_oof_preds = model.predict(x_valid)
    train_preds[test_index] = train_oof_preds

    print("-- Fold {}:".format(fold+1))
    print("{}".format(classification_report(y_valid, train_oof_preds)))
    print("-- Accuracy: {}".format(accuracy_score(y_valid, train_oof_preds)))

print("-- Overall:")
print("{}".format(classification_report(target, train_preds)))
print("-- Accuracy: {}".format(accuracy_score(target, train_preds)))

train["unmodified_preds"] = train_preds

# Show the confusion matrix. The labels are taken from the classes present in
# the target and passed to confusion_matrix so that tick labels and matrix
# rows/columns always line up.
cover_labels = [int(label) for label in np.sort(target.unique())]
confusion = confusion_matrix(train["Cover_Type"], train["unmodified_preds"], labels=cover_labels)
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(confusion, annot=True, fmt=",d", xticklabels=cover_labels, yticklabels=cover_labels)
_ = ax.set_title("Confusion Matrix for LGB Classifier (Unmodified Dataset)", fontsize=15)
_ = ax.set_ylabel("Actual Class")
_ = ax.set_xlabel("Predicted Class")

del(train_preds)
del(confusion)
del(feature_frame)
_ = gc.collect()

As expected, we do really well with `Cover_Type` classes of `1`, `2`, and `3`.

## 3.2 `Wilderness_Area` and `Soil_Type` Counts

Let's add type counts for the binary features as described above to see if there is any lift to our model.

In [None]:
target = train["Cover_Type"]
cv_rounds = 3

k_fold = StratifiedKFold(
    n_splits=cv_rounds,
    random_state=2021,
    shuffle=True,
)

# Build the experiment's column list locally rather than appending to (and
# later removing from) the shared `features` list: if this cell fails or is
# interrupted midway, `features` is left untouched and the cell can be re-run.
experiment_features = features + ["wilderness_area_count", "soil_type_count"]

# Select the feature columns once instead of re-slicing the frame for every fold
feature_frame = train[experiment_features]

train_preds = np.zeros(len(train.index), )
train_probas = np.zeros(len(train.index), )  # allocated but never filled in this cell

for fold, (train_index, test_index) in enumerate(k_fold.split(feature_frame, target)):
    x_train = feature_frame.iloc[train_index]
    y_train = target.iloc[train_index]

    x_valid = feature_frame.iloc[test_index]
    y_valid = target.iloc[test_index]

    model = LGBMClassifier(
        random_state=2021,
        n_estimators=2000,
        verbose=-1,
        metric="softmax",
    )
    # NOTE(review): the validation fold also drives early stopping, so the
    # fold scores reported below are likely slightly optimistic.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(50)],
    )

    # Out-of-fold predictions: each row is scored by a model that did not train on it
    train_oof_preds = model.predict(x_valid)
    train_preds[test_index] = train_oof_preds

    print("-- Fold {}:".format(fold+1))
    print("{}".format(classification_report(y_valid, train_oof_preds)))
    print("-- Accuracy: {}".format(accuracy_score(y_valid, train_oof_preds)))

print("-- Overall:")
print("{}".format(classification_report(target, train_preds)))
print("-- Accuracy: {}".format(accuracy_score(target, train_preds)))

train["type_count_preds"] = train_preds

# Show the confusion matrix. The labels are taken from the classes present in
# the target and passed to confusion_matrix so that tick labels and matrix
# rows/columns always line up.
cover_labels = [int(label) for label in np.sort(target.unique())]
confusion = confusion_matrix(train["Cover_Type"], train["type_count_preds"], labels=cover_labels)
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(confusion, annot=True, fmt=",d", xticklabels=cover_labels, yticklabels=cover_labels)
_ = ax.set_title("Confusion Matrix for LGB Classifier (Type Counts)", fontsize=15)
_ = ax.set_ylabel("Actual Class")
_ = ax.set_xlabel("Predicted Class")

del(train_preds)
del(confusion)
del(feature_frame)
_ = gc.collect()

Adding the counts has given a definite boost to `Cover_Type` `7` accuracy. This in turn impacts our entire accuracy score.

## 3.3 Shade Delta

Our hypothesis above was that the delta for the shade measurements may impact `Cover_Type` `4`. Let's add in all the delta computations.

In [None]:
target = train["Cover_Type"]
cv_rounds = 3

k_fold = StratifiedKFold(
    n_splits=cv_rounds,
    random_state=2021,
    shuffle=True,
)

# Build the experiment's column list locally rather than appending to (and
# later removing from) the shared `features` list: if this cell fails or is
# interrupted midway, `features` is left untouched and the cell can be re-run.
experiment_features = features + ["shade_delta_1", "shade_delta_2", "shade_delta_total"]

# Select the feature columns once instead of re-slicing the frame for every fold
feature_frame = train[experiment_features]

train_preds = np.zeros(len(train.index), )
test_preds = np.zeros(len(test.index), )  # allocated but never filled in this cell

for fold, (train_index, test_index) in enumerate(k_fold.split(feature_frame, target)):
    x_train = feature_frame.iloc[train_index]
    y_train = target.iloc[train_index]

    x_valid = feature_frame.iloc[test_index]
    y_valid = target.iloc[test_index]

    model = LGBMClassifier(
        random_state=2021,
        n_estimators=2000,
        verbose=-1,
        metric="softmax",
    )
    # NOTE(review): the validation fold also drives early stopping, so the
    # fold scores reported below are likely slightly optimistic.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(50)],
    )

    # Out-of-fold predictions: each row is scored by a model that did not train on it
    train_oof_preds = model.predict(x_valid)
    train_preds[test_index] = train_oof_preds

    print("-- Fold {}:".format(fold+1))
    print("{}".format(classification_report(y_valid, train_oof_preds)))
    print("-- Accuracy: {}".format(accuracy_score(y_valid, train_oof_preds)))

print("-- Overall:")
print("{}".format(classification_report(target, train_preds)))
print("-- Accuracy: {}".format(accuracy_score(target, train_preds)))

train["shade_delta_preds"] = train_preds

# Show the confusion matrix. The labels are taken from the classes present in
# the target and passed to confusion_matrix so that tick labels and matrix
# rows/columns always line up.
cover_labels = [int(label) for label in np.sort(target.unique())]
confusion = confusion_matrix(train["Cover_Type"], train["shade_delta_preds"], labels=cover_labels)
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(confusion, annot=True, fmt=",d", xticklabels=cover_labels, yticklabels=cover_labels)
_ = ax.set_title("Confusion Matrix for LGB Classifier (Shade Delta)", fontsize=15)
_ = ax.set_ylabel("Actual Class")
_ = ax.set_xlabel("Predicted Class")

del(train_preds)
del(confusion)
del(feature_frame)
_ = gc.collect()

Again, we see a small boost to our overall accuracy, this time in `Cover_Type` `4`. 

## 3.4 Manhattan Distance of Hydrology Features

In [None]:
target = train["Cover_Type"]
cv_rounds = 3

k_fold = StratifiedKFold(
    n_splits=cv_rounds,
    random_state=2021,
    shuffle=True,
)

# Build the experiment's column list locally rather than appending to (and
# later removing from) the shared `features` list: if this cell fails or is
# interrupted midway, `features` is left untouched and the cell can be re-run.
experiment_features = features + ["hydrology_manhattan_distance"]

# Select the feature columns once instead of re-slicing the frame for every fold
feature_frame = train[experiment_features]

train_preds = np.zeros(len(train.index), )
test_preds = np.zeros(len(test.index), )  # allocated but never filled in this cell

for fold, (train_index, test_index) in enumerate(k_fold.split(feature_frame, target)):
    x_train = feature_frame.iloc[train_index]
    y_train = target.iloc[train_index]

    x_valid = feature_frame.iloc[test_index]
    y_valid = target.iloc[test_index]

    model = LGBMClassifier(
        random_state=2021,
        n_estimators=2000,
        verbose=-1,
        metric="softmax",
    )
    # NOTE(review): the validation fold also drives early stopping, so the
    # fold scores reported below are likely slightly optimistic.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(50)],
    )

    # Out-of-fold predictions: each row is scored by a model that did not train on it
    train_oof_preds = model.predict(x_valid)
    train_preds[test_index] = train_oof_preds

    print("-- Fold {}:".format(fold+1))
    print("{}".format(classification_report(y_valid, train_oof_preds)))
    print("-- Accuracy: {}".format(accuracy_score(y_valid, train_oof_preds)))

print("-- Overall:")
print("{}".format(classification_report(target, train_preds)))
print("-- Accuracy: {}".format(accuracy_score(target, train_preds)))

train["hydrology_manhattan_preds"] = train_preds

# Show the confusion matrix. The labels are taken from the classes present in
# the target and passed to confusion_matrix so that tick labels and matrix
# rows/columns always line up.
cover_labels = [int(label) for label in np.sort(target.unique())]
confusion = confusion_matrix(train["Cover_Type"], train["hydrology_manhattan_preds"], labels=cover_labels)
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(confusion, annot=True, fmt=",d", xticklabels=cover_labels, yticklabels=cover_labels)
_ = ax.set_title("Confusion Matrix for LGB Classifier (Hydrology Manhattan Distance)", fontsize=15)
_ = ax.set_ylabel("Actual Class")
_ = ax.set_xlabel("Predicted Class")

del(train_preds)
del(confusion)
del(feature_frame)
_ = gc.collect()

## 3.5 Comparison of Approaches

In [None]:
# Overall out-of-fold accuracy of each experiment, one bar per approach.
# Names and prediction columns are listed in matching order.
approach_names = ["Unmodified", "Binary Counts", "Shade Delta", "Hydrology Manhattan Dist"]
prediction_columns = [
    "unmodified_preds",
    "type_count_preds",
    "shade_delta_preds",
    "hydrology_manhattan_preds",
]
approach_scores = [
    float(accuracy_score(target, train[column_name]))
    for column_name in prediction_columns
]

bar, ax = plt.subplots(figsize=(20, 10))
ax = sns.barplot(x=approach_names, y=approach_scores)
_ = ax.set_title("Accuracy Score Based on Approach", fontsize=15)
_ = ax.set_xlabel("Approach")
_ = ax.set_ylabel("Accuracy Score")
# Zoom the y-axis so the small differences between approaches are visible
_ = ax.set(ylim=(0.90, 1.0))

# Print each score, to four decimals, centred on top of its bar
for p in ax.patches:
    height = p.get_height()
    ax.text(
        x=p.get_x()+(p.get_width()/2),
        y=height,
        s="{:.4f}".format(height),
        ha="center"
    )

# More to come...