In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training set and use the 'Id' column as the row index.
train = (
    pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
    .set_index('Id')
)

# code for memory reduction (lossless for integer columns) adapted from GUILLAUME MARTIN:    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive), so integer values are preserved exactly.
    float64 columns whose min and max lie within the float32 range are cast
    to float32; this can lose precision. Columns are never cast to a wider
    dtype than they already have. Non-numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.
    verbose : bool, default True
        If True, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # smallest first
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to a type's limit still fits,
                # so the column must not be pushed to the next wider type.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == np.dtype(np.float64):
            # Only float64 can shrink here. float16/float32 columns are left
            # as they are rather than being widened.
            f32 = np.finfo(np.float32)
            # NaN or infinite extrema fail this test and the column stays float64.
            if c_min >= f32.min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink the numeric dtypes of the training frame and report the memory saved.
train = reduce_mem_usage(train, verbose=True)

# Summary

In this notebook I present a few features I derived from the dataset. The first feature is simply the sum of the soil-type columns at a location. This feature seems to have high importance. I also grouped the soil types by their similarity with regard to the cover types. The resulting features describe how many soil types of a given group are present at a location. These features also seem to carry some importance.

# Sum of Soil-Types

In [None]:
# Row-wise total over every column whose name contains 'Soil'.
soil_columns = [name for name in train.columns if 'Soil' in name]
train['sum_s'] = train[soil_columns].sum(axis=1)

In [None]:
# Distribution of the soil-type sum within each cover type.
sns.boxenplot(
    data=train,
    x='Cover_Type',
    y='sum_s',
    palette='cool',
)
plt.show()

# Grouping the Soil-Types

In [None]:
# Build a (soil-type x cover-type) table: each cell is the mean of that soil
# column over the rows of that cover type. Rows with Cover_Type == 5 are left out.
not_cover_5 = train['Cover_Type'] != 5
for_feat = (
    train.loc[not_cover_5]
    .filter(regex='Soil')
    .assign(y=train['Cover_Type'])  # aligned on the row index
    .groupby('y')
    .mean()
    .transpose()
)
# Drop soil types whose row sums to zero, i.e. that occur with no cover type.
has_any_cover = for_feat.sum(axis=1) != 0
for_feat = for_feat[has_any_cover]

In [None]:
# Preview the first rows of the soil-type x cover-type table.
for_feat.head(n=5)

In [None]:
# Standardise each column of the table, keeping the original row and column labels.
scl = StandardScaler()
scaled_values = scl.fit_transform(for_feat)
for_feat = pd.DataFrame(
    scaled_values,
    index=for_feat.index,
    columns=for_feat.columns,
)

In [None]:
# Cluster the soil types (rows of for_feat) into 5 groups.
km = KMeans(n_clusters=5, random_state=69)
# Exclude a 'Cluster' column left over from an earlier run of this cell, so that
# re-running does not feed the previous labels back in as a feature.
cluster_inputs = for_feat.drop(columns='Cluster', errors='ignore')
for_feat['Cluster'] = km.fit_predict(cluster_inputs)

In [None]:
# For every cluster, add a column to train holding the row-wise sum of the
# soil-type columns that belong to that cluster.
for cluster_id in for_feat['Cluster'].unique():
    member_soils = for_feat.loc[for_feat['Cluster'] == cluster_id].index
    train[f'c{cluster_id}_sum'] = train[member_soils].sum(axis=1)

In [None]:
# Number of soil types assigned to each cluster.
for_feat.loc[:, 'Cluster'].value_counts()

Cluster 2 seems pointless as a new feature because it contains only one soil type.

In [None]:
pal = ['#02ABB7', '#674076', "#F62E97", '#E672E0', 'crimson']

fig, axs = plt.subplots(2, 3, figsize=(19,15))

# One panel per for_feat column on the y-axis, always against column 1 on the
# x-axis, with points coloured by cluster. Panels fill the grid row by row.
y_columns = [2, 3, 4, 6, 7]
for panel, y_col in zip(axs.flat, y_columns):
    sns.scatterplot(ax=panel, x=1, y=y_col, hue='Cluster', palette=pal, data=for_feat)
# The sixth panel has nothing to show; hide its axes.
axs[1,2].set_axis_off()
plt.show()

# Feature Importance

In [None]:
# Split off the target. `pop` removes 'Cover_Type' from train in place, so it is
# guarded: re-running this cell would otherwise raise KeyError. On a re-run the
# y_train from the first run is reused.
if 'Cover_Type' in train.columns:
    y_train = train.pop('Cover_Type')
# Fit a CatBoost classifier on all remaining columns (requires a GPU).
cat = CatBoostClassifier(iterations = 3000, depth= 6, verbose=100, task_type= 'GPU')
cat.fit(train, y_train)

In [None]:
# Pair each training column with the importance reported by the fitted model,
# most important first.
feature_importance = pd.DataFrame({
    'f_importance': cat.get_feature_importance(),
    'feature': train.columns,
}).sort_values('f_importance', ascending=False)

# plt.subplots returns (Figure, Axes); unpack both and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(10,15))
sns.barplot(x="f_importance", y="feature", data=feature_importance, palette='cool', ax=ax)
plt.tight_layout()
plt.show()