# Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt

import pickle
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then print them.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data load

In [None]:
# Read the competition training set into memory.
TRAIN_CSV_PATH = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
df_train_0 = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows of the raw training frame.
df_train_0.head()

In [None]:
# Report the (rows, columns) size of the training frame.
train_shape = df_train_0.shape
print(train_shape)

In [None]:
# Per-column summary of the training frame: dtype, number of distinct values,
# number of nulls and the value range.
# Uses pandas reductions (`nunique`, `isna().sum()`) rather than Python's
# builtin `len`/`sum`, which iterate over every row one element at a time.
info = [
    [
        c,
        df_train_0[c].dtype,
        # dropna=False keeps NaN counted as a value, like len(unique()) did.
        df_train_0[c].nunique(dropna=False),
        df_train_0[c].isna().sum(),
        df_train_0[c].min(),
        df_train_0[c].max(),
    ]
    for c in df_train_0.columns
]
pd.DataFrame(info, columns=['c_name','c_type','# unique values', '# null values', 'min value', 'max value'])

All features are integers and there are no null values. Soil_Type7 and Soil_Type15 contain no information and can be dropped.

In [None]:
# Remove the two soil-type indicators identified above as carrying no information.
uninformative_cols = ['Soil_Type7', 'Soil_Type15']
df_train_0 = df_train_0.drop(columns=uninformative_cols)

Wilderness_Area# and Soil_Type## all seem to be dummy variables derived from a categorical variable. Are they mutually exclusive?

In [None]:
# How many Wilderness_Area indicators are set on each row, and how often each count occurs.
wilderness_cols = [name for name in df_train_0.columns if 'Wilderness_Area' in name]
wilderness_per_row = df_train_0[wilderness_cols].sum(axis=1)
wilderness_per_row.value_counts()

In [None]:
# How many Soil_Type indicators are set on each row, and how often each count occurs.
soil_cols = [name for name in df_train_0.columns if 'Soil_Type' in name]
soil_per_row = df_train_0[soil_cols].sum(axis=1)
soil_per_row.value_counts()

No, they are not, so I cannot collapse them back into a single categorical variable. I'll keep them as they are, and maybe add some features counting how many of these indicators are set for each record.

In [None]:
# Count, per record, how many Wilderness_Area / Soil_Type indicators are set.
# Columns are selected by prefix rather than by substring: the new 'cnt_*'
# columns also contain 'Wilderness_Area' / 'Soil_Type' in their names, so a
# substring match would add the previous count into the sum whenever this
# cell is executed a second time.
wilderness_indicator_cols = [c for c in df_train_0.columns if c.startswith('Wilderness_Area')]
soil_indicator_cols = [c for c in df_train_0.columns if c.startswith('Soil_Type')]

df_train_0['cnt_Wilderness_Area'] = df_train_0[wilderness_indicator_cols].sum(axis=1)
df_train_0['cnt_Soil_Type'] = df_train_0[soil_indicator_cols].sum(axis=1)

I can also calculate some different distance measures for hydrology.

In [None]:
# Derived distance-to-hydrology features built from the horizontal and vertical offsets.
horiz_hydro = df_train_0['Horizontal_Distance_To_Hydrology']
vert_hydro = df_train_0['Vertical_Distance_To_Hydrology']

# Straight-line distance: sqrt(h^2 + v^2).
df_train_0['Euler_Distance_To_Hydrology'] = np.sqrt(horiz_hydro ** 2 + vert_hydro ** 2)

# NOTE(review): despite the column name this is |h - v|, not the usual
# Manhattan distance |h| + |v|. It is left unchanged here because the scoring
# data must be given exactly the same feature definition as the training data.
df_train_0['Manhattan_Distance_To_Hydrology'] = (horiz_hydro - vert_hydro).abs()

# Exploratory analysis

## Univariate analysis

### Numerical features

In [None]:
# Numerical columns (raw and engineered) inspected one at a time.
univariate_cols = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'cnt_Wilderness_Area', 'cnt_Soil_Type',
    'Euler_Distance_To_Hydrology', 'Manhattan_Distance_To_Hydrology',
]

# One 20-bin histogram per column on a 7x2 grid with a shared y axis.
fig, axs = plt.subplots(nrows=7, ncols=2, sharey=True, tight_layout=True, figsize=(10, 20))
for ax, col in zip(axs.flat, univariate_cols):
    ax.hist(df_train_0[col], bins=20)
    ax.set_title(col)

Aspect and Slope should be in degrees, so in the interval \[0;360). Are they?

In [None]:
# Print the observed minimum and maximum of the two angle columns.
for angle_col in ('Aspect', 'Slope'):
    angle_values = df_train_0[angle_col]
    print(angle_values.min())
    print(angle_values.max())

They are not... I'll make sure they are, and create some indicator variables flagging the records that were out of bounds.

In [None]:
# Flag records whose angle lies outside [0, 359], then print the number and
# the share of flagged records for each column.
# NOTE(review): Slope is checked against the same 0..359 window as Aspect;
# confirm that this bound is the intended one for Slope.
for angle_col in ('Aspect', 'Slope'):
    flag_col = f'flag_{angle_col}_oob'
    out_of_bounds = (df_train_0[angle_col] < 0) | (df_train_0[angle_col] > 359)
    df_train_0[flag_col] = out_of_bounds.astype(int)

    n_flagged = df_train_0[flag_col].sum()
    print(n_flagged, n_flagged / len(df_train_0[flag_col]))

In [None]:
# Wrap both angle columns into [0, 360) with a modulo and print the new range.
for angle_col in ('Aspect', 'Slope'):
    df_train_0[angle_col] = df_train_0[angle_col] % 360
    print(df_train_0[angle_col].min())
    print(df_train_0[angle_col].max())

Also the Hillshade variables should be in the interval \[0,255\]

In [None]:
# For each Hillshade column, flag values outside [0, 255] and print the number
# and the share of flagged records.
for hillshade_col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
    flag_col = f'flag_{hillshade_col}_oob'
    shade = df_train_0[hillshade_col]
    df_train_0[flag_col] = ((shade < 0) | (shade > 255)).astype(int)

    n_flagged = df_train_0[flag_col].sum()
    print(n_flagged, n_flagged / len(df_train_0[flag_col]))

In [None]:
# Clamp the Hillshade columns into [0, 255] and print the resulting range.
# `Series.clip` does this in a single vectorized pass; the previous version
# called two Python lambdas per element through chained `.apply` calls.
for hillshade_col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
    df_train_0[hillshade_col] = df_train_0[hillshade_col].clip(lower=0, upper=255)
    print(df_train_0[hillshade_col].min())
    print(df_train_0[hillshade_col].max())

Now let's plot the distributions again:

In [None]:
# Same 7x2 grid of 20-bin histograms as before, now on the cleaned columns.
fig, axs = plt.subplots(nrows=7, ncols=2, sharey=True, tight_layout=True, figsize=(10, 20))
for ax, col in zip(axs.flat, univariate_cols):
    ax.hist(df_train_0[col], bins=20)
    ax.set_title(col)

### Binary features

In [None]:
# Mean value of each Wilderness_Area indicator (the count column is excluded).
binary_cols = [
    c for c in df_train_0.columns
    if c != 'cnt_Wilderness_Area' and 'Wilderness_Area' in c
]
xticks = list(range(len(binary_cols)))
xlabels = binary_cols
means = df_train_0[binary_cols].mean(axis=0).tolist()

fig, a = plt.subplots(figsize=(6, 6))
a.plot(xticks, means, 'o-')
a.set_title("Wilderness_Area cols mean value")
a.set_xticks(xticks)
a.set_xticklabels(xlabels, rotation=45);

In [None]:
# Mean value of each Soil_Type indicator (the count column is excluded).
binary_cols = [
    c for c in df_train_0.columns
    if c != 'cnt_Soil_Type' and 'Soil_Type' in c
]
xticks = list(range(len(binary_cols)))
xlabels = binary_cols
means = df_train_0[binary_cols].mean(axis=0).tolist()

fig, a = plt.subplots(figsize=(20, 6))
a.plot(xticks, means, 'o-')
a.set_title("Soil_Type cols mean value")
a.set_xticks(xticks)
a.set_xticklabels(xlabels, rotation=45);

### Labels

In [None]:
# Print, for every Cover_Type class in ascending order, its label, the number
# of records and the share of the training set.
# A single vectorized `value_counts` replaces the previous approach, which ran
# Python's builtin `sum` over the whole column once per class.
class_counts = df_train_0['Cover_Type'].value_counts().sort_index()
total_rows = df_train_0.shape[0]

for l, num_events in class_counts.items():
    perc_events = num_events / total_rows
    print(l, num_events, perc_events)

Classes 1 and 2 are well represented in the train dataset.
Classes 3, 6 and 7 are underrepresented, and some oversampling may be needed.
Class 5 is practically non-existent. Better to just drop and ignore it.

In [None]:
# Keep every record except those labelled with Cover_Type 5.
is_not_class_5 = df_train_0['Cover_Type'].values != 5
df_train_0 = df_train_0[is_not_class_5]

### Final comments on the univariate analysis

...

## Bivariate analysis

### Correlation Matrix

In [None]:
# Column groups used by the correlation matrices below.

# Continuous features, the engineered distances, the out-of-bounds flags and
# the two per-record indicator counts.
numeric_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Euler_Distance_To_Hydrology',
    'Manhattan_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'flag_Aspect_oob',
    'flag_Slope_oob',
    'flag_Hillshade_9am_oob',
    'flag_Hillshade_Noon_oob',
    'flag_Hillshade_3pm_oob',
    'cnt_Wilderness_Area',
    'cnt_Soil_Type',
]

# Binary indicator columns only. Matching on the prefix keeps the non-binary
# 'cnt_Wilderness_Area' / 'cnt_Soil_Type' columns out of these groups; they
# are already part of numeric_cols, and the univariate plots of the binary
# features exclude them as well.
binary_cols_1 = [c for c in df_train_0.columns if c.startswith('Wilderness_Area')]
binary_cols_2 = [c for c in df_train_0.columns if c.startswith('Soil_Type')]
label_cols = ['Cover_Type']

In [None]:
# Correlation heatmap of the numeric features plus the label.
cols = numeric_cols + label_cols
df_corr = df_train_0[cols].corr()

ticks = list(range(len(cols)))

fig, a = plt.subplots(figsize=(10, 10))
im = a.imshow(df_corr)
cbar = fig.colorbar(im, ax=a)
a.set_xticks(ticks)
a.set_xticklabels(cols, rotation='vertical')
a.set_yticks(ticks)
a.set_yticklabels(cols)

# Write each correlation value, with two decimals, in the centre of its cell.
for row in ticks:
    for col in ticks:
        a.text(col, row, "%0.2f" % df_corr.values[row, col], ha="center", va="center", color="w")

In [None]:
# Correlation heatmap of the Wilderness_Area column group plus the label.
cols = binary_cols_1 + label_cols
df_corr = df_train_0[cols].corr()

fig, a = plt.subplots(figsize=(10, 10))
ticks = list(range(len(cols)))

im = a.imshow(df_corr)
cbar = fig.colorbar(im, ax=a)
a.set_xticks(ticks)
a.set_xticklabels(cols, rotation='vertical')
a.set_yticks(ticks)
a.set_yticklabels(cols)

# Write each correlation value, with two decimals, in the centre of its cell.
corr_values = df_corr.values
for i in ticks:
    for j in ticks:
        text = a.text(j, i, "%0.2f" % corr_values[i, j], ha="center", va="center", color="w")

In [None]:
# Correlation heatmap of the Soil_Type column group plus the label.
cols = binary_cols_2 + label_cols
df_corr = df_train_0[cols].corr()

fig, a = plt.subplots(figsize=(20, 20))
ticks = list(range(len(cols)))

im = a.imshow(df_corr)
cbar = fig.colorbar(im, ax=a)
a.set_xticks(ticks)
a.set_xticklabels(cols, rotation='vertical')
a.set_yticks(ticks)
a.set_yticklabels(cols)

# Write each correlation value, with two decimals, in the centre of its cell.
corr_values = df_corr.values
for i in ticks:
    for j in ticks:
        text = a.text(j, i, "%0.2f" % corr_values[i, j], ha="center", va="center", color="w")

### Final comments on the bivariate analysis

# Models

In [None]:
# Column roles: record identifier, target label, and everything else as features.
id_column, label_column = 'Id', 'Cover_Type'
non_feature_columns = {id_column, label_column}
feat_columns = [name for name in df_train_0.columns if name not in non_feature_columns]

In [None]:
# Split the cleaned data into a training part and a small hold-out part by
# sampling record ids without replacement.
RANDOM_SEED = 42
TRAIN_FRACTION = 0.97

# Draw from a generator seeded with RANDOM_SEED so that the split (and hence
# every downstream metric) is identical on each run. Previously the constant
# was defined but never used and the draw came from the unseeded global state.
rng = np.random.default_rng(RANDOM_SEED)
n_train = int(df_train_0.shape[0] * TRAIN_FRACTION)

p_id_train = set(rng.choice(df_train_0[id_column].values, size=n_train, replace=False))
p_id_test = set(df_train_0[id_column]) - p_id_train
print(len(p_id_train))
print(len(p_id_test))

df_train = df_train_0.loc[df_train_0[id_column].isin(p_id_train)].reset_index(drop=True)
df_test = df_train_0.loc[df_train_0[id_column].isin(p_id_test)].reset_index(drop=True)

In [None]:
# Persist both splits to the working directory.
# Context managers ensure each file is flushed and closed as soon as it has
# been written; the previous bare open() calls never closed their handles.
with open("df_train.pkl", "wb") as train_file:
    pickle.dump(df_train, train_file)
with open("df_test.pkl", "wb") as test_file:
    pickle.dump(df_test, test_file)

In [None]:
# Drop the in-memory frames; the two splits were pickled to disk just above.
del df_train
del df_test
del df_train_0

## Catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Multi-class CatBoost classifier trained on CPU. iterations, learning_rate,
# max_depth and verbose are not set here, so the library defaults apply.
catboost_params = {
    "task_type": "CPU",
    "loss_function": 'MultiClass',
}
clf = CatBoostClassifier(**catboost_params)

In [None]:
# Reload the two splits written earlier by this notebook, fit the classifier
# with the hold-out split as evaluation set, and save the fitted model.
# Files are opened through context managers so the handles are closed right
# after loading; the previous `pickle.load(open(...))` calls leaked them.

# Hold-out split: features as float, labels as strings.
with open("df_test.pkl", "rb") as test_file:
    df_test = pickle.load(test_file)
X_val = df_test.drop(columns=['Id','Cover_Type']).astype(float)
Y_val = df_test['Cover_Type'].values.astype(str)

# Training split, prepared the same way.
with open("df_train.pkl", "rb") as train_file:
    df = pickle.load(train_file)
X = df.drop(columns=['Id','Cover_Type']).astype(float)
Y = df['Cover_Type'].values.astype(str)

clf.fit(X, Y, eval_set=(X_val, Y_val))
clf.save_model('model.cbm')

# The training frame is no longer needed once X and Y have been extracted.
del df

In [None]:
# Reload the saved model from disk and score the hold-out features.
model = CatBoostClassifier()
model.load_model('model.cbm')

# Per-class probabilities and the predicted class for every hold-out record.
preds_proba = model.predict_proba(X_val)
preds_class = model.predict(X_val)

In [None]:
# Class labels in the order stored on the fitted model, which is the order of
# the columns of `preds_proba`. Deriving the list from the hold-out labels
# instead would shift every later column index whenever a rare class happens
# to be missing from the hold-out split.
labels = [str(c) for c in model.classes_]
labels

In [None]:
# One-vs-rest ROC curve for every class, drawn on a single set of axes.
fig, a = plt.subplots(figsize=(6, 6))
lw = 2

for class_idx, class_label in enumerate(labels):
    # Binary target for this class and the matching probability column.
    is_class = (Y_val == class_label).astype(int)
    class_scores = preds_proba[:, class_idx]

    fpr, tpr, _ = roc_curve(is_class, class_scores)
    roc_auc = auc(fpr, tpr)

    a.plot(fpr, tpr, lw=lw, label="ROC curve label %s (area = %0.2f)" % (class_label, roc_auc))

# Diagonal reference line.
a.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
a.set_xlim([0.0, 1.0])
a.set_ylim([0.0, 1.05])
a.set_xlabel("False Positive Rate")
a.set_ylabel("True Positive Rate")
a.set_title("Receiver operating characteristic")
a.legend(loc="lower right")
plt.show()

In [None]:
# Pair every hold-out record id with its predicted class.
final_data = {
    'Id': df_test['Id'],
    'Cover_Type': preds_class.ravel(),
}
df_final_data = pd.DataFrame(final_data)

In [None]:
# Display the hold-out predictions.
df_final_data

In [None]:
# Number of hold-out records assigned to each predicted class.
df_final_data['Cover_Type'].value_counts()

## Submission

In [None]:
# Load the competition test set and rebuild the engineered features in the same
# creation order as on the training data.
# Compared with the previous version, the Hillshade clamp uses the vectorized
# `Series.clip` instead of two chained per-element `.apply(lambda ...)` passes,
# and the column drop no longer mutates the frame in place.
df_score_0 = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv')

df_score_0 = df_score_0.drop(columns=['Soil_Type7', 'Soil_Type15'])

# Per-record counts of the indicators that are set. Selected by prefix so the
# new 'cnt_*' columns can never be included in their own sums.
wilderness_indicator_cols = [c for c in df_score_0.columns if c.startswith('Wilderness_Area')]
soil_indicator_cols = [c for c in df_score_0.columns if c.startswith('Soil_Type')]
df_score_0['cnt_Wilderness_Area'] = df_score_0[wilderness_indicator_cols].sum(axis=1)
df_score_0['cnt_Soil_Type'] = df_score_0[soil_indicator_cols].sum(axis=1)

# Distance-to-hydrology features.
# NOTE(review): the 'Manhattan' column is |h - v|, not |h| + |v|; kept as-is
# so that it matches the feature the model was trained on.
horiz_hydro = df_score_0['Horizontal_Distance_To_Hydrology']
vert_hydro = df_score_0['Vertical_Distance_To_Hydrology']
df_score_0['Euler_Distance_To_Hydrology'] = np.sqrt(np.power(horiz_hydro, 2) + np.power(vert_hydro, 2))
df_score_0['Manhattan_Distance_To_Hydrology'] = np.abs(horiz_hydro - vert_hydro)

# Angles: flag values outside [0, 359] on the raw data, then wrap into [0, 360).
for angle_col in ('Aspect', 'Slope'):
    raw_angle = df_score_0[angle_col]
    df_score_0[f'flag_{angle_col}_oob'] = ((raw_angle < 0) | (raw_angle > 359)).astype(int)
    df_score_0[angle_col] = np.mod(raw_angle, 360)

# Hillshade: flag values outside [0, 255] on the raw data, then clamp into range.
for hillshade_col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
    raw_shade = df_score_0[hillshade_col]
    df_score_0[f'flag_{hillshade_col}_oob'] = ((raw_shade < 0) | (raw_shade > 255)).astype(int)
    df_score_0[hillshade_col] = raw_shade.clip(lower=0, upper=255)

In [None]:
# (rows, columns) of the prepared scoring frame.
df_score_0.shape

In [None]:
# Feature matrix for scoring: every column except the identifier, cast to float.
score_features = df_score_0.drop(columns='Id')
Xscore = score_features.astype(float)

# Restore the trained classifier from disk.
model = CatBoostClassifier()
model.load_model('model.cbm')

# Predicted class for every record to be scored.
preds_class = model.predict(Xscore)

In [None]:
# Pair every scored record id with its predicted class.
scored_data = {
    'Id': df_score_0['Id'],
    'Cover_Type': preds_class.ravel(),
}
df_scored_data = pd.DataFrame(scored_data)

In [None]:
# Display the predictions that will be submitted.
df_scored_data

In [None]:
# Number of scored records assigned to each predicted class.
df_scored_data['Cover_Type'].value_counts()

In [None]:
# Write the submission file (Id, Cover_Type) without the frame index.
SUBMISSION_FILE = 'submission.csv'
df_scored_data.to_csv(SUBMISSION_FILE, index=False)