In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Read The Data

In [None]:
# Locations of the competition CSV files inside the read-only Kaggle input directory.
train_csv = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
test_csv = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'

# Load both splits into DataFrames.
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# 2. EDA

In [None]:
# Number of distinct values in the target column (dropna=False mirrors len(unique())).
n_cover_types = train_data['Cover_Type'].nunique(dropna=False)
print('The number of categorical target :', n_cover_types)

In [None]:
# Show the first five rows of the training frame.
train_data.head(n=5)

## 2-1. The count of each Cover_Type

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Helper column of ones: a grouped sum over it gives the number of rows per class.
# It is left on train_data because the modelling section drops 'Count' by name.
train_data['Count'] = 1

# Aggregate only the helper column. Summing every feature column of the whole
# training frame is wasted work when only the row count is plotted.
train_data_cover_type = train_data.groupby('Cover_Type')[['Count']].sum()

sns.set()
sns.barplot(x = train_data_cover_type.index, y = train_data_cover_type.Count)
# Trailing ';' keeps the Text repr out of the cell output.
plt.title('The relation between Cover_Type and Count');

## 2-2. Elevation features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(50, 10))
fig.suptitle('The features : Elevation', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Elevation values of the rows whose target equals this cover type.
    elevation = train_data.loc[train_data['Cover_Type'] == cover_type, 'Elevation']
    sns.histplot(x=elevation, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    ax.set_xlabel('Elevation', fontsize=30)
    ax.set_ylabel('Count', fontsize=30)


## 2-3. Aspect Feature

In [None]:
# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(40, 10))
fig.suptitle('The features : Aspect', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Aspect values of the rows whose target equals this cover type.
    aspect = train_data.loc[train_data['Cover_Type'] == cover_type, 'Aspect']
    sns.histplot(x=aspect, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    ax.set_xlabel('Aspect', fontsize=30)
    ax.set_ylabel('Count', fontsize=30)

## 2-4. Slope Feature

In [None]:
# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(40, 10))
fig.suptitle('The features : Slope', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Slope values of the rows whose target equals this cover type.
    slope = train_data.loc[train_data['Cover_Type'] == cover_type, 'Slope']
    sns.histplot(x=slope, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    ax.set_xlabel('Slope', fontsize=30)
    ax.set_ylabel('Count', fontsize=30)

## 2-5. Horizontal_Distance_To_Hydrology Feature

In [None]:
# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(40, 10))
fig.suptitle('The features : Horizontal_Distance_To_Hydrology', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Feature values of the rows whose target equals this cover type.
    distances = train_data.loc[
        train_data['Cover_Type'] == cover_type, 'Horizontal_Distance_To_Hydrology'
    ]
    sns.histplot(x=distances, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    # Smaller font than the other labels: the feature name is long.
    ax.set_xlabel('Horizontal_Distance_To_Hydrology', fontsize=15)
    ax.set_ylabel('Count', fontsize=30)

## 2-6. Vertical_Distance_To_Hydrology Feature

In [None]:

# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(40, 10))
fig.suptitle('The features : Vertical_Distance_To_Hydrology ', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Feature values of the rows whose target equals this cover type.
    distances = train_data.loc[
        train_data['Cover_Type'] == cover_type, 'Vertical_Distance_To_Hydrology'
    ]
    sns.histplot(x=distances, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    # Smaller font than the other labels: the feature name is long.
    ax.set_xlabel('Vertical_Distance_To_Hydrology', fontsize=15)
    ax.set_ylabel('Count', fontsize=30)

## 2-7. Horizontal_Distance_To_Roadways Feature

In [None]:
# One histogram panel per Cover_Type value (1..7), laid out in a single row.
sns.set()
fig, axes = plt.subplots(1, 7, figsize=(40, 10))
fig.suptitle('The features : Horizontal_Distance_To_Roadways ', fontsize=30)
for cover_type, ax in enumerate(axes, start=1):
    # Feature values of the rows whose target equals this cover type.
    distances = train_data.loc[
        train_data['Cover_Type'] == cover_type, 'Horizontal_Distance_To_Roadways'
    ]
    sns.histplot(x=distances, ax=ax)
    ax.set_title(f'Cover_Type : {cover_type}', fontsize=30)
    # Smaller font than the other labels: the feature name is long.
    ax.set_xlabel('Horizontal_Distance_To_Roadways', fontsize=15)
    ax.set_ylabel('Count', fontsize=30)

# 3. Deep Learning Model

In [None]:
# Remove columns that must not be fed to the model: the row identifier and the
# helper 'Count' column added during the EDA.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
train_data = train_data.drop(columns = ['Count', 'Id'], errors = 'ignore')
test_data = test_data.drop(columns = ['Id'], errors = 'ignore')

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model, to_categorical

# Separate the feature columns from the target column.
data = train_data.drop(columns = 'Cover_Type')
target = train_data['Cover_Type']

# Split BEFORE scaling so the scaler never sees the hold-out rows. Fitting it
# on the full frame leaks hold-out min/max statistics into the preprocessing.
# random_state is unchanged, so the row partition is the same as before.
x_train, x_test, y_train, y_test = train_test_split(data, target, train_size = 0.8, random_state = 10)

Scale = MinMaxScaler()
x_train = Scale.fit_transform(x_train)   # learn min/max on the training split only
x_test = Scale.transform(x_test)         # reuse the training-split statistics
data = Scale.transform(data)             # keep `data` as the scaled full feature array

# test_data is replaced by its scaled array. Only transform while it is still a
# DataFrame, so that re-running this cell does not scale it a second time.
if isinstance(test_data, pd.DataFrame):
    test_data = Scale.transform(test_data)

# One-hot encode the labels with an explicit width, so both splits get the same
# number of columns even if the largest label is missing from one of them.
# Column index == label value; the width is therefore max label + 1.
num_classes = int(target.max()) + 1
y_train = to_categorical(y_train, num_classes = num_classes)
y_test = to_categorical(y_test, num_classes = num_classes)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import plot_model, to_categorical

# Layer widths are derived from the prepared arrays rather than hard-coded, so
# the network always matches the feature matrix and the one-hot label matrix.
n_features = x_train.shape[1]
n_outputs = y_train.shape[1]

# Small fully connected classifier: two ReLU hidden layers and a softmax output
# with one unit per column of the one-hot labels.
model = Sequential()
model.add(Dense(128, activation = 'relu', input_shape = (n_features,)))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(n_outputs, activation = 'softmax'))
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# 20% of the training split is held back by Keras for per-epoch validation;
# verbose=0 suppresses the per-epoch log, the metrics are kept in `history`.
history = model.fit(x_train, y_train, epochs = 30, validation_split = 0.2, batch_size = 512, verbose = 0)

## 3-1. Model's Training Process

In [None]:
# Metrics recorded by model.fit, turned into a frame with one column per metric.
training_log = history.history
df_DL = pd.DataFrame(training_log)
df_DL.head()

In [None]:
# Training vs. validation loss per epoch.
plt.plot(df_DL.index, df_DL['loss'], label = 'loss')
plt.plot(df_DL.index, df_DL['val_loss'], label = 'Val_loss')
plt.xlabel('Epochs')
# The model is compiled with categorical cross-entropy, not binary; the axis
# label now names the loss that is actually plotted.
plt.ylabel('Categorical_crossentropy')
plt.title('DL loss function')
# Trailing ';' keeps the Legend repr out of the cell output.
plt.legend();

In [None]:
# Run the trained network on the hold-out split.
y_pred = model.predict(x=x_test)

In [None]:
# Collapse one-hot / probability rows to a single class index per sample.
# The ndim guards make the cell safe to re-run: once an array is already 1-D,
# np.argmax(..., axis=1) would raise because axis 1 no longer exists.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis = 1)
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis = 1)

## 3-2. Comparison between Reality and Prediction -> Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Labels that occur in either the true or the predicted vector, sorted. They
# are passed explicitly so the heatmap ticks show the real Cover_Type values
# instead of positional indices 0..n-1.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)

plt.figure(figsize = (15, 15))
# fmt='d' prints the integer counts in full; the default format abbreviates
# large counts to scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
# confusion_matrix puts the true labels on rows and the predictions on columns.
ax.set_xlabel('Predicted Cover_Type')
ax.set_ylabel('True Cover_Type')
ax.set_title('Confusion matrix on the hold-out split');

# 4. Submission

In [None]:
# Start from the sample submission so the file keeps the expected layout.
sample_csv = '/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv'
submission = pd.read_csv(sample_csv)

# The column index of the largest model output is written as the Cover_Type value.
test_scores = model.predict(test_data)
submission['Cover_Type'] = np.argmax(test_scores, axis = 1)

submission.to_csv('submission.csv', index=False)

In [None]:
# Distinct class values that ended up in the submission.
pd.unique(submission['Cover_Type'])