In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read The Data

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Print a summary of the frame: its columns, their dtypes and the memory usage.
train_df.info()

In [None]:
# Frequency of each distinct value found in the Soil_Type1 column.
train_df.Soil_Type1.value_counts()

__As we can see, each `Soil_Type` column indicates whether the sample belongs to that soil type (1) or not (0).__

> `Wrangle` The Data

In [None]:
# Total number of missing cells in the frame: per-column null counts, summed again.
train_df.isnull().sum().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
train_df.duplicated().sum()

In [None]:
# Remove the "Id" column so it does not end up among the model features.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
train_df = train_df.drop(columns="Id", errors="ignore")

In [None]:
# Horizontal bar chart of how many rows fall into each Cover_Type class.
ax = train_df.Cover_Type.value_counts().plot(kind='barh', fontsize=15, color='gold')
# With kind='barh' the classes are laid out along the y-axis and the bar length
# (the count) runs along the x-axis, so the labels must be assigned accordingly.
ax.set_xlabel("The Count")
ax.set_ylabel("The Cover-Type")
ax.set_title("The Freq Of Cover Types", fontsize=20)
plt.show()

In [None]:
# Exact number of rows per Cover_Type class.
train_df.Cover_Type.value_counts()

__From the previous cells we can see that the data are `imbalanced` and `not scaled`.__

In [None]:
# Heatmap of the pairwise correlations between the first ten columns of the frame.
continuous_corr = train_df.iloc[:, :10].corr()
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(continuous_corr, cmap="coolwarm", ax=ax)
ax.set_title("The Correlation Matrix For The Continous Features")
plt.show()

# Imbalanced Data

In [None]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Separate the frame into a feature matrix (every column except the target)
# and a target vector, both as NumPy arrays.
y = train_df["Cover_Type"].values
X = train_df.drop(columns=["Cover_Type"]).values

In [None]:
# Shape of the feature matrix before under-sampling.
X.shape

In [None]:
# Shape of the target vector before under-sampling.
y.shape

In [None]:
# Randomly under-sample the training data to reduce class imbalance.
# random_state pins the random draw so the notebook gives the same sample on every run.
# NOTE(review): per the imbalanced-learn docs, sampling_strategy='majority' resamples
# ONLY the single most frequent class, shrinking it to the size of the rarest class;
# all other classes are left untouched. Check the Counter output to confirm this is
# the balance that is actually wanted.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=123)
# Resample from train_df rather than from the current X, y, so that re-running this
# cell does not under-sample data that has already been under-sampled.
X, y = undersample.fit_resample(
    train_df.drop("Cover_Type", axis=1).values,
    train_df.Cover_Type.values,
)

In [None]:
from collections import Counter

# Tally how many samples each class has in the current target vector.
class_counts = Counter(y)
print(class_counts)

In [None]:
# Shape of the feature matrix after under-sampling.
X.shape

In [None]:
# Shape of the target vector after under-sampling.
y.shape

In [None]:
# Rebuild a labelled DataFrame from the resampled arrays.
# Every column name except the last one is reused for the feature matrix, and the
# target vector is attached as a trailing "Cover_Type" column.
columns = train_df.columns[:-1]
balanced_df = pd.DataFrame(data=X, columns=columns).assign(Cover_Type=y)

In [None]:
# Preview the first rows of the resampled frame.
balanced_df.head()

# Scale Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the first ten columns; every other column is left untouched.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that
# happens later in the notebook, so test rows influence the scaling parameters.
# Consider fitting on the training split only.
cols = train_df.columns[:10]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(balanced_df[cols])
balanced_df[cols] = scaled_values

In [None]:
# Preview the first rows again, now with the first ten columns scaled.
balanced_df.head()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the columns at positions 0-2.
first_group = balanced_df.iloc[:, 0:3]
sns.pairplot(data=first_group, diag_kind='hist')
plt.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the columns at positions 3-5.
second_group = balanced_df.iloc[:, 3:6]
sns.pairplot(data=second_group, diag_kind='hist')
plt.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the columns at positions 6-8.
third_group = balanced_df.iloc[:, 6:9]
sns.pairplot(data=third_group, diag_kind='hist')
plt.show()

# Dimensionality Reduction 

In [None]:
# De-encode the Wilderness_Area columns

In [None]:
# Collapse the four indicator columns at positions 10-13 into one categorical code.
# Summing the indicators only counts how many flags are set and discards WHICH
# one is set, so decode with argmax instead:
#   0      -> no flag set in the row
#   1..4   -> 1-based position of the first flag that is set
# NOTE(review): if a row can have several flags set, only the first one is kept.
wilderness_flags = balanced_df.iloc[:, 10:14].to_numpy()
balanced_df['Wilderness_Area'] = np.where(
    wilderness_flags.any(axis=1),
    wilderness_flags.argmax(axis=1) + 1,
    0,
)

In [None]:
# Drop the four columns at positions 10-13 now that they are summarised in
# 'Wilderness_Area'. The selection is positional, so running this cell twice
# removes a different set of columns.
wilderness_labels = balanced_df.columns[10:14]
balanced_df.drop(columns=wilderness_labels, inplace=True)

In [None]:
# De-encode the Soil_Type columns

In [None]:
# Collapse the forty indicator columns at positions 10-49 into one categorical code.
# Summing the indicators only counts how many flags are set and discards WHICH
# one is set, so decode with argmax instead:
#   0       -> no flag set in the row
#   1..40   -> 1-based position of the first flag that is set
# NOTE(review): if a row can have several flags set, only the first one is kept.
soil_flags = balanced_df.iloc[:, 10:50].to_numpy()
balanced_df['Soil_Type'] = np.where(
    soil_flags.any(axis=1),
    soil_flags.argmax(axis=1) + 1,
    0,
)

In [None]:
# Drop the forty columns at positions 10-49 now that they are summarised in
# 'Soil_Type'. The selection is positional, so running this cell twice removes
# a different set of columns.
soil_labels = balanced_df.columns[10:50]
balanced_df.drop(columns=soil_labels, inplace=True)

In [None]:
# Print a summary of the reduced frame to confirm which columns remain.
balanced_df.info()

# Modelling 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

In [None]:
# Final modelling arrays: every remaining column except the target becomes a feature.
y = balanced_df["Cover_Type"].to_numpy()
X = balanced_df.drop(columns=["Cover_Type"]).to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Model Structure
# Fully connected classifier: two ReLU hidden layers followed by a softmax output.
# The input width is read from the training matrix instead of being hard-coded, so
# the network stays valid if the feature-engineering cells change the column count.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(128, activation='relu', kernel_initializer='he_normal'))
# NOTE(review): 8 output units means integer labels must lie in 0..7; presumably
# Cover_Type runs 1..7, leaving index 0 unused. Confirm against the label counts.
model.add(Dense(8, activation='softmax'))  # the output layer

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Compile the Keras model.
# The sparse categorical loss takes integer class labels directly (no one-hot targets).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)

In [None]:
# Train for 20 epochs with mini-batches of 32 samples; verbose=2 logs one line per epoch.
# Re-running this cell continues training the same model object.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    verbose=2,
)

In [None]:
# Plot everything recorded by the most recent fit() call against the epoch index.
history_frame = pd.DataFrame(model.history.history)
ax = history_frame.plot()
ax.set_xlabel('epochs')

In [None]:
# evaluate on test set
from sklearn.metrics import accuracy_score

# The predicted class is the index of the largest softmax output in each row.
class_probs = model.predict(X_test)
yhat = class_probs.argmax(axis=-1).astype('int')

In [None]:
# Share of held-out rows whose predicted class matches the true class.
acc = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.3f' % (acc,))