In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import plot_model, to_categorical

# 1. Read the Data 

In [None]:
# Load the competition train and test tables; `row_id` becomes the index of each frame.
train_data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)
test_data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2022/test.csv',
    index_col='row_id',
)

# 2. Data Exploration

In [None]:
# Preview the first rows of the training frame (displayed as the cell's last expression).
train_data.head()

In [None]:
# Preview the first rows of the test frame (displayed as the cell's last expression).
test_data.head()

In [None]:
# Total number of missing cells in each frame: per-column NaN counts, summed over all columns.
train_nan_total = train_data.isna().sum().sum()
test_nan_total = test_data.isna().sum().sum()
print('# of NaN in train_data : ', train_nan_total)
print('# of NaN in test_data : ', test_nan_total)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame's numeric columns.
train_data.describe()

# 3. EDA

In [None]:
sns.set()
# Helper column of ones: summing it per class gives the number of rows per target.
# It is left on train_data because the feature/target split in section 4-3 drops it by name.
train_data['Count'] = 1
# Aggregate only the helper column; summing every feature column per class is wasted work.
train_data_group = train_data.groupby('target')[['Count']].sum()
plt.figure(figsize = (10, 10))
sns.barplot(x = train_data_group.index, y = train_data_group.Count)
plt.title('Target Count')
# No plt.legend() here: the bars carry no labels, so the call would only emit a
# "no artists with labels" warning and draw nothing.
plt.xticks(rotation = 90)

# 4. Feature Engineering

## 4-1. Remove duplicated rows

In [None]:
# Keep only the first occurrence of each row whose column values are all identical to an earlier row.
train_data = train_data.drop_duplicates(keep='first')

## 4-2. Label-encode the target

In [None]:
# Map the target labels to integer ids; the fitted encoder is kept so predictions
# can be converted back to the original labels later.
Normal_target = LabelEncoder()
encoded_target = Normal_target.fit_transform(train_data['target'])
train_data['target'] = encoded_target

In [None]:
# Show the original class labels; a label's position in this array is its encoded integer id.
print(Normal_target.classes_)

## 4-3. Split train_data into features and target

In [None]:
# Features: every column except the label and the helper 'Count' column added during EDA.
data = train_data.drop(columns = ['target', 'Count'])
target = train_data['target']
# 80/20 hold-out split. The fixed seed makes the split, and therefore every validation
# score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, train_size = 0.8, random_state = 42)

# 5. Model Training

## 5-1. XGBClassifier

In [None]:
# Gradient-boosted trees: 800 depth-3 trees with the multi-class soft-probability objective.
xgb_params = dict(
    objective = 'multi:softprob',
    eval_metric = 'mlogloss',
    learning_rate = 0.1,
    use_label_encoder = False,
    max_depth = 3,
    n_estimators = 800,
)
XG = xgb.XGBClassifier(**xgb_params)
XG.fit(x_train, y_train);  # trailing ';' keeps the cell from echoing the estimator repr

In [None]:
# Mean accuracy of the XGBoost model on the held-out split.
print('The Score for validation on XGBClassifier :', XG.score(x_test, y_test))

In [None]:
# Per-class precision / recall / F1 of the XGBoost model on the held-out split,
# with rows labelled by the original class names.
y_pred = XG.predict(x_test)
target_names = list(Normal_target.classes_)
report = classification_report(y_test, y_pred, target_names = target_names)
print(report)

## 5-2. ExtraTreesClassifier

In [None]:
# 1200 extremely-randomized trees, fitted on all CPU cores. The fixed seed makes the
# forest, and the submission produced from it, reproducible across runs.
ETC = ExtraTreesClassifier(n_estimators = 1200, n_jobs = -1, random_state = 42).fit(x_train, y_train)

In [None]:
# Mean accuracy of the ExtraTrees model on the held-out split.
print('The Score for validation on ExtraTreesClassifier :', ETC.score(x_test, y_test))

In [None]:
# Per-class precision / recall / F1 of the ExtraTrees model on the held-out split,
# with rows labelled by the original class names.
y_pred = ETC.predict(x_test)
target_names = list(Normal_target.classes_)
report = classification_report(y_test, y_pred, target_names = target_names)
print(report)

## 5-3. Deep Learning

In [None]:
# Take the number of classes from the fitted label encoder rather than hard-coding it,
# so the one-hot width and the softmax width always agree with the labels.
n_classes = len(Normal_target.classes_)
# One-hot targets for the softmax output; num_classes pins the width even if the
# highest class id happens to be missing from the training split.
y_train_DL = to_categorical(y_train, num_classes = n_classes)

# Fully connected network: 512 -> 128 -> n_classes, one input per feature column.
model = Sequential()
model.add(Dense(512, activation = 'relu', input_shape = (data.shape[1],)))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(n_classes, activation = 'softmax'))
model.compile(optimizer= 'adam', 
              loss = 'categorical_crossentropy', 
              metrics = ['accuracy'])

# validation_split holds out 20% of x_train for the per-epoch validation metrics.
history = model.fit(x_train, y_train_DL, epochs = 40, validation_split = 0.2, batch_size = 128, verbose = 1)

In [None]:
# Per-epoch training history as a frame: one row per epoch, one column per logged metric.
df_DL = pd.DataFrame(history.history)
plt.plot(df_DL.index, df_DL['loss'], label = 'loss')
plt.plot(df_DL.index, df_DL['val_loss'], label = 'Val_loss')
plt.xlabel( 'Epochs')
# The network is compiled with categorical_crossentropy, so the axis is labelled to match.
plt.ylabel('Categorical_crossentropy')
plt.title('DL loss function')
plt.legend()

In [None]:
# Training vs. validation accuracy per epoch.
for column, label in (('accuracy', 'accuracy'), ('val_accuracy', 'Val_accuracy')):
    plt.plot(df_DL.index, df_DL[column], label = label)
plt.title('DL Accuracy process')
plt.xlabel( 'Epochs')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
# The network outputs one probability per class; the index of the largest one is the
# predicted encoded class id (the inverse of to_categorical).
class_probabilities = model.predict(x_test)
y_pred = np.argmax(class_probabilities, axis = 1)
target_names = list(Normal_target.classes_)
report = classification_report(y_test, y_pred, target_names = target_names)
print(report)

# 6. Submission

In [None]:
# Predict encoded class ids for the test rows with the ExtraTrees model; they are
# decoded back to the original labels when the submission is built.
prediction = ETC.predict(test_data)

In [None]:
# Fill the sample submission's target column with the decoded predictions, save it,
# and display the resulting frame.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv'
)
predicted_labels = Normal_target.inverse_transform(prediction)
submission["target"] = predicted_labels
submission.to_csv("submission.csv", index=False)
submission