## Importing Modules

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier,StackingClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Apply seaborn's 'darkgrid' style to the matplotlib/seaborn figures drawn below.
sns.set_style('darkgrid')
# IPython magic: render matplotlib figures inline, beneath the cell that draws them.
%matplotlib inline

## Reading the data

In [None]:
# Load the competition's train and test CSVs into DataFrames in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame (shown as the cell's output).
train.head()

## Data Cleaning

In [None]:
# Count NaN cells over the whole frame: per-column sums first, then the grand total.
train_missing = train.isna().sum().sum()
test_missing = test.isna().sum().sum()

print('Total Missing values in the train dataset : ', train_missing)
print('Total Missing values in the test dataset : ', test_missing)

In [None]:
# Rows are compared on every column except `row_id`
# (presumably a per-row identifier that would hide duplicates -- confirm).
train_dupes = train.drop(columns='row_id').duplicated().sum()
test_dupes = test.drop(columns='row_id').duplicated().sum()

print('Total Duplicate values in the train dataset : ', train_dupes)
print('Total Duplicate values in the test dataset : ', test_dupes)

In [None]:
# Flag every row that repeats an earlier row once `row_id` is set aside
# (duplicated() keeps the first occurrence unflagged).
is_duplicate = train.drop(columns='row_id').duplicated()

# Keep only the unflagged rows and renumber the index from 0.
# Filtering with the boolean mask directly replaces the former
# `mask == True` -> `.index` -> `drop(...)` round-trip; the result is the same.
train = train.loc[~is_duplicate].reset_index(drop=True)

In [None]:
# Repeat the duplicate count (ignoring `row_id`) to confirm what is left
# in each frame at this point of the notebook.
remaining_train_dupes = train.drop(columns=['row_id']).duplicated().sum()
remaining_test_dupes = test.drop(columns=['row_id']).duplicated().sum()

print('Total Duplicate values in the train dataset : ', remaining_train_dupes)
print('Total Duplicate values in the test dataset : ', remaining_test_dupes)

## Data Visualization

In [None]:
# Class balance of `target`, drawn two ways side by side on one wide figure.
plt.figure(figsize=(20, 5))

# Left panel: share of each class as a pie chart with percentage labels.
pie_ax = plt.subplot(1, 2, 1)
class_counts = train['target'].value_counts()
class_counts.plot(kind='pie', autopct='%.2f%%', ax=pie_ax)

# Right panel: absolute number of rows per class.
bar_ax = plt.subplot(1, 2, 2)
sns.countplot(x='target', data=train, ax=bar_ax)
plt.xticks(rotation=90)  # acts on the current axes, i.e. the right-hand panel
plt.show()

In [None]:
# Remove `row_id` so it is not carried into the feature matrix built below.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
train = train.drop(columns='row_id', errors='ignore')

## Splitting

In [None]:
# Separate the label column from the predictors.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
# (`train_test_split` is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## PCA (2D + 3D)

In [None]:
# Project the training features onto their first two principal components.
# random_state pins the result when scikit-learn picks a randomized SVD solver.
# (`PCA` is imported in the notebook's top import cell.)
pca = PCA(n_components=2, random_state=42)
X_pca = pd.DataFrame(
    pca.fit_transform(X_train),
    index=X_train.index,  # same index as y_train, so the concat below aligns row by row
    columns=['PC1', 'PC2'],
)

# Scatter of the 2-D projection, coloured by class label.
plt.figure(figsize=(15, 10))
sns.scatterplot(data=pd.concat([X_pca, y_train], axis=1), x='PC1', y='PC2', hue='target')
plt.show()

In [None]:
# Project the training features onto their first three principal components.
# random_state pins the result when scikit-learn picks a randomized SVD solver.
# (`PCA` and `px` are imported in the notebook's top import cell.)
pca = PCA(n_components=3, random_state=42)
X_pca = pd.DataFrame(
    pca.fit_transform(X_train),
    index=X_train.index,  # same index as y_train, so the concat below aligns row by row
    columns=['PC1', 'PC2', 'PC3'],
)

# Interactive 3-D scatter of the projection, coloured by class label.
fig = px.scatter_3d(pd.concat([X_pca, y_train], axis=1), x='PC1', y='PC2', z='PC3', color='target')
fig.show()

In [None]:
# Fit a PCA that keeps all components, only to see how variance accumulates.
pca = PCA().fit(X_train)
# Running total of the per-component variance ratios, expressed in percent.
exp_ratio = np.cumsum(pca.explained_variance_ratio_) * 100

# Area chart of the cumulative curve. The y-axis label now says what is plotted
# (a cumulative percentage), not the ambiguous "Explained Variance".
px.area(
    x=range(1, exp_ratio.shape[0] + 1),
    y=exp_ratio,
    labels={"x": "# Components", "y": "Cumulative Explained Variance (%)"},
    width=1200, height=400
)

## Model Selection

In [None]:
# Train an extra-trees classifier on the training split and score it on the
# held-out split. random_state makes the randomized trees -- and therefore
# every metric printed below -- repeatable across runs.
# (`ExtraTreesClassifier` is imported in the notebook's top import cell.)
et = ExtraTreesClassifier(random_state=42)
et.fit(X_train,y_train)

y_pred = et.predict(X_test)

print('Accuracy score : {:.4f}'.format(accuracy_score(y_test,y_pred)))
print('F1 score : {:.4f}'.format(f1_score(y_test,y_pred, average = 'weighted')))

print('\n\n\nClassification Report: \n-----------------\n', classification_report(y_test,y_pred))
print('\n\n\nConfusion Matrix: \n-----------------\n')
# Fix the row/column order to the classifier's own class order and show the
# class names on the axes, so each cell of the heatmap can be read off directly.
cm = confusion_matrix(y_test, y_pred, labels=et.classes_)
sns.heatmap(
    cm, annot=True, cmap='Blues', cbar=False, fmt='g',
    xticklabels=et.classes_, yticklabels=et.classes_,
)
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## Preparing for submission

In [None]:
# Refit on ALL labelled rows (training + validation splits) for the submission.
# random_state makes the fitted forest, and so the submitted predictions,
# repeatable across runs.
# (`ExtraTreesClassifier` is imported in the notebook's top import cell.)
et = ExtraTreesClassifier(random_state=42)
et.fit(X,y)

In [None]:
# Set the `row_id` column aside for the submission file and remove it from the
# frame that is passed to the model.
# The guard keeps this cell idempotent: on a re-run `row_id` is already gone,
# so the previously saved `row_ids` is kept instead of raising KeyError.
if 'row_id' in test.columns:
    row_ids = test['row_id']
    test = test.drop(columns='row_id')

In [None]:
# Predict a class for every test row, then put the saved ids and the
# predictions side by side (matched on index by concat).
predicted_target = pd.Series(et.predict(test).squeeze(), name='target')
sub = pd.concat([row_ids, predicted_target], axis=1)

In [None]:
# Display the submission frame as the cell's output for a visual check.
sub

In [None]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
sub.to_csv('submission.csv', index = False)