# Import Libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from scipy import stats
def _chi2_upper_tail(chisq, df):
    """Return the chi-square survival function (upper-tail probability) of ``chisq`` for ``df`` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Publish the helper as ``stats.chisqprob``.
# NOTE(review): presumably a compatibility shim for code that still looks up
# ``scipy.stats.chisqprob`` - confirm it is still needed.
stats.chisqprob = _chi2_upper_tail

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Notebook display settings: no limit on displayed rows, at most 100 columns.
display_options = {
    'display.max_rows': None,
    'display.max_columns': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the competition CSVs from the Kaggle input directory.
train_csv = '../input/tabular-playground-series-may-2021/train.csv'
test_csv = '../input/tabular-playground-series-may-2021/test.csv'
data_train, data_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Keep the test ids now: the 'id' column is dropped from data_test later,
# but the ids are needed again when the submission frame is assembled.
ID = data_test['id']

# Preview the first training rows.
data_train.head()

In [None]:
# Number of missing values in each training column.
data_train.isna().sum()

# Explore data

In [None]:
# Summary statistics of the training frame, shown as the cell output.
data_train.describe()

The data has 52 columns:
* 1 column for the ID
* 1 column for the target
* 50 feature columns (the PCA plot below has one point per feature, 50 in total)

In [None]:
# Drop the 'id' column from both frames; every remaining non-target column
# is later used as a model feature.
# errors='ignore' keeps this cell re-runnable: because it reassigns
# data_train / data_test, a second run would otherwise raise KeyError
# ('id' has already been removed by the first run).
data_train = data_train.drop(columns=['id'], errors='ignore')
data_test = data_test.drop(columns=['id'], errors='ignore')
data_train.head()

In [None]:
# Separate the label column from the feature columns.
y = data_train['target']
X = data_train.drop(columns=['target'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features.
# The scaler is fitted on the training features ONLY; the test features are
# transformed with those same training statistics. Refitting on the test set
# (fit_transform) would put train and test on different scales, and every
# later step (PCA, the classifier) is fitted on the training scale.
sc = StandardScaler()
X = sc.fit_transform(X)
data_test = sc.transform(data_test)

**Using PCA**

Because there are many features, I will plot the cumulative explained variance against the number of components
and keep enough components to explain 95% of the variance.

In [None]:
from sklearn.decomposition import PCA  #Import PCA

# Fit PCA with all components to see how the explained variance accumulates.
pca = PCA().fit(X)

plt.rcParams["figure.figsize"] = (12,6)

# Cumulative share of variance explained by the first k components (0..1).
y1 = np.cumsum(pca.explained_variance_ratio_)
# One x position per fitted component, 1-based. Derived from the fit instead
# of a hard-coded count, so the plot still works if the feature count changes.
xi = np.arange(1, len(y1) + 1)

fig, ax = plt.subplots()
ax.set_ylim(0.0, 1.1)
ax.plot(xi, y1, marker='o', linestyle='--', color='b')

ax.set_xlabel('Number of Components')
# Integer ticks from 0 to a little past the last component.
ax.set_xticks(np.arange(0, len(y1) + 10, step=1))
# The plotted values are fractions in [0, 1], not percentages.
ax.set_ylabel('Cumulative explained variance (fraction)')
ax.set_title('The number of components needed to explain variance')

# Reference line at the 95% explained-variance target.
ax.axhline(y=0.95, color='r', linestyle='-')
ax.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

From the graph, 47 components are needed to reach the 95% threshold, so we use n_components = 47.

In [None]:
# Number of principal components to keep.
N_COMPONENTS = 47

# Fit the projection on the training features and apply the same fitted
# projection to the test features.
pca = PCA(n_components=N_COMPONENTS)
X = pca.fit_transform(X)
data_test = pca.transform(data_test)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Import Model

Before training the model, I will find the best parameters for it using GridSearchCV.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Candidate settings: tree depth 1..9 combined with both split criteria.
param_grid = {'max_depth': np.arange(1, 10),
 'criterion':['entropy','gini']}

# random_state is fixed so that repeated runs select the same tree (ties
# between equally good splits are otherwise broken randomly); the seed
# matches the one used for the train/test split.
# NOTE(review): the search uses GridSearchCV's default scoring; since the
# notebook later submits class probabilities, scoring='neg_log_loss' may be
# the better criterion - confirm against the competition metric.
model = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid)
model.fit(X_train, y_train)

# Show the winning estimator as the cell output.
model.best_estimator_

In [None]:
# Take the winner of the grid search directly instead of re-typing its
# hyper-parameters by hand, which silently goes stale whenever the search
# result changes. With GridSearchCV's default refit=True, best_estimator_ is
# already refitted on the full (X_train, y_train) with the best parameters.
best_model = model.best_estimator_

# Sanity check on the held-out split, which was otherwise never used.
holdout_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print(f'Hold-out accuracy: {holdout_accuracy:.4f}')

# Per-class probabilities for the competition test set
# (one column per class, ordered like best_model.classes_).
y_pred = best_model.predict_proba(data_test)

In [None]:
# One probability column per class. predict_proba orders its columns like
# best_model.classes_, so the labels come from the fitted model rather than
# a hand-written list that could be mis-ordered or mis-spelled.
# NOTE(review): the header now uses the label spelling found in the training
# target (it was hard-coded as 'class_1'..'class_4') - confirm it matches the
# competition's sample submission.
predictions = pd.DataFrame(y_pred, columns=best_model.classes_)

# concat(axis=1) aligns on the index; both objects carry the default
# positional index, so equal length means one prediction row per test id.
assert len(predictions) == len(ID), 'prediction rows do not match test ids'
submission = pd.concat([ID,predictions], axis=1)

In [None]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)