## Tabular Playground - June 2021

In [None]:
# Imports section
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

## 1. Read the files into dataframes

In [None]:
# All competition files live in one directory; build every path from it so the
# separators are consistent (the original mixed "//" inside paths and a leading
# "//", whose meaning POSIX leaves implementation-defined).
DATA_DIR = "/kaggle/input/tabular-playground-series-jun-2021"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

## 2. Check the dataframes to confirm the files were read correctly

In [None]:
# Preview the first rows of the training data to confirm it loaded as expected.
train_df.head()

In [None]:
# Preview the first rows of the test data to confirm it loaded as expected.
test_df.head()

In [None]:
# Preview the first rows of the sample submission to see the expected output layout.
sample_df.head()

## 3. Exploratory Analysis with notes on the train_df 

### Tasks to explore
* How many Rows and columns -- Rows - 200000, Col 77
* How many classes - 9 Classes
* Any Missing Values - No Missing Values
* Any Duplicated Rows ?? - No Duplicated Rows
* Any Duplicate Columns ?? - No Duplicated columns
* Remove the id column from the train_df - done
* Check corr plot to see if any features have strong correlation with each other -- No correlation observed
* Any irrelevant data ?? -- Outliers etc ??
* Three features - 19, 59, 60 seem to have high range or Max values
* Do we require to normalize the data ??
* Are there any low variance columns that can be eliminated ?
* Are there any highly correlating columns, which can be removed in the Model building -- NO correlation between features
* What kind of classification can be used here ??. -- XGBoost

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Class balance: one bar per distinct `target` value, bar height = number of rows.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train_df, x='target', ax=ax)
plt.show()

In [None]:
# Number of columns that contain at least one missing value (0 means none are missing).
train_df.isna().any().sum()

In [None]:
# Any duplicated rows?
# Compare rows without the `id` column: an identifier that is (presumably) unique
# per row would make every row distinct and hide real duplicates.
# errors='ignore' keeps the cell runnable whether or not `id` has already been dropped.
train_df.drop(columns=['id'], errors='ignore').duplicated().sum()

In [None]:
# Drop the row identifier so it is not treated as a feature.
# errors='ignore' makes the cell idempotent: re-running it after `id` is already
# gone no longer raises KeyError.
train_df = train_df.drop(columns=['id'], errors='ignore')
train_df.head()

In [None]:
## Correlation heat map to see if any features correlate with each other
# Restrict to numeric columns before calling corr(): recent pandas versions raise
# when a non-numeric column is present (presumably `target` holds string labels —
# confirm its dtype), while older versions silently skipped such columns.
fig = plt.figure(figsize = (20,20))
sns.heatmap(train_df.select_dtypes(include='number').corr())


In [None]:
# Pairwise correlation of the numeric columns only (a non-numeric column makes
# corr() raise on recent pandas versions).
corr = train_df.select_dtypes(include='number').corr()
# Colour-coded table with two decimals. Styler.set_precision has been removed from
# pandas; Styler.format with a format string is the replacement.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# Per-column summary statistics, one row per described column of train_df:
# describe() output transposed, without the `count` column, plus two derived
# columns (`range` = |max - min|, `Variance` = std squared). The former index
# (the column names) becomes a regular column called `index`.
stat_traindf = (
    pd.DataFrame(train_df.describe())
    .T
    .drop(columns=['count'])
    .assign(
        range=lambda stats: (stats['max'] - stats['min']).abs(),
        Variance=lambda stats: stats['std'] ** 2,
    )
    .reset_index()
)
stat_traindf

In [None]:
# Maximum value of each described column, drawn as one horizontal bar per column.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(data=stat_traindf, x='max', y='index', ax=ax)
plt.show()

In [None]:
# Range (|max - min|) of each described column, one horizontal bar per column.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(data=stat_traindf, x='range', y='index', ax=ax)
plt.show()

In [None]:
# Variance (std squared) of each described column, one horizontal bar per column.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(data=stat_traindf, x='Variance', y='index', ax=ax)
plt.show()

## 4. Preparing the train_df for Machine learning training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the label. `id` is normally dropped in an
# earlier cell; errors='ignore' removes it here too if that cell was skipped, so
# the identifier can never leak in as a feature.
X = train_df.drop(columns="target").drop(columns="id", errors="ignore").values

# Encode the class labels as integers 0..n_classes-1. LabelEncoder numbers the
# classes in sorted order, so code k corresponds to label_encoder.classes_[k].
# Current XGBoost releases require integer-coded labels and no longer encode them
# internally.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df["target"])

# Hold out 30% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=85)

# Competition test features as a plain array (identifier removed).
test_submit = test_df.drop(columns="id").values

## 5. XGBoost Classification

In [None]:
from xgboost import XGBClassifier
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost and
# rejected or ignored by later releases. Versions that still support it enable it
# by default, so behaviour there is unchanged.
# NOTE(review): recent XGBoost versions expect integer-coded class labels in y_train.
model = XGBClassifier(random_state=1, n_estimators=90, learning_rate=0.01)
model.fit(X_train, y_train)

In [None]:
# Predicted class label for every row of the hold-out split.
y_pred = model.predict(X_test)

## 6. Model Accuracy

In [None]:
from sklearn.metrics import accuracy_score
# Share of hold-out rows whose predicted class equals the true class, in percent.
accuracy_pct = 100 * accuracy_score(y_test, y_pred)
print(accuracy_pct)

In [None]:
# Look at the sample submission again as a reminder of the expected output layout.
sample_df.head()

## 7. Predicting the test data with the model

In [None]:
# Class probabilities for the competition test rows; these become the Class_*
# columns of the submission frame built below.
y_result= model.predict_proba(test_submit)
print(y_result)

## 8. Preparing the result file for submission

In [None]:
# One probability column per class. The names assume the model orders its classes
# as Class_1 ... Class_9 — TODO confirm against model.classes_.
class_columns = [f"Class_{i}" for i in range(1, 10)]
submission_result = pd.DataFrame(y_result, columns=class_columns)

# Take the ids from test_df, the frame the predictions were computed from, so each
# row is guaranteed to carry its own id (the sample file is only presumed to be in
# the same order). Insert `id` as the first column rather than appending it last.
submission_result.insert(0, 'id', test_df['id'].values)
submission_result.head()

In [None]:
# Write the submission to the current working directory, without the DataFrame index.
submission_result.to_csv("sample_submission.csv",index=False)