In [None]:
## Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
## Constants
# Directory that holds the input CSV files. The trailing slash matters because
# file names are appended to it by plain string concatenation.
# NOTE(review): this name shadows Python's built-in `dir()`; a rename
# (e.g. DATA_DIR) would have to be applied to every cell that reads it.
dir = '../input/tabular-playground-series-may-2021/'

In [None]:
## Read data
# Load the three CSV files that live under `dir`, in the order listed.
train, test, sample_submission = (
    pd.read_csv(dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Report the (rows, columns) shape of the train and test frames, one per line.
print(f"Train: {train.shape}", f"Test: {test.shape}", sep="\n")

In [None]:
## Preview the first rows of the training data
train.head()

In [None]:
## Preview the first rows of the test data
test.head()

In [None]:
## Preview the first rows of the sample submission file
sample_submission.head()

In [None]:
## Utility functions
def plot_count(col, xlabel='X', ylabel='Y',title='Value Counts', data=None):
    """Draw a bar chart of how often each distinct value occurs in a column.

    The bars are added to the current matplotlib axes through the pyplot
    interface; this function neither creates nor shows a figure.

    Parameters
    ----------
    col : str
        Name of the column whose values are counted.
    xlabel, ylabel : str
        Axis labels.
    title : str
        Title placed on the axes.
    data : pandas.DataFrame, optional
        Frame to read ``col`` from. When omitted, the notebook-level
        ``train`` frame is used, which keeps existing calls working.

    Returns
    -------
    None
    """
    # Fall back to the global only when no frame is passed, so the helper can
    # also be pointed at any other frame (e.g. ``test``).
    frame = train if data is None else data
    counts = frame[col].value_counts()
    plt.bar(counts.index, counts.values)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

### Target

The target classes (Class 1-4) seem to be highly imbalanced.

In [None]:
## Bar chart of how many training rows fall into each target class
plot_count("target", "Target classes", "Count", "Distribution of Target Classes")

In [None]:
# One histogram per column of the training data, with the `id` and `target`
# columns left out.
fig_size = (20 ,20)
train.drop(['id', 'target'], axis=1).hist(figsize=fig_size)
# `DataFrame.hist` draws a grid of subplots, so `plt.title` would label only
# the current axes of that grid; `plt.suptitle` titles the whole figure.
plt.suptitle('Distribution of features in training data');

In [None]:
# One histogram per column of the test data, with the `id` column left out.
fig_size = (20 ,20)
test.drop('id', axis=1).hist(figsize=fig_size)
# `DataFrame.hist` draws a grid of subplots, so `plt.title` would label only
# the current axes of that grid; `plt.suptitle` titles the whole figure.
plt.suptitle('Distribution of features in test data');

In [None]:
## NaN check
# Produces a (train, test) pair of flags; a flag is True when that frame
# contains at least one missing value.
tuple(frame.isnull().any().any() for frame in (train, test))

### Let's plot the distribution of values in the features

In [None]:
# Kernel density estimate of feature_0 in the training data. The title makes
# the figure identifiable on its own; the trailing `;` hides the Axes repr.
train.feature_0.plot.kde(title='KDE of feature_0 (train)');

In [None]:
# Kernel density estimate of feature_0 in the test data. The title makes
# the figure identifiable on its own; the trailing `;` hides the Axes repr.
test.feature_0.plot.kde(title='KDE of feature_0 (test)');

In [None]:
# Kernel density estimate of feature_9 in the training data. The title makes
# the figure identifiable on its own; the trailing `;` hides the Axes repr.
train.feature_9.plot.kde(title='KDE of feature_9 (train)');

### Check for duplicate rows in the data

In [None]:
# Compare rows on the feature columns only: `id` and `target` are excluded
# from the comparison.
feature_only = train.drop(columns=['id', 'target'])
# With its default settings `duplicated()` flags every repeat of a row except
# the first occurrence.
repeat_mask = feature_only.duplicated()
duplicated_rows = train.loc[repeat_mask]
duplicated_row_id = duplicated_rows['id'].tolist()
print(duplicated_row_id)
duplicated_rows

### Duplicate columns in the dataset

In [None]:
# After transposing, every original column is a row, so the row-wise
# `duplicated()` check compares whole columns with each other.
duplicated_cols = train.transpose().duplicated()
# True if at least one column repeats an earlier one.
duplicated_cols.any()

### Let's check some similarly distributed columns

In [None]:
# Draw each feature on its own axes, side by side. Two back-to-back
# `plot_count` calls would put both bar charts on the same axes, with the
# second call overwriting the title and axis labels of the first.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, col in zip(axes, ('feature_11', 'feature_12')):
    plt.sca(ax)  # plot_count draws on the current pyplot axes
    plot_count(
        col,
        col,
        'count',
        f"Distribution of {col}"
    )
plt.tight_layout()

In [None]:
# Print the value counts of both features, one table after the other.
print(
    train.feature_11.value_counts(),
    train.feature_12.value_counts(),
    sep="\n",
)


### Observations:

* Distributions are very skewed for all the columns.

* The 0 values in the features might represent NaN or an unknown class that has been replaced by 0, or they could be outliers.

* All the feature values seem to be continuous.

* 4 rows have been duplicated but none of the columns are duplicated.

* A few columns have nearly identical distributions, yet they differ in some values.


## Baseline submission

In [None]:
# Share of training rows per target value (fractions, because normalize=True).
train.target.value_counts(normalize=True)

In [None]:
# Baseline: give every row of the submission the same class probabilities,
# namely each class's share of the training rows. The shares are computed
# here instead of being typed in by hand, so they cannot drift from the data
# (the computed values are not rounded).
# NOTE(review): assumes the labels in `train.target` are spelled exactly like
# the submission columns ('Class_1' ... 'Class_4'); a mismatch raises KeyError.
# Confirm against the value_counts output.
class_priors = train.target.value_counts(normalize=True)
for class_col in ('Class_1', 'Class_2', 'Class_3', 'Class_4'):
    sample_submission[class_col] = class_priors[class_col]

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sample_submission.to_csv('submission.csv', index=False)