In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px

In [None]:
# Read the train and test CSVs of the competition into DataFrames,
# then display the training frame as the cell output.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
    )
)
train

In [None]:
# Distinct class labels present in the target column.
train['target'].unique()

In [None]:
# Number of training rows per class label.
train['target'].value_counts()

In [None]:
# `histnorm` controls the format of the y-axis; when it is not given, the
# histogram shows the raw count of occurrences of each x value.
fig = px.histogram(data_frame=train, x='target', histnorm='percent')
fig.show()

In [None]:
# Class distribution: each sector is the number of training rows in that class.
# Passing a feature column (e.g. 'feature_48') as `values` would instead size
# the sectors by the per-class SUM of that feature, which is not the share of
# samples the chart title promises.
class_counts = (
    train['target']
    .value_counts()
    .rename_axis('target')
    .reset_index(name='count')
)
fig = px.pie(class_counts, values='count', names='target',
             title='Class Distribution')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# In px.pie, `values` sets the size of each sector and `names` sets the sector labels.

Class_6 and Class_8 alone make up more than 50% of the samples, while Class_5 and Class_4 have very little representation.

In [None]:
# `hover_name` picks the column whose value is shown for a point when the
# cursor moves over it; here both the hover label and the colour are the class.
scatter_options = dict(x='feature_0', y='feature_7',
                       color='target', hover_name='target')
fig = px.scatter(train, **scatter_options)
fig.show()

# Correlation Plot

In [None]:
# Correlate the numeric columns only. At this point `target` still holds the
# string class labels (it is encoded further down), and recent pandas releases
# raise on non-numeric columns in DataFrame.corr() instead of silently
# dropping them. select_dtypes works the same on old and new pandas.
corr_mat = train.select_dtypes(include='number').corr()

# Boolean mask that hides the upper triangle (diagonal included): the matrix
# is symmetric, so the lower triangle already carries all the information.
mask = np.triu(np.ones(corr_mat.shape, dtype=bool))

plt.figure(figsize=(20,20))
sns.heatmap(corr_mat, annot=False, mask=mask,
           linewidths=.5)

In [None]:
# Pairwise scatter plots restricted to the first eight columns of the frame.
some_columns = train.columns[:8]
pairplot_frame = train[some_columns]
sns.pairplot(pairplot_frame, kind='scatter')

In [None]:
# The `id` column is not required for the analysis.
# Reassign instead of dropping in place, and tolerate an already-missing
# column, so that re-running this cell is a no-op rather than a KeyError.
train = train.drop(columns='id', errors='ignore')

In [None]:
# Summary statistics of the training frame, shown as the cell output.
summary_stats = train.describe()
summary_stats

In [None]:
# One point per described column: mean on x, max on y, marker size and hover
# label both taken from the standard deviation.
stats_by_column = train.describe().transpose()
px.scatter(stats_by_column, title='Describe Plot',
           x='mean', y='max', size='std', hover_name='std')

We see that mean and max are mostly positively correlated, except for a few features. Also, in general std dev is high for higher mean and max values.

# Unique Counts of each Feature

In [None]:
# Count the distinct values in every column except the last one
# (assumed to be `target` - the section is about features only).
feature_cols = train.columns[:-1]
feat_unique_counts = np.array(
    [train[col].nunique() for col in feature_cols],
    dtype=float,
)

# Stem plot: one stem per feature, height = number of unique values.
plt.figure(figsize=(20,8))
plt.grid()
plt.xticks(rotation=90)
plt.stem(feature_cols, feat_unique_counts)

It's hard to distinguish numerical features from categorical ones, as there are no binary/ternary features.

# Visualising Zero & Non-Zero proportion in Features

In [None]:
# Every column except the last one (assumed to be `target`) is a feature.
features = train.columns[:-1]

# Fraction of rows equal to zero, per feature. Comparing against 0 and taking
# the mean is vectorized and, unlike `value_counts()[0]`, does not raise a
# KeyError for a feature that happens to contain no zeros at all.
zero_counts = (train[features] == 0).mean().to_numpy()
nonzero_counts = 1 - zero_counts

# One row per feature with its zero / non-zero proportions.
valuecounts_df = pd.DataFrame(
    {'zeros': zero_counts, 'non-zeros': nonzero_counts},
    index=features,
)
valuecounts_df

In [None]:
labels = ['0','!0']
explode = [0.1,0]  # pull the "zero" wedge slightly out of each pie

# Size the grid from the number of features instead of hard-coding 15x5, so
# the cell keeps working if the feature count is not exactly 75.
n_cols = 5
n_rows = max(1, int(np.ceil(len(features) / n_cols)))

# squeeze=False keeps `ax` two-dimensional even for a single-row grid.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20,20), squeeze=False)
panels = ax.ravel()

# One pie per feature, drawn on its own Axes (explicit API rather than the
# pyplot "current axes" state). `.loc[feature]` reads the feature's row
# directly, avoiding a transpose of the whole frame on every iteration.
for panel, feature in zip(panels, features):
    panel.pie(valuecounts_df.loc[feature], labels=labels, explode=explode)
    panel.set_xlabel(feature, fontsize=9)

# Hide any panels left over in the last row of the grid.
for panel in panels[len(features):]:
    panel.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Per-feature difference between the train-set mean and the test-set mean
# (train minus test), in the same order as `features`.
mean_diff = np.array(
    [train[col].mean() - test[col].mean() for col in features]
)

px.bar(mean_diff,
       title='Mean difference between train & test sets',
       hover_name=mean_diff)

The difference in means is quite small (<0.06 for almost all features). However, the train mean is greater than the test mean for almost all features.

In [None]:
# Histogram with the class on the x-axis and feature_12 supplied as the y values.
px.histogram(data_frame=train, x='target', y='feature_12')

# Encoding the Classes

In [None]:
# Map each class label to a numeric code: 'Class_1' -> 0.0, ..., 'Class_9' -> 8.0.
enc = {f'Class_{k}': float(k - 1) for k in range(1, 10)}

# Assign the encoded column back to the frame. The chained in-place form
# (`train.target.replace(..., inplace=True)`) operates on an intermediate
# Series and does not update `train` under pandas Copy-on-Write.
# Values that are not keys of `enc` pass through unchanged, so re-running the
# cell on an already-encoded column is a no-op; `astype(float)` guarantees a
# numeric dtype and fails loudly if an unexpected string label is present.
train['target'] = (
    train['target']
    .map(lambda label: enc.get(label, label))
    .astype(float)
)
train.target

In [None]:
# Feature matrix and label vector as plain NumPy arrays.
X = train.drop(columns='target').to_numpy()
y = train.target.to_numpy()

# `test` is rebound from a DataFrame to an ndarray here. Guard the conversion
# so that re-running this cell does not fail with
# "'numpy.ndarray' object has no attribute 'drop'".
if isinstance(test, pd.DataFrame):
    test = test.drop(columns='id').to_numpy()

X.shape,test.shape