In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# Tabular Playground Series - Apr 2022

# Exploratory Data Analysis


<h2>Train dataset</h2>

Let's first explore the train dataset and then the test dataset.

In [None]:
# Paths of the train feature file and the train label file.
train_path = '/kaggle/input/tabular-playground-series-apr-2022/train.csv'
labels_path = '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv'

# Read both CSV files into DataFrames.
train_df = pd.read_csv(train_path)
labels_df = pd.read_csv(labels_path)

In [None]:
# (number of rows, number of columns) of the train frame.
train_df.shape

In [None]:
# Preview the first rows (pandas default: 5) of the train frame.
train_df.head()

In [None]:
# Preview the last rows (pandas default: 5) of the train frame.
train_df.tail()

<h2>1. Structure Investigation</h2>

In [None]:
# Count how many columns share each dtype.
# Uses the Series method: the top-level `pd.value_counts` is deprecated
# (pandas >= 2.1 emits a FutureWarning) and the method form is equivalent.
train_df.dtypes.value_counts()

<h2>1.1. Structure of numerical features</h2>

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row and each statistic is a column.
train_numeric_summary = train_df.describe(include='number')
train_numeric_summary.transpose()

In [None]:
# Number of distinct values in each numeric column, sorted ascending.
train_numeric_cols = train_df.select_dtypes(include='number')
unique_values = train_numeric_cols.nunique().sort_values()

# Bar chart with a logarithmic y-axis.
unique_values.plot(
    kind='bar',
    logy=True,
    figsize=(15, 8),
    title='Unique Values per feature',
)

# 1.3. Conclusion of structure investigation

The dataset being explored has 16 features and a large number of samples (roughly 1 million), with no missing values. Three features are of type int64 and 13 are of type float64.

<h2>2. Quality Investigation</h2>

<h3>2.1. Duplicates</h3>

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flagged rows.
train_duplicate_mask = train_df.duplicated()
n_duplicates = train_duplicate_mask.sum()
print(f"There are {n_duplicates} duplicates samples.")

<h3>2.2. Missing values</h3>

In [None]:
# Per-column count of missing entries (`isna` is the canonical name of `isnull`).
train_null_mask = train_df.isna()
missing = train_null_mask.sum()
print(f"Missing values per feature: {missing}")

<h2>3. Content Investigation</h2>

<h3>3.1. Feature distribution</h3>

In [None]:
# One 25-bin histogram per column, laid out in a grid five plots wide
# (the -1 lets pandas infer the number of rows).
train_df.hist(
    bins=25,
    edgecolor='black',
    layout=(-1, 5),
    figsize=(20, 10),
)

<h3>3.2. Feature patterns</h3>

In [None]:
# Plot every column against the row index in its own subplot, using tiny
# markers and no connecting line (lw=0), in a grid two plots wide.
train_df.plot(
    subplots=True,
    layout=(-1, 2),
    figsize=(15, 6),
    lw=0,
    marker=".",
    markersize=0.1,
)

<h3>3.3. Feature Relationship</h3>

In [None]:
# Pairwise Pearson correlation between all columns.
df_corr = train_df.corr(method="pearson")

# Annotation letters by absolute correlation:
#   "S" above 0.75, "M" above 0.5, "W" above 0.25, otherwise blank.
# np.select picks the first matching condition, so the order matters.
abs_corr = np.abs(df_corr).to_numpy()
labels = np.select(
    [abs_corr > 0.75, abs_corr > 0.5, abs_corr > 0.25],
    ["S", "M", "W"],
    default="",
)

# Draw the heatmap with the diagonal (self-correlation) masked out.
plt.figure(figsize=(15, 15))

diagonal_mask = np.eye(len(df_corr))
sns.heatmap(
    df_corr,
    mask=diagonal_mask,
    annot=labels,
    fmt='',
    cmap="vlag",
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

# Labels Data

In [None]:
# Preview the first rows (pandas default: 5) of the labels frame.
labels_df.head()

In [None]:
# Preview the last rows (pandas default: 5) of the labels frame.
labels_df.tail()

In [None]:
# (number of rows, number of columns) of the labels frame.
labels_df.shape

In [None]:
# Print a concise summary of the labels frame: column dtypes, non-null counts
# and memory usage.
labels_df.info()

In [None]:
# Number of distinct values in each column of the labels frame.
labels_df.nunique()

In [None]:
# Bar chart of how many rows carry each value of the `state` column.
# The column is passed by keyword: seaborn 0.11 warns when it is given
# positionally, and from 0.12 onward the first positional argument is `data`,
# so the positional form no longer means "count by x".
sns.countplot(x="state", data=labels_df)

# Test Dataset

In [None]:
# Path of the test feature file; read it into a DataFrame.
test_path = '/kaggle/input/tabular-playground-series-apr-2022/test.csv'
test_df = pd.read_csv(test_path)

In [None]:
# (number of rows, number of columns) of the test frame.
test_df.shape

<h2>1. Structure Investigation</h2>

In [None]:
# Count how many columns share each dtype.
# Uses the Series method: the top-level `pd.value_counts` is deprecated
# (pandas >= 2.1 emits a FutureWarning) and the method form is equivalent.
test_df.dtypes.value_counts()

<h2>1.1. Structure of numerical features</h2>

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row and each statistic is a column.
test_numeric_summary = test_df.describe(include='number')
test_numeric_summary.transpose()

In [None]:
# Number of distinct values in each numeric column, sorted ascending.
test_numeric_cols = test_df.select_dtypes(include='number')
unique_values = test_numeric_cols.nunique().sort_values()

# Bar chart with a logarithmic y-axis.
unique_values.plot(
    kind='bar',
    logy=True,
    figsize=(15, 8),
    title='Unique Values per feature',
)

# 1.3. Conclusion of structure investigation

The dataset being explored has 16 features and a large number of samples (roughly 1 million), with no missing values. Three features are of type int64 and 13 are of type float64.

<h2>2. Quality Investigation</h2>

<h3>2.1. Duplicates</h3>

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flagged rows.
test_duplicate_mask = test_df.duplicated()
n_duplicates = test_duplicate_mask.sum()
print(f"There are {n_duplicates} duplicates samples.")

<h3>2.2. Missing values</h3>

In [None]:
# Per-column count of missing entries (`isna` is the canonical name of `isnull`).
test_null_mask = test_df.isna()
missing = test_null_mask.sum()
print(f"Missing values per feature: {missing}")

<h2>3. Content Investigation</h2>

<h3>3.1. Feature distribution</h3>

In [None]:
# One 25-bin histogram per column, laid out in a grid five plots wide
# (the -1 lets pandas infer the number of rows).
test_df.hist(
    bins=25,
    edgecolor='black',
    layout=(-1, 5),
    figsize=(20, 10),
)

<h3>3.2. Feature patterns</h3>

In [None]:
# Plot every column against the row index in its own subplot, using tiny
# markers and no connecting line (lw=0), in a grid two plots wide.
test_df.plot(
    subplots=True,
    layout=(-1, 2),
    figsize=(15, 6),
    lw=0,
    marker=".",
    markersize=0.1,
)

<h3>3.3. Feature Relationship</h3>

In [None]:
# Pairwise Pearson correlation between all columns.
df_corr = test_df.corr(method="pearson")

# Annotation letters by absolute correlation:
#   "S" above 0.75, "M" above 0.5, "W" above 0.25, otherwise blank.
# np.select picks the first matching condition, so the order matters.
abs_corr = np.abs(df_corr).to_numpy()
labels = np.select(
    [abs_corr > 0.75, abs_corr > 0.5, abs_corr > 0.25],
    ["S", "M", "W"],
    default="",
)

# Draw the heatmap with the diagonal (self-correlation) masked out.
plt.figure(figsize=(15, 15))

diagonal_mask = np.eye(len(df_corr))
sns.heatmap(
    df_corr,
    mask=diagonal_mask,
    annot=labels,
    fmt='',
    cmap="vlag",
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

<h3>Conclusion</h3>

Some features contain many outliers, and some features are highly correlated with each other. Addressing both issues during preprocessing may improve the results of a predictive model.