In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

import os
# Walk the '/kaggle/input' tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

<h1>Tabular Playground Series - May 2022</h1>

The May edition of the 2022 Tabular Playground Series is a binary classification problem that includes a number of different feature interactions. This competition is an opportunity to explore various methods for identifying and exploiting these feature interactions.

For this challenge, you are given (simulated) manufacturing control data and are tasked to predict whether the machine is in state 0 or state 1. The data has various feature interactions that may be important in determining the machine state.

<h1>1 Exploratory data analysis</h1>

<h2>1.1 Train Set</h2>

In [None]:
# Load the training split of the competition data into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/tabular-playground-series-may-2022/train.csv',
)

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
train_df.shape

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the last five rows of the training frame.
train_df.tail(n=5)

In [None]:
# Remove the `id` column before analysing the data.
# Reassigning (rather than dropping in place) and tolerating an already-removed
# column keeps this cell idempotent: re-running it no longer raises a KeyError.
train_df = train_df.drop(columns='id', errors='ignore')

In [None]:
# Number of columns per dtype.
# The top-level `pd.value_counts` helper is deprecated; the Series method is
# the supported equivalent and returns the same counts.
train_df.dtypes.value_counts()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
train_df.describe(include='number').transpose()

In [None]:
# Summary statistics of the non-numeric columns, transposed so each column is a row.
train_df.describe(exclude='number').transpose()

In [None]:
# Distinct-value count of every numeric column, sorted in ascending order.
unique_values = (
    train_df
    .select_dtypes(include='number')
    .nunique()
    .sort_values()
)

# Bar chart of the counts on a log-scaled y-axis.
unique_values.plot.bar(
    title='Unique Values per feature',
    figsize=(15, 8),
    logy=True,
)

In [None]:
# Distinct-value count of every non-numeric column, sorted in ascending order.
unique_values = (
    train_df
    .select_dtypes(exclude='number')
    .nunique()
    .sort_values()
)

# Bar chart of the counts on a log-scaled y-axis.
unique_values.plot.bar(
    title='Unique Values per feature',
    figsize=(15, 8),
    logy=True,
)

In [None]:
# Count rows that repeat an earlier row across all columns
# (the first occurrence is not counted as a duplicate).
n_duplicates = train_df.duplicated(keep='first').sum()
print(f"There are {n_duplicates} duplicates samples.")

In [None]:
# Same duplicate check, but comparing rows with the `target` column left out.
n_duplicates = (
    train_df
    .drop(columns='target')
    .duplicated(keep='first')
    .sum()
)
print(f"There are {n_duplicates} duplicates samples.")

In [None]:
# Number of null entries in each column of the training frame.
missing = train_df.isna().sum()
print(f"Missing values per feature: {missing}")

In [None]:
# Histogram grid (five plots per row) of every column except `target`.
(
    train_df
    .drop(columns='target')
    .hist(
        bins=25,
        figsize=(20, 10),
        layout=(-1, 5),
        edgecolor='black',
    )
)

In [None]:


# Pearson correlation between the numeric columns only.
# The frame also holds a non-numeric (object) column; pandas >= 2.0 raises on it
# instead of silently skipping it, so the input is restricted explicitly.
df_corr = train_df.select_dtypes(include='number').corr(method="pearson")

# Create labels for the correlation matrix from |r|:
# "S" above 0.75, "M" above 0.5, "W" above 0.25, empty otherwise.
labels = np.where(np.abs(df_corr)>0.75, "S",
                  np.where(np.abs(df_corr)>0.5, "M",
                           np.where(np.abs(df_corr)>0.25, "W", "")))

# Plot correlation matrix
plt.figure(figsize=(15, 15))

# The identity mask hides the diagonal (each column's correlation with itself).
ax = sns.heatmap(df_corr, mask=np.eye(len(df_corr)),
                 square=True,
                 center=0,
                 annot=labels,
                 fmt='',
                 linewidths=0.5,
                 cmap="vlag",
                 cbar_kws={"shrink": 0.8}
                )
ax.set_title("Train set - Pearson correlation")
plt.show()


In [None]:
# Bar chart of the number of samples per target class.
# The Series must be passed as `x=`: seaborn >= 0.12 interprets the first
# positional argument as `data`, which does not count the values of the column.
sns.countplot(x=train_df['target'])

In [None]:
# Absolute number of samples in each target class.
target_counts = train_df['target'].value_counts()
target_counts

In [None]:
# Share of each target class as a percentage of all rows in the frame.
train_df['target'].value_counts().div(len(train_df)).mul(100)

<h3>1.1.1 Train Set EDA Conclusion</h3>

The train set contains 900000 samples and 33 columns (including `id` and `target`). After dropping `id`, 16 columns are of type float64, 15 of type int64 and 1 of type object. The only feature of type object holds character sequences that encode some information that might be useful when properly encoded. There are 741354 unique values for this variable.  

There are neither duplicated rows nor missing values. The features f00 through f06 and f19-f28 have a normal distribution. The other features are right skewed. None of the features are highly correlated with each other.

The target variable is well balanced with 51.35% of zeroes and 48.65% of ones.

In [None]:
# Drop the reference to the training frame and run a garbage-collection pass
# so its memory can be reclaimed before the test set is loaded.
# Note: after this cell, `train_df` no longer exists in the kernel namespace.
del train_df
gc.collect()

<h2>1.2 Test Set</h2>

In [None]:
# Load the test split of the competition data into a DataFrame.
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/tabular-playground-series-may-2022/test.csv',
)

In [None]:
# Dimensions of the test frame as a (rows, columns) tuple.
test_df.shape

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Preview the last five rows of the test frame.
test_df.tail(n=5)

In [None]:
# Remove the `id` column before analysing the data.
# Reassigning (rather than dropping in place) and tolerating an already-removed
# column keeps this cell idempotent: re-running it no longer raises a KeyError.
test_df = test_df.drop(columns='id', errors='ignore')

In [None]:
# Number of columns per dtype.
# The top-level `pd.value_counts` helper is deprecated; the Series method is
# the supported equivalent and returns the same counts.
test_df.dtypes.value_counts()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
test_df.describe(include='number').transpose()

In [None]:
# Summary statistics of the non-numeric columns, transposed so each column is a row.
test_df.describe(exclude='number').transpose()

In [None]:
# Distinct-value count of every numeric column, sorted in ascending order.
unique_values = (
    test_df
    .select_dtypes(include='number')
    .nunique()
    .sort_values()
)

# Bar chart of the counts on a log-scaled y-axis.
unique_values.plot.bar(
    title='Unique Values per feature',
    figsize=(15, 8),
    logy=True,
)

In [None]:
# Distinct-value count of every non-numeric column, sorted in ascending order.
unique_values = (
    test_df
    .select_dtypes(exclude='number')
    .nunique()
    .sort_values()
)

# Bar chart of the counts on a log-scaled y-axis.
unique_values.plot.bar(
    title='Unique Values per feature',
    figsize=(15, 8),
    logy=True,
)

In [None]:
# Count rows that repeat an earlier row across all columns
# (the first occurrence is not counted as a duplicate).
n_duplicates = test_df.duplicated(keep='first').sum()
print(f"There are {n_duplicates} duplicates samples.")

In [None]:
# Number of null entries in each column of the test frame.
missing = test_df.isna().sum()
print(f"Missing values per feature: {missing}")

In [None]:
# Histogram grid (five plots per row) of the test frame's columns.
test_df.hist(
    bins=25,
    figsize=(20, 10),
    layout=(-1, 5),
    edgecolor='black',
)

In [None]:
# Pearson correlation between the numeric columns only.
# The frame also holds a non-numeric (object) column; pandas >= 2.0 raises on it
# instead of silently skipping it, so the input is restricted explicitly.
df_corr = test_df.select_dtypes(include='number').corr(method="pearson")

# Label each cell from |r|: "S" above 0.75, "M" above 0.5, "W" above 0.25,
# empty otherwise.
labels = np.where(np.abs(df_corr)>0.75, "S",
                  np.where(np.abs(df_corr)>0.5, "M",
                           np.where(np.abs(df_corr)>0.25, "W", "")))

plt.figure(figsize=(15, 15))

# The identity mask hides the diagonal (each column's correlation with itself).
ax = sns.heatmap(df_corr, mask=np.eye(len(df_corr)),
                 square=True,
                 center=0,
                 annot=labels,
                 fmt='',
                 linewidths=0.5,
                 cmap="vlag",
                 cbar_kws={"shrink": 0.8}
                )
ax.set_title("Test set - Pearson correlation")
plt.show()

In [None]:
# Drop the reference to the test frame and run a garbage-collection pass so
# its memory can be reclaimed.
# Note: after this cell, `test_df` no longer exists in the kernel namespace.
del test_df
gc.collect()

<h3>1.2.1 Test Set EDA Conclusion</h3>

The same conclusions as for the train set apply: both sets have similar characteristics.