In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's "darkgrid" style for the plots in this notebook.
sns.set_style("darkgrid")

In [None]:
# Read the CSV at DATA_PATH into the DataFrame used by every later cell.
DATA_PATH = "../input/tabular-playground-series-jun-2022/data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Print the (rows, columns) tuple, then pandas' per-column summary.
frame_shape = df.shape
print(frame_shape)
df.info()

**Observations:**
- The float columns contain missing values; the integer columns do not.

In [None]:
# Column groups used by the plots below.
#
# feature_cols: every column whose name contains "F".
# float_cols:   every column (feature or not) with a floating-point dtype.
# int_cols:     the feature columns with an integer dtype.
#
# The dtype checks go through pandas' dtype predicates rather than
# `dtype == float` / `dtype == int`: comparing against the Python builtins
# only matches one exact width (and the width `int` maps to is platform
# dependent), so int32 / float32 / nullable-integer columns would be dropped.
feature_cols = [col for col in df.columns if "F" in col]
float_cols = [col for col in df.columns if pd.api.types.is_float_dtype(df[col])]
int_cols = [col for col in feature_cols if pd.api.types.is_integer_dtype(df[col])]

In [None]:
# Report how many columns landed in each dtype group.
for group, label in ((float_cols, "float features"), (int_cols, "integer features")):
    print("There are: ", len(group), label)

In [None]:
# One histogram per float column, three panels per row.
#
# The grid is created in a single plt.subplots() call and sized from the
# number of columns. Calling plt.subplots() with no grid and then adding
# panels with plt.subplot() leaves an extra full-figure Axes behind the panels
# on Matplotlib releases that no longer auto-remove overlapping Axes, and a
# hard-coded row count breaks as soon as there are more columns than slots.
hist_grid_cols = 3
hist_grid_rows = max(1, -(-len(float_cols) // hist_grid_cols))  # ceiling division
fig, axes = plt.subplots(hist_grid_rows, hist_grid_cols, figsize=(25, 125), squeeze=False)
hist_axes = axes.ravel()

for ax, column in zip(hist_axes, float_cols):
    sns.histplot(data=df, x=column, ax=ax)
    ax.set_title(column)

# Blank out the trailing panels that have no column to show.
for ax in hist_axes[len(float_cols):]:
    ax.set_visible(False)

In [None]:
# One bar chart of value counts per integer feature, three panels per row.
#
# The grid is created in a single plt.subplots() call and sized from the
# number of columns, so no stray full-figure Axes is left behind the panels
# and the layout does not depend on a hard-coded row count.
bar_grid_cols = 3
bar_grid_rows = max(1, -(-len(int_cols) // bar_grid_cols))  # ceiling division
fig, axes = plt.subplots(bar_grid_rows, bar_grid_cols, figsize=(25, 70), squeeze=False)
bar_axes = axes.ravel()

for ax, column in zip(bar_axes, int_cols):
    val_count = df[column].value_counts()
    ax.bar(val_count.index, val_count.values)
    # Put a tick on every distinct value so each bar is labelled.
    ax.set_xticks(val_count.index)
    ax.set_title(column)

# Blank out the trailing panels that have no column to show.
for ax in bar_axes[len(int_cols):]:
    ax.set_visible(False)

**Correlations between features:**

In [None]:
# Annotated correlation heatmap of the float columns, colour scale fixed to [-1, 1].
float_corr = df[float_cols].corr()
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    float_corr,
    annot=True,
    fmt="0.1f",
    cmap="RdYlGn",
    vmin=-1,
    vmax=1,
    cbar=False,
    ax=ax,
);

In [None]:
# Annotated correlation heatmap of the integer features, colour scale fixed to [-1, 1].
int_corr = df[int_cols].corr()
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    int_corr,
    annot=True,
    fmt="0.2f",
    cmap="RdYlGn",
    vmin=-1,
    vmax=1,
    cbar=False,
    ax=ax,
);

# Missing Data

In [None]:
# Count NaNs per column and reshape into a two-column frame:
# "column" (the column name) and "missing_values" (its NaN count).
missing_per_column = df.isna().sum()
missing_data_df = (
    missing_per_column
    .rename("missing_values")
    .reset_index()
    .rename(columns={"index": "column"})
)

In [None]:
# Horizontal bar per column showing how many values are missing.
f, ax = plt.subplots(figsize=(25, 30))
sns.barplot(
    data=missing_data_df,
    x="missing_values",
    y="column",
    orient="h",
    color="blue",
    ax=ax,
);

Number of missing values per row:

In [None]:
# Count NaNs per row, then tally how many rows share each count.
# NOTE: this rebinds `missing_data_df`, replacing the per-column frame built earlier.
missing_per_row = df.isna().sum(axis=1)
missing_data_df = (
    missing_per_row
    .rename("missing_values")
    .reset_index()
    .rename(columns={"index": "row"})
)
val_count = missing_data_df["missing_values"].value_counts()

In [None]:
# Bar chart: x = number of missing values in a row, y = how many rows have that many.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(x=val_count.index, y=val_count.values, color="blue", ax=ax)
ax.set(xlabel="Missing values in row", ylabel="Count");

Looking at nullity correlation: does the missingness of one column affect the missingness of another?

In [None]:
# Nullity-correlation heatmap of the frame.
# `msno` is imported with the other dependencies in the notebook's first cell,
# so all imports live in one place and fail early on a fresh kernel.
msno.heatmap(df);

**Observations:**

- No nullity correlation: the missingness of each feature is independent of the others.

## Work In Progress