# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset.

Add any relevant insight for future modelling.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
from pathlib import Path

# Notebook-wide configuration.
# Silence FutureWarning messages so they do not clutter cell outputs.
warnings.simplefilter("ignore", FutureWarning)

# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)

# Data

In [None]:
# Location of the CSV, relative to the notebook's working directory.
data_dir = Path("../data")
data_path = data_dir / "abalone.csv"

# Load the raw table and preview its first rows.
df = pd.read_csv(data_path)
df.head(5)

# EDA

In [None]:
# Data-quality checks: count exact duplicate rows and missing values per column.
n_duplicates = df.duplicated().sum()
print(f"The number of duplicates is {n_duplicates}.\n")

# Only claim "no missing values" when that is actually true for the loaded
# data; the per-column counts are printed in both cases.
missing_per_column = df.isna().sum()
n_missing = missing_per_column.sum()
if n_missing == 0:
    print(f"The dataset doesn't have missing values:\n {missing_per_column}")
else:
    print(f"The dataset has {n_missing} missing values:\n {missing_per_column}")

In [None]:
# Derive the age in years from the ring count (age = rings + 1.5).
df["Age"] = df["Rings"].add(1.5)

In [None]:
# Scatter of Age against Diameter, one point per row, coloured by Sex.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="Age",
    y="Diameter",
    hue="Sex",
    alpha=0.5,
    s=20,
    ax=ax,
)

# Axes-level title plus a figure-level super-title, then axis labels.
ax.set_title("Plot of data")
fig.suptitle("Comparison of abalones' age and diameter, categorized by sex")
ax.set_xlabel("Age (years)")
ax.set_ylabel("Diameter (mm)")

ax.legend(title="Sex", loc="upper right")

plt.show()


### Numerical Columns Analysis

In [None]:
# Feature columns to analyse: every column except the categorical Sex
# column and the two target-related columns (Rings, Age).
excluded_cols = ['Sex', 'Rings', 'Age']
numerical_cols = df.columns.drop(excluded_cols)

# Grid layout for the per-column plots: fixed width, as many rows as needed
# (ceiling division written as negated floor division).
columns_per_row = 2
num_rows = -(-len(numerical_cols) // columns_per_row)

In [None]:
# Grid of histograms, one panel per numerical column.
fig, axes = plt.subplots(num_rows, columns_per_row, figsize=(15, 15))

# Work with a flat sequence of axes so panels can be paired with columns.
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
    sns.histplot(df[col], ax=panel)
    panel.set_title(col)

# Drop the trailing panels that have no column assigned to them.
for unused_panel in axes[len(numerical_cols):]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.show()


In [None]:
# Grid of scatter plots: each numerical column against Age, coloured by Sex.
fig, axes = plt.subplots(num_rows, columns_per_row, figsize=(15, 15))

axes = axes.ravel()

n_features = len(numerical_cols)
for idx in range(n_features):
    feature = numerical_cols[idx]
    panel = axes[idx]
    sns.scatterplot(data=df, x=feature, y="Age", hue="Sex", ax=panel)
    panel.set_title(f'{feature} - Age')

# Remove the grid cells left over after the last feature.
for idx in range(n_features, num_rows * columns_per_row):
    fig.delaxes(axes[idx])

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of the numerical columns on a single axes.
fig, ax = plt.subplots(figsize=(10, 4))

numerical_frame = df.loc[:, numerical_cols]
numerical_frame.boxplot(ax=ax)
ax.set(title="Box Plot of Numerical Columns")

plt.show()

### Pearson correlation

In [None]:
# Pairwise Pearson correlation between the columns listed in numerical_cols.
df_num = df[numerical_cols]
corr = df_num.corr(method="pearson")

fig, ax = plt.subplots(figsize=(8, 6))
# annot/fmt print each coefficient inside its cell so the values can be read
# directly instead of being estimated from the colour bar.
sns.heatmap(corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Pearson correlation of numerical columns")

# Finish with plt.show() so the cell renders the figure without the Axes repr.
plt.tight_layout()
plt.show()

### Target Value Analysis

In [None]:

# Three stacked views of the Age distribution: histogram, density, box plot.
fig, (hist_ax, kde_ax, box_ax) = plt.subplots(3, 1, figsize=(10, 8))

# Histogram: the bin width is the rounded cube root of the number of distinct
# Age values.
# NOTE(review): unusual heuristic (cube-root rules normally size the number of
# bins from the sample size) - confirm it is intended.
distinct_ages = df["Age"].unique()
hist_binwidth = round(len(distinct_ages) ** (1 / 3))
sns.histplot(data=df, x="Age", binwidth=hist_binwidth, ax=hist_ax)
hist_ax.set_title("Histogram of abalones' ages")

# Density plot
sns.kdeplot(data=df, x="Age", fill=True, ax=kde_ax)
kde_ax.set_title("Density plot of abalones' ages")

# Box plot
sns.boxplot(data=df, x="Age", ax=box_ax)
box_ax.set_title("Box plot of abalones' ages")

plt.tight_layout()
plt.show()
