# Data Exploration

In this notebook, we will explore the dataset, visualize data distributions, and understand relationships between features.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.ml_pipeline.data_loader import load_data

# Pull the dataset in through the project's loader, then preview its first rows
data = load_data()
data.head(n=5)

In [2]:
# Count nulls per column and display only the columns that contain any
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [3]:
# Visualize the distribution of numerical features:
# one histogram (30 bins, KDE overlay) per float64/int64 column.
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns

# Size the subplot grid from the number of features instead of fixing it at 3x3;
# a fixed 3x3 grid makes plt.subplot raise ValueError on the 10th feature.
# With 9 or fewer features this still produces the 3x3 grid and 15x10 figure.
n_cols = 3
n_rows = max(3, -(-len(numerical_features) // n_cols))  # ceiling division
plt.figure(figsize=(15, 10 * n_rows / 3))  # keep the same height per row
for i, feature in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(data[feature], bins=30, kde=True)
    plt.title(feature)
plt.tight_layout()
plt.show()

In [4]:
# Scatter plot of the 'feature1' column (x) against the 'feature2' column (y)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='feature1', y='feature2', ax=ax)
ax.set_title('Feature1 vs Feature2')
plt.show()

In [5]:
# Correlation heatmap of the pairwise correlations between columns.
plt.figure(figsize=(12, 8))
# Restrict to numeric columns explicitly: since pandas 2.0 `DataFrame.corr()`
# defaults to numeric_only=False and raises if any column is non-numeric
# (e.g. strings). NOTE(review): `numeric_only` requires pandas >= 1.5.
correlation_matrix = data.corr(numeric_only=True)
# annot/fmt print each coefficient in its cell with two decimals.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()