# Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the student performance dataset.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the project's src directory importable.
# The path is resolved to an absolute location so the entry stays valid if the
# working directory changes later in the session, and the membership check
# keeps re-runs of this cell from stacking duplicate entries on sys.path.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)
from data_loader import load_data
from feature_engineering import create_derived_features

# Plot defaults: seaborn 'whitegrid' style and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Read the dataset through the project loader, report its shape, and preview it
data_file = '../data/student_data.csv'
df = load_data(data_file)
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
# Structural overview of the frame, followed by the number of missing
# values found in each column
print("Dataset Info:")
df.info()

missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)


In [None]:
# Bar chart of the number of rows in each performance category
category_counts = df['performance_category'].value_counts()

plt.figure(figsize=(10, 6))
ax = category_counts.plot(kind='bar', color='steelblue')
ax.set_title('Distribution of Performance Categories', fontsize=16, fontweight='bold')
ax.set_xlabel('Performance Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Histograms of the numerical features, one panel per column
numerical_cols = [
    'age',
    'study_hours',
    'attendance',
    'previous_grade',
    'hours_sleep',
    'extracurricular_hours',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
    observed = df[col].dropna()  # missing values are excluded before binning
    panel.hist(observed, bins=30, color='steelblue', edgecolor='black')
    panel.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations among the numerical features plus 'final_grade',
# drawn as an annotated heatmap centred on zero
heatmap_columns = [*numerical_cols, 'final_grade']
numerical_df = df[heatmap_columns]
correlation_matrix = numerical_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Correlation Heatmap of Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Performance by categorical features:
# one stacked bar chart per categorical column, showing the proportion of
# each performance category within every level of that column.
categorical_cols = ['gender', 'parent_education', 'has_internet', 'study_method', 'transport']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, col in enumerate(categorical_cols):
    # Rows: levels of `col`; columns: performance categories; values: within-level proportions
    performance_by_cat = df.groupby(col)['performance_category'].value_counts(normalize=True).unstack()
    performance_by_cat.plot(kind='bar', stacked=True, ax=axes[idx], colormap='viridis')
    axes[idx].set_title(f'Performance by {col}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Proportion')
    axes[idx].legend(title='Performance', bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[idx].tick_params(axis='x', rotation=45)

# The 2x3 grid has more panels than there are categorical columns; hide the
# leftover axes so no empty frame is drawn.
for unused_ax in axes[len(categorical_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Box plots of every numerical feature, grouped by performance category
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()

for position, col in enumerate(numerical_cols):
    panel = axes[position]
    df.boxplot(column=col, by='performance_category', ax=panel)
    panel.set_title(f'{col} by Performance Category', fontsize=12, fontweight='bold')
    panel.set_xlabel('Performance Category')
    panel.set_ylabel(col)
    panel.tick_params(axis='x', rotation=45)

# Blank out the default figure-level title
plt.suptitle('', fontsize=0)
plt.tight_layout()
plt.show()


In [None]:
# Build the derived features with the project helper, then draw a box plot
# of each one grouped by performance category
df_with_features = create_derived_features(df)

derived_features = [
    'study_efficiency',
    'academic_balance',
    'performance_momentum',
]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for panel, feature in zip(axes, derived_features):
    df_with_features.boxplot(column=feature, by='performance_category', ax=panel)
    panel.set_title(f'{feature} by Performance', fontsize=12, fontweight='bold')
    panel.set_xlabel('Performance Category')
    panel.set_ylabel(feature)
    panel.tick_params(axis='x', rotation=45)

# Blank out the default figure-level title
plt.suptitle('', fontsize=0)
plt.tight_layout()
plt.show()


In [None]:
# Descriptive statistics of the numerical features, printed separately for
# each distinct value of 'performance_category'
print("Summary Statistics by Performance Category:")
print("=" * 70)

category_labels = df['performance_category'].unique()
for category in category_labels:
    in_category = df['performance_category'] == category
    category_summary = df.loc[in_category, numerical_cols].describe()
    print(f"\n{category}:")
    print(category_summary)
