In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import sys
import os
sys.path.append(os.path.abspath("..")) 
from utils.variables import bin_vars, ord_vars, cont_vars, cat_vars, numeric_vars, nom_vars

In [None]:
# Read the preprocessed student data; `df` is the frame every later cell plots from
df = pd.read_csv('../data/student_preprocessed.csv')

# Column names, dtypes and non-null counts
df.info()

# Lift the column cap so wide previews are not truncated, then show
# the first rows followed by the summary statistics
pd.set_option('display.max_columns', None)
display(df.head(), df.describe())

In [None]:
# 1. Count Plots for Categorical Variables 

# Helper that builds the panel title for one categorical column
def make_title(var):
    """Return the count-plot title for the column named `var`.

    The column name is prettified (underscores become spaces, words are
    title-cased). Names found in `bin_vars` get a "(0=No, 1=Yes)" suffix,
    names found in `ord_vars` get an "(Ordinal Scale)" suffix, and any
    other name gets no suffix. `bin_vars` is checked first.
    """
    label = var.replace('_', ' ').title()
    if var in bin_vars:
        return f"Count of Students by {label} (0=No, 1=Yes)"
    if var in ord_vars:
        return f"Count of Students by {label} (Ordinal Scale)"
    return f"Count of Students by {label}"
    
# Pair every categorical column with the title of its panel
cat_var_tuples = [(name, make_title(name)) for name in cat_vars]

# Three panels per row, with as many rows as needed to fit all variables
n_cols = 3
n_vars = len(cat_var_tuples)
n_rows = int(np.ceil(n_vars / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
axes = axes.flatten()

# Draw one count plot per variable on its own axes
for ax, (col, title) in zip(axes, cat_var_tuples):
    sns.countplot(x=col, data=df, hue=col, palette="pastel", ax=ax)
    ax.set_title(title)
    # hue repeats the x variable, so drop the legend whenever one was drawn
    if ax.get_legend() is not None:
        ax.get_legend().remove()

# Delete the trailing grid cells that received no plot
for spare in axes[n_vars:]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()

In [None]:
# 2. Distributions & Skewness of Continuous Variables

# Set style (sns.set_theme changes the global theme, so later figures use it too)
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)

# One row per continuous variable: histogram on the left, boxplot on the right.
# squeeze=False keeps `axes` two-dimensional even when cont_vars holds a single
# variable; without it matplotlib returns a 1-D array and row/column indexing fails.
fig, axes = plt.subplots(len(cont_vars), 2, figsize=(12, 18), squeeze=False)

for (hist_ax, box_ax), var in zip(axes, cont_vars):
    # No KDE for age due to its discrete nature
    kde_flag = var != 'age'

    # Histogram (with KDE where enabled), titled with the sample skewness
    sns.histplot(df[var], kde=kde_flag, ax=hist_ax, color="skyblue")
    skewness = df[var].skew()
    hist_ax.set_title(f"Distribution of {var} (Skewness: {skewness:.2f})")

    # Boxplot of the same variable
    sns.boxplot(x=df[var], ax=box_ax, color="lightcoral")
    box_ax.set_title(f"Boxplot of {var}")

plt.tight_layout()
plt.show()


In [None]:
# 3. Scatter Plots & Regression vs G3 for Numeric Variables

# G3 is the y-axis target, so leave it out of the x-variables
numeric_vars = [name for name in numeric_vars if name != 'G3']

# Grid dimensions: four panels per row
n_cols = 4
n_vars = len(numeric_vars)
n_rows = int(np.ceil(n_vars / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
axes = axes.flatten()

# Scatter plot with a fitted regression line for each variable against G3
for ax, col in zip(axes, numeric_vars):
    sns.regplot(
        data=df,
        x=col,
        y='G3',
        scatter_kws={'alpha':0.5},
        line_kws={'color':'red'},
        ax=ax,
    )
    ax.set_title(f"G3 vs {col}")

# Delete the grid cells left over after the last variable
for spare in axes[n_vars:]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()


In [None]:
# 4. Group Comparisons (Boxplots & Bar charts)

# Variables compared against G3 in this figure: binary ones first, then ordinal.
# The grid is sized from this list (the one actually plotted) so the number of
# axes always matches the number of panels.
group_vars = list(bin_vars) + list(ord_vars)

# Determine grid size
n_vars = len(group_vars)
n_cols = 3
n_rows = int(np.ceil(n_vars / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
axes = axes.flatten()

# Loop through variables
for ax, var in zip(axes, group_vars):
    if var in bin_vars:
        # Binary variables → boxplot of G3 per group
        sns.boxplot(x=var, y='G3', data=df, ax=ax, palette='pastel', hue=var)
        ax.set_title(f"G3 by {var}")
    else:
        # Ordinal variables → barplot of mean G3 per level
        sns.barplot(x=var, y='G3', data=df, errorbar=None, ax=ax, palette='pastel', hue=var)
        ax.set_title(f"Mean G3 by {var}")

    # hue repeats the x variable; only remove the legend if one was drawn,
    # since get_legend() returns None when there is none
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

# Remove unused axes
for spare in axes[n_vars:]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()

In [None]:
# 5. Feature Correlation with G3 (bar chart)

# One-hot encode the nominal categorical variables for the correlation step.
# pd.get_dummies returns a new frame, so `df` itself is left unchanged.
df_encoded = pd.get_dummies(df, columns=nom_vars, drop_first=True)

# Correlation of every feature with G3, strongest positive first.
# G3's own entry is dropped: it is the target's self-correlation, not a feature.
corr_target = df_encoded.corr()['G3'].drop('G3').sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 12))
sns.barplot(
    x=corr_target.values,
    y=corr_target.index,
    palette="coolwarm",
    hue=corr_target.index,
    ax=ax,
)
# hue repeats the y labels; remove the per-feature legend if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Feature Correlation with G3")
plt.show()

In [None]:
# Correlation Matrix for Predictors

# Every encoded column except the target G3
X = df_encoded.drop(columns='G3')

# Pairwise correlations among the predictors, on a fixed -1..1 colour scale
predictor_corr = X.corr()

plt.figure(figsize=(12,10))
ax = sns.heatmap(predictor_corr, vmin=-1, vmax=1, cmap="coolwarm", annot=False)
ax.set_title("Correlation Matrix of Predictors")
plt.show()