# Student Performance — Visualizations (Q2)

Pipeline: ingestion → preprocessing → five visualization tasks (V1–V5). Every figure is drawn on an 800×600 px canvas at 300 DPI.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Figure geometry: matplotlib sizes figures in inches, so inches = pixels / DPI.
# These values give an 800x600 px canvas at 300 DPI.
DPI = 300
FIG_W = 800 / DPI
FIG_H = 600 / DPI

# Input CSV and output locations (relative to the current working directory).
RAW_PATH = "StudentsPerformance.csv"
REPORTS_DIR = "reports"
FIG_DIR = os.path.join(REPORTS_DIR, "figures")

# Create the output folders up front; exist_ok keeps re-running the cell safe.
for out_dir in (FIG_DIR, REPORTS_DIR):
    os.makedirs(out_dir, exist_ok=True)

## 1. Ingestion

In [None]:
# Load the raw CSV and tidy the header: trim surrounding whitespace and any
# stray double quotes from each column name.
df = pd.read_csv(RAW_PATH).rename(columns=lambda name: name.strip().strip('"'))
print(df.shape, list(df.columns))
df.head()

## 2. Preprocessing

In [None]:
score_cols = ["math score", "reading score", "writing score"]

# Coerce each score column to numeric (unparseable entries become NaN), then
# discard rows that are missing any of the three scores.
numeric_scores = {col: pd.to_numeric(df[col], errors="coerce") for col in score_cols}
df = df.assign(**numeric_scores).dropna(subset=score_cols)

# Derived columns used by the plots below.
df["overall_avg"] = df[score_cols].mean(axis=1)  # row-wise mean of the three scores
# Trimmed, lower-cased copies of the categorical labels.
for new_col, src_col in (("test_prep", "test preparation course"), ("lunch_type", "lunch")):
    df[new_col] = df[src_col].str.strip().str.lower()
df.head()

## V1 — Gender boxplots (math vs reading)

In [None]:
# V1: boxplots of math and reading scores, grouped by gender.
fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=DPI)

# Reshape to long form (one row per student x subject) so the subject can be
# used as the hue variable.
plot_df = df[["gender", "math score", "reading score"]].melt(
    id_vars="gender",
    value_vars=["math score", "reading score"],
    var_name="Subject",
    value_name="Score",
)
# Literal (non-regex) removal of the " score" suffix, leaving "math"/"reading"
# to match the palette keys. regex=False makes this independent of the pandas
# version's default.
plot_df["Subject"] = plot_df["Subject"].str.replace(" score", "", regex=False)

sns.boxplot(
    data=plot_df,
    x="gender",
    y="Score",
    hue="Subject",
    ax=ax,
    palette={"math": "#2ecc71", "reading": "#3498db"},
)
ax.set_title("V1: Math vs Reading by Gender")
ax.set_ylabel("Score (points)")
ax.set_xlabel("Gender")

fig.tight_layout()
# Saved without bbox_inches="tight": that option re-crops the canvas, so the
# PNG would no longer have the pixel size implied by FIG_W x FIG_H at DPI.
fig.savefig(os.path.join(FIG_DIR, "V1_gender_boxplots_math_reading.png"), dpi=DPI)
plt.show()

## V2 — Test prep impact on math

In [None]:
# V2: distribution of math scores with and without the test preparation course.
fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=DPI)

# Plot the normalised `test_prep` column (stripped and lower-cased during
# preprocessing) so the lower-case `order`/`palette` keys match even if the raw
# labels carry stray whitespace or capitals. Pass `ax` explicitly rather than
# relying on the implicit current axes.
sns.boxplot(
    data=df,
    x="test_prep",
    y="math score",
    order=["none", "completed"],
    palette={"none": "#e74c3c", "completed": "#27ae60"},
    ax=ax,
)
ax.set_title("V2: Math Score by Test Preparation")
ax.set_ylabel("Math score (points)")
ax.set_xlabel("Test preparation course")

fig.tight_layout()
# Saved without bbox_inches="tight": that option re-crops the canvas, so the
# PNG would no longer have the pixel size implied by FIG_W x FIG_H at DPI.
fig.savefig(os.path.join(FIG_DIR, "V2_test_prep_math.png"), dpi=DPI)
plt.show()

## V3 — Lunch type and mean overall_avg

In [None]:
# V3: bar chart of the mean overall average for each lunch type.
fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=DPI)

# One row per lunch type with its mean overall_avg.
means = df.groupby("lunch_type", as_index=False)["overall_avg"].mean()
# Display labels for the two known categories; other values pass through as-is.
means["label"] = means["lunch_type"].replace({"free/reduced": "Free/reduced", "standard": "Standard"})

bars = ax.bar(means["label"], means["overall_avg"], color=["#9b59b6", "#1abc9c"], edgecolor="black")
# Print each bar's value just above its top edge, centred horizontally.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height + 0.3, f"{height:.1f}", ha="center", fontsize=9)

ax.set_title("V3: Mean Overall Average by Lunch Type")
ax.set_ylabel("Mean overall avg (points)")
ax.set_xlabel("Lunch type")

fig.tight_layout()
# Saved without bbox_inches="tight": that option re-crops the canvas, so the
# PNG would no longer have the pixel size implied by FIG_W x FIG_H at DPI.
fig.savefig(os.path.join(FIG_DIR, "V3_lunch_overall_avg.png"), dpi=DPI)
plt.show()

## V4 — Correlation heatmap

In [None]:
# V4: annotated heatmap of the pairwise correlations between the three subjects.
fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=DPI)

# Short display names for the tick labels.
corr_df = df[["math score", "reading score", "writing score"]].rename(
    columns={"math score": "Math", "reading score": "Reading", "writing score": "Writing"}
)

# The colour scale is pinned to [0.7, 1] via vmin/vmax. `center` is not passed:
# it is intended for diverging data, and the previous value (0.5) lay outside
# the [vmin, vmax] range used with this sequential palette.
sns.heatmap(
    corr_df.corr(),
    annot=True,
    fmt=".2f",
    cmap="Blues",
    vmin=0.7,
    vmax=1,
    ax=ax,
    cbar_kws={"label": "Correlation"},
)
ax.set_title("V4: Correlation — Math, Reading, Writing")

fig.tight_layout()
# Saved without bbox_inches="tight": that option re-crops the canvas, so the
# PNG would no longer have the pixel size implied by FIG_W x FIG_H at DPI.
fig.savefig(os.path.join(FIG_DIR, "V4_subject_correlation_heatmap.png"), dpi=DPI)
plt.show()

## V5 — Math vs reading scatter + trend lines by test prep

In [None]:
def fit_line(x, y):
    """Least-squares straight-line fit of ``y`` on ``x``.

    Returns the length-2 solution array ``[slope, intercept]`` produced by
    ``np.linalg.lstsq``.
    """
    # Design matrix with columns [x, 1] so the solution is (slope, intercept).
    design = np.column_stack((x, np.ones(len(x))))
    solution, *_ = np.linalg.lstsq(design, y, rcond=None)
    return solution
# V5: math vs reading scatter, split by test-prep status, with a least-squares
# trend line for each group.
completed = df[df["test_prep"] == "completed"]
none_grp = df[df["test_prep"] == "none"]

fig, ax = plt.subplots(figsize=(FIG_W, FIG_H), dpi=DPI)

# Scatter layers; the group sizes are shown in the legend labels.
ax.scatter(
    none_grp["reading score"], none_grp["math score"],
    alpha=0.6, label=f"None (n={len(none_grp)})", color="#e74c3c", s=15,
)
ax.scatter(
    completed["reading score"], completed["math score"],
    alpha=0.6, label=f"Completed (n={len(completed)})", color="#27ae60", s=15,
)

# Per-group fit of math score on reading score: (slope, intercept).
m_n, c_n = fit_line(none_grp["reading score"].values, none_grp["math score"].values)
m_c, c_c = fit_line(completed["reading score"].values, completed["math score"].values)

# Both trend lines are drawn across the full observed reading-score range.
x_line = np.array([df["reading score"].min(), df["reading score"].max()])
ax.plot(x_line, m_n * x_line + c_n, color="#e74c3c", linestyle="--", linewidth=2,
        label=f"None fit (slope={m_n:.3f})")
ax.plot(x_line, m_c * x_line + c_c, color="#27ae60", linestyle="--", linewidth=2,
        label=f"Completed fit (slope={m_c:.3f})")

ax.set_xlabel("Reading score (points)")
ax.set_ylabel("Math score (points)")
ax.set_title("V5: Math vs Reading by Test Prep with Trend Lines")
ax.legend(loc="lower right", fontsize=7)

fig.tight_layout()
# Saved without bbox_inches="tight": that option re-crops the canvas, so the
# PNG would no longer have the pixel size implied by FIG_W x FIG_H at DPI.
fig.savefig(os.path.join(FIG_DIR, "V5_math_vs_reading_by_test_prep.png"), dpi=DPI)
plt.show()

## Report

Interpretations (5–8 sentences each) are in `reports/visualization_report.md`. Run `03_student_performance_visualizations.py` to regenerate.