# Students Performance - Three-Stage Workflow

**Ingest → Process → Analyze** - Step-by-step analysis using StudentsPerformance.csv.

In [1]:
import os
import pandas as pd
import numpy as np

# Input and output locations, all relative to the notebook's working directory.
RAW_PATH = "StudentsPerformance.csv"
REPORTS_DIR = "reports"
PROCESSED_PATH = os.path.join("data", "processed", "students_performance_processed.csv")

# Create both output folders up front; exist_ok makes re-running this cell harmless.
for _out_dir in (os.path.dirname(PROCESSED_PATH), REPORTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

## Stage 1: Ingest

Load raw CSV into a pandas DataFrame.

In [2]:
# Read the raw CSV, then tidy the header: drop surrounding whitespace and any
# stray double quotes from every column name.
df = pd.read_csv(RAW_PATH).rename(columns=lambda name: name.strip().strip('"'))

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Shape: (1000, 8)
Columns: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
print("Data types:")
df.dtypes  # bare last expression so the notebook displays the result

Data types:


gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [4]:
# Count null entries per column to see whether any values are missing.
print("Missing values:")
df.isnull().sum()  # bare last expression so the notebook displays the result

Missing values:


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

## Stage 2: Process

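Unit standardization (score columns coerced to numeric), feature engineering (total/average score, GradeBand, passed), encoding (test_prep_binary, lunch_standard, gender_male, one-hot GradeBand), and renaming of the score columns to snake_case. The processed frame is saved to `data/processed/students_performance_processed.csv`.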

In [5]:
score_cols = ["math score", "reading score", "writing score"]

# Force every score column that is present to a numeric dtype; values that
# cannot be parsed become NaN instead of raising.
for col in (name for name in score_cols if name in df.columns):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Row-wise aggregates over the three subject scores.
scores = df[score_cols]
df["total_score"] = scores.sum(axis=1)
df["average_score"] = scores.mean(axis=1).round(2)

def score_to_grade(avg):
    """Map an average score to a letter grade band.

    Bands: >= 90 -> "A", >= 80 -> "B", >= 70 -> "C", >= 60 -> "D", else "F".

    A missing average (None, NaN or pd.NA) yields NaN instead of "F". The
    score columns are coerced with errors="coerce", so an average can be
    missing; every ``>=`` comparison against NaN is False, which previously
    reported such rows as a failing grade.
    """
    # Must run before the comparisons: NaN compares False everywhere and
    # pd.NA cannot be used in a boolean context at all.
    if pd.isna(avg):
        return np.nan
    if avg >= 90:
        return "A"
    if avg >= 80:
        return "B"
    if avg >= 70:
        return "C"
    if avg >= 60:
        return "D"
    return "F"
def _text_flag(series, expected):
    """Return an int8 0/1 Series marking rows whose lower-cased text equals `expected`."""
    return (series.str.lower() == expected).astype("int8")


# Letter grade and pass flag are both derived from the rounded average.
average = df["average_score"]
df["GradeBand"] = average.map(score_to_grade)
df["passed"] = (average >= 60).astype(int)

# Binary encodings of the categorical text columns.
df["test_prep_binary"] = _text_flag(df["test preparation course"], "completed")
df["lunch_standard"] = _text_flag(df["lunch"], "standard")
if "gender" in df.columns:
    df["gender_male"] = _text_flag(df["gender"], "male")

# One-hot encode the grade band and append the indicator columns.
if "GradeBand" in df.columns:
    grade_dummies = pd.get_dummies(df["GradeBand"], prefix="GradeBand")
    df = pd.concat([df, grade_dummies], axis=1)

# Switch the score columns to snake_case before persisting.
snake_case_names = {
    "math score": "math_score",
    "reading score": "reading_score",
    "writing score": "writing_score",
}
df = df.rename(columns=snake_case_names)

df.to_csv(PROCESSED_PATH, index=False)
print("Processed data saved to:", PROCESSED_PATH)

# Preview only the columns that actually exist in the processed frame.
preview_candidates = [
    "math_score",
    "reading_score",
    "writing_score",
    "average_score",
    "GradeBand",
    "passed",
    "test_prep_binary",
]
cols_show = [name for name in preview_candidates if name in df.columns]
df[cols_show].head(3)

Processed data saved to: data\processed\students_performance_processed.csv


Unnamed: 0,math_score,reading_score,writing_score,average_score,GradeBand,passed,test_prep_binary
0,72,72,74,72.67,C,1,0
1,69,90,88,82.33,B,1,1
2,90,95,93,92.67,A,1,0


## Stage 3: Analyze & Report

Summary statistics, correlation matrix, key relations → `reports/findings_students.md`

In [6]:
# Mean / median / std for every numeric column.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
summary = df[numeric_cols].agg(["mean", "median", "std"]).round(4)

# Correlate the renamed score columns with the key derived columns; fall back
# to all numeric columns when none of the score columns are present.
score_cols_renamed = [
    name
    for name in ("math_score", "reading_score", "writing_score")
    if name in df.columns
]
corr_source = (
    score_cols_renamed + ["average_score", "passed", "test_prep_binary"]
    if score_cols_renamed
    else numeric_cols
)
corr_matrix = df[corr_source].corr()

# Headline pairwise correlations; left as None when the column is absent.
corr_avg_passed = None
if "passed" in df.columns:
    corr_avg_passed = df["average_score"].corr(df["passed"])

corr_avg_testprep = None
if "test_prep_binary" in df.columns:
    corr_avg_testprep = df["average_score"].corr(df["test_prep_binary"])

def df_to_md(d):
    """Wrap the plain-text rendering `d.to_string()` in a Markdown code fence."""
    fence = "```"
    return "\n".join((fence, d.to_string(), fence))
# Assemble the Markdown report piece by piece, then join once.
report_parts = [
    "# Students Performance — Findings\n\n## 1. Summary statistics (numeric columns)\n\n",
    df_to_md(summary),
    "\n\n## 2. Correlation matrix (scores and key binaries)\n\n",
    df_to_md(corr_matrix.round(4)),
    "\n\n## 3. Key relations\n\n",
]
# Each headline correlation is listed only when it was computed.
if corr_avg_passed is not None:
    report_parts.append(f"- **Correlation(average_score, passed):** {corr_avg_passed:.4f}\n")
if corr_avg_testprep is not None:
    report_parts.append(f"- **Correlation(average_score, test_prep_binary):** {corr_avg_testprep:.4f}\n")
report_parts.append(
    "\nInterpretation: Positive correlation with test_prep_binary suggests completing test preparation is associated with higher average scores.\n"
)
findings = "".join(report_parts)

# Persist the report alongside the other outputs.
out_path = os.path.join(REPORTS_DIR, "findings_students.md")
with open(out_path, "w", encoding="utf-8") as report_file:
    report_file.write(findings)

# Echo the headline numbers in the notebook output.
print(summary.head())
print("\nCorrelation(average_score, passed):", corr_avg_passed)
print("Correlation(average_score, test_prep_binary):", corr_avg_testprep)
print("\nReport written to:", out_path)

        math_score  reading_score  writing_score  total_score  average_score  \
mean       66.0890        69.1690        68.0540      203.312        67.7706   
median     66.0000        70.0000        69.0000      205.000        68.3300   
std        15.1631        14.6002        15.1957       42.772        14.2573   

        passed  test_prep_binary  lunch_standard  gender_male  
mean    0.7150            0.3580          0.6450       0.4820  
median  1.0000            0.0000          1.0000       0.0000  
std     0.4516            0.4797          0.4788       0.4999  

Correlation(average_score, passed): 0.7727558444159938
Correlation(average_score, test_prep_binary): 0.2567150094710402

Report written to: reports\findings_students.md
