<a href="https://colab.research.google.com/github/shwetakhanjan/student-learning-risk-ml/blob/main/data/01_data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploratory Data Analysis (EDA)

## Objective
The goal of this notebook is to generate a synthetic student math performance dataset, save it to CSV, and explore it:
validate feature ranges and distributions, analyze the target variable (`at_risk`),
and identify patterns that will inform feature engineering and model selection.


In [6]:
import numpy as np
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune lives here instead of being buried as
# literals in the generation code below.
SEED = 42                               # seed for NumPy's legacy global RNG
n_students = 500                        # number of synthetic student rows
OUTPUT_CSV = "student_math_synthetic.csv"

# A student is labelled at-risk when ANY of these strict lower bounds is violated.
HOMEWORK_RISK_THRESHOLD = 60
PRACTICE_TEST_RISK_THRESHOLD = 60
GEOMETRY_RISK_THRESHOLD = 50

# Set random seed for reproducibility
np.random.seed(SEED)


def _clipped_normal(mean, std, low, high, size):
    """Draw `size` normal samples, clip them to [low, high], round to 2 decimals.

    Uses the global `np.random` state, so the order in which this helper is
    called determines which values each feature receives. Clipping moves
    out-of-range draws onto the bounds, so values exactly equal to `low` or
    `high` can occur more than once.
    """
    return np.random.normal(mean, std, size).clip(low, high).round(2)


# --- Generate features -------------------------------------------------------
# NOTE: the draws below consume the global RNG stream sequentially. Reordering
# them changes every generated value, so keep this order stable.
student_id = [f"S{i:03d}" for i in range(1, n_students + 1)]

# randint's upper bound is exclusive: ages are 10..14 and grades are 5..8.
# NOTE(review): age and grade are drawn independently, so implausible pairs
# (e.g. a 14-year-old in grade 5) can occur in the data.
age = np.random.randint(10, 15, n_students)
grade = np.random.randint(5, 9, n_students)

homework_completion = _clipped_normal(85, 10, 50, 100, n_students)
practice_tests_avg = _clipped_normal(75, 15, 40, 100, n_students)
class_participation = _clipped_normal(7, 2, 0, 10, n_students)
geometry_score = _clipped_normal(70, 15, 30, 100, n_students)
# This feature is exported under the column name "algebra1_score", so the
# variable is named to match. The previous name is kept as an alias so any cell
# that still refers to `prealgebra_score` keeps working.
algebra1_score = _clipped_normal(72, 12, 40, 100, n_students)
prealgebra_score = algebra1_score
recent_quiz_avg = _clipped_normal(70, 15, 30, 100, n_students)
study_hours_per_week = _clipped_normal(5, 2, 0, 10, n_students)

# --- Target label ------------------------------------------------------------
# at_risk is 1 when homework, practice-test average, or geometry score falls
# below its threshold, else 0.
# NOTE(review): the label is a deterministic function of three feature columns,
# so a model given those columns can presumably reproduce it exactly.
at_risk = (
    (homework_completion < HOMEWORK_RISK_THRESHOLD)
    | (practice_tests_avg < PRACTICE_TEST_RISK_THRESHOLD)
    | (geometry_score < GEOMETRY_RISK_THRESHOLD)
).astype(int)

# --- Assemble the DataFrame ----------------------------------------------------
df = pd.DataFrame({
    "student_id": student_id,
    "age": age,
    "grade": grade,
    "homework_completion": homework_completion,
    "practice_tests_avg": practice_tests_avg,
    "class_participation": class_participation,
    "geometry_score": geometry_score,
    "algebra1_score": algebra1_score,
    "recent_quiz_avg": recent_quiz_avg,
    "study_hours_per_week": study_hours_per_week,
    "at_risk": at_risk
})

# Cheap invariants: fail loudly here rather than let malformed data reach the CSV.
assert len(df) == n_students, "unexpected number of rows"
assert df["student_id"].is_unique, "student_id must be unique"
assert not df.isna().any().any(), "generated data must not contain nulls"
assert df["age"].between(10, 14).all() and df["grade"].between(5, 8).all()
assert df["at_risk"].isin([0, 1]).all(), "at_risk must be binary"

# Save to CSV (index=False keeps the row index out of the file)
df.to_csv(OUTPUT_CSV, index=False)
print(df.head(15))
print(df["at_risk"].value_counts())

   student_id  age  grade  homework_completion  practice_tests_avg  \
0        S001   13      6               100.00               59.98   
1        S002   14      8                90.39               70.78   
2        S003   12      8                74.63              100.00   
3        S004   14      5                83.10               84.61   
4        S005   14      6                76.24               66.43   
5        S006   11      5                71.17               83.59   
6        S007   12      7                94.26               95.99   
7        S008   12      7               100.00               88.87   
8        S009   12      5                71.01               75.89   
9        S010   14      8                90.63               65.30   
10       S011   13      8                78.49               85.47   
11       S012   12      5                80.13               80.90   
12       S013   14      5                79.08               88.43   
13       S014   11  

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [11]:
# Reload the dataset from the CSV written by the generation cell, so the
# exploration below works on exactly what was persisted to disk.
csv_path = "student_math_synthetic.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,student_id,age,grade,homework_completion,practice_tests_avg,class_participation,geometry_score,algebra1_score,recent_quiz_avg,study_hours_per_week,at_risk
0,S001,13,6,100.0,59.98,10.0,75.27,56.92,55.72,6.02,1
1,S002,14,8,90.39,70.78,10.0,86.05,81.76,71.16,8.87,0
2,S003,12,8,74.63,100.0,10.0,69.6,68.65,73.87,6.63,0
3,S004,14,5,83.1,84.61,9.42,56.77,68.64,51.37,4.9,0
4,S005,14,6,76.24,66.43,9.05,67.55,81.48,75.01,4.63,0


# New section