In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Name of the dataset file. This is a relative path, so it is resolved against
# the current working directory (for a notebook, usually the notebook's folder).
filename = "Student_performance_data.csv"

# Load the dataset. If the file is missing, re-raise with a friendlier message,
# chaining the original error so the underlying OS details stay in the traceback.
try:
    data = pd.read_csv(filename)
except FileNotFoundError as err:
    raise FileNotFoundError(f"File '{filename}' not found. Please check the path.") from err

# Stratified sampling needs the label column, so stop early when it is absent
label_missing = "GradeClass" not in data.columns
if label_missing:
    raise ValueError("'GradeClass' column not found in dataset. Cannot stratify.")

# Hold out 20% of the rows for testing. Passing the labels as `stratify` asks
# scikit-learn to keep each grade's share roughly the same in both splits, and
# the fixed random_state makes the shuffle reproducible between runs.
grade_labels = data["GradeClass"]
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=53, stratify=grade_labels
)

# Destination folder for the split files; create it if it is not there yet
output_path = "../Data"
os.makedirs(output_path, exist_ok=True)

# Persist each split as its own CSV (row index omitted) for later steps
for csv_name, split_frame in (
    ("train_data.csv", train_data),
    ("test_data.csv", test_data),
):
    split_frame.to_csv(os.path.join(output_path, csv_name), index=False)

# Report how the training rows are spread over the grade classes:
# absolute counts plus each class's share of the training set in percent.
print("\nGradeClass Distribution in Training Set:")
train_size = len(train_data)
counts = train_data["GradeClass"].value_counts().sort_index()
percentages = counts.div(train_size).mul(100).round(2)

# Place the two Series side by side, labelled with the report's column headers
distribution_df = pd.concat(
    [counts.rename("Count"), percentages.rename("Percentage (%)")],
    axis=1,
)
print(distribution_df)



GradeClass Distribution in Training Set:
            Count  Percentage (%)
GradeClass                       
0.0            86            4.50
1.0           215           11.24
2.0           313           16.36
3.0           331           17.30
4.0           968           50.60
