In [11]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Source CSV, resolved relative to the current working directory.
# NOTE: the space before ".csv" is part of the file name as referenced here.
filename = "Student_performance_data .csv"

# Load the dataset. A missing file is re-raised with a friendlier message,
# and the original error is attached as the explicit cause (`from err`) so
# the underlying OS-level details remain visible in the traceback.
try:
    data = pd.read_csv(filename)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"File '{filename}' not found. Please check the path."
    ) from err

# Fail early if the column needed for stratification is absent.
if "GradeClass" not in data.columns:
    raise ValueError("'GradeClass' column not found in dataset. Cannot stratify.")

# 80/20 train/test split, stratified on 'GradeClass', with a fixed seed
# so the same partition is produced on every run.
split_options = {
    "test_size": 0.2,
    "random_state": 53,
    "stratify": data["GradeClass"],
}
train_data, test_data = train_test_split(data, **split_options)

# Write both splits as CSV files into the Data folder one level up
# (the folder is created if it does not exist yet).
output_path = "../Data"
os.makedirs(output_path, exist_ok=True)

for csv_name, frame in (
    ("train_data.csv", train_data),
    ("test_data.csv", test_data),
):
    # index=False keeps the DataFrame row index out of the written file.
    frame.to_csv(os.path.join(output_path, csv_name), index=False)

# Report how many training rows fall into each GradeClass, as a raw count
# and as a percentage of the training set (rounded to two decimals).
print("\nGradeClass Distribution in Training Set:")
counts = train_data["GradeClass"].value_counts().sort_index()
percentages = counts.div(len(train_data)).mul(100).round(2)
distribution_df = pd.concat(
    [counts.rename("Count"), percentages.rename("Percentage (%)")],
    axis=1,
)
print(distribution_df)



GradeClass Distribution in Training Set:
            Count  Percentage (%)
GradeClass                       
0.0            86            4.50
1.0           215           11.24
2.0           313           16.36
3.0           331           17.30
4.0           968           50.60
