# Student Wellbeing — EDA + Linear Regression

Beginner-friendly notebook.

In [None]:
# Read the cleaned student wellbeing CSV into a DataFrame and preview it.
# The path is relative, so the file is resolved against the working directory.
import pandas as pd
clean = pd.read_csv(filepath_or_buffer='student_wellbeing_dataset_cleaned.csv')
clean.head(n=5)

## Linear Regression (predicting CGPA)
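We use daily study hours, average sleep hours, daily screen time, attendance percentage, extracurricular activities, and stress level as predictors. The two categorical columns (extracurricular activities and stress level) are one-hot encoded before fitting.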

In [None]:
# Prepare features, split into train/test, fit a linear model, and report metrics
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Columns that are one-hot encoded; the remaining predictors are used as-is.
cat_cols = ['Extracurricular_Activities','Stress_Level']

# drop='first' removes one dummy per category to avoid perfectly collinear columns.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so we keep the default (sparse) output and densify explicitly;
# this works on every scikit-learn version.
ohe = OneHotEncoder(drop='first')
ohe_arr = ohe.fit_transform(clean[cat_cols]).toarray()
ohe_cols = ohe.get_feature_names_out(cat_cols)
# Reuse clean's index so the concat below aligns row-for-row.
ohe_df = pd.DataFrame(ohe_arr, columns=ohe_cols, index=clean.index)

X = pd.concat([clean[['Hours_of_Study_per_day','Average_Sleep_Hours','Daily_Screen_Time','Attendance_Percentage']], ohe_df], axis=1)
y = clean['CGPA']
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form is version-independent.
print('RMSE:', mean_squared_error(y_test, y_pred) ** 0.5)
print('R2:', r2_score(y_test, y_pred))

In [None]:
# Predicted vs Actual plot
import matplotlib.pyplot as plt

# Endpoints for the y = x reference line, taken from the plotted values instead
# of a fixed 0-10 range, so the line neither assumes a particular CGPA scale nor
# stretches the axes beyond the data.
diag_lo = min(y_test.min(), y_pred.min())
diag_hi = max(y_test.max(), y_pred.max())

plt.figure(figsize=(6,4))
plt.scatter(y_test, y_pred, alpha=0.5)
# Points on the dashed diagonal are perfect predictions.
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], linestyle='--')
plt.xlabel('Actual CGPA')
plt.ylabel('Predicted CGPA')
plt.title('Predicted vs Actual CGPA')
plt.show()

In [None]:
# Histogram of test-set prediction errors (actual minus predicted)
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=30)
ax.set_xlabel('Residual (Actual - Predicted)')
ax.set_ylabel('Count')
ax.set_title('Residuals Distribution')
plt.show()

## Coefficients (feature influence)

In [None]:
# Label each fitted coefficient with its feature name, then order from the
# largest to the smallest value.
coeffs = pd.Series(data=lr.coef_, index=X.columns)
coeffs = coeffs.sort_values(ascending=False)
coeffs