In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub
import os, pandas as pd, numpy as np
from joblib import dump, load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version
path = kagglehub.dataset_download(
    "nikhil7280/student-performance-multiple-linear-regression"
)

print("Path to dataset files:", path)

Path to dataset files: /home/rohnak.agarwal/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1


In [3]:
df = pd.read_csv(os.path.join(path, "Student_Performance.csv"))
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [4]:
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(include=[int, float]).columns

print(cat_cols, num_cols)

Index(['Extracurricular Activities'], dtype='object') Index(['Hours Studied', 'Previous Scores', 'Sleep Hours',
       'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')


In [5]:
df_1hot = pd.get_dummies(df, columns=cat_cols, dtype=float, drop_first=True)
df_1hot.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_Yes
0,7,99,9,1,91.0,1.0
1,4,82,4,2,65.0,0.0
2,8,51,7,2,45.0,1.0
3,5,52,5,2,36.0,1.0
4,7,75,8,5,66.0,0.0


In [6]:
df_1hot[num_cols] = df_1hot[num_cols].astype(float)
df_1hot.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_Yes
0,7.0,99.0,9.0,1.0,91.0,1.0
1,4.0,82.0,4.0,2.0,65.0,0.0
2,8.0,51.0,7.0,2.0,45.0,1.0
3,5.0,52.0,5.0,2.0,36.0,1.0
4,7.0,75.0,8.0,5.0,66.0,0.0


In [7]:
target_colname = "Performance Index"

X = df_1hot.drop(target_colname, axis=1)
y = df_1hot[target_colname]

In [8]:
X.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Extracurricular Activities_Yes
0,7.0,99.0,9.0,1.0,1.0
1,4.0,82.0,4.0,2.0,0.0
2,8.0,51.0,7.0,2.0,1.0
3,5.0,52.0,5.0,2.0,1.0
4,7.0,75.0,8.0,5.0,0.0


In [9]:
y.head()

0    91.0
1    65.0
2    45.0
3    36.0
4    66.0
Name: Performance Index, dtype: float64

In [10]:
!mkdir -p ./model ./data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=327
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

num_cols = list(set(num_cols) & set(X.columns))
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

(8000, 5) (2000, 5) (8000,) (2000,)


In [12]:
dump(scaler, "./model/stdscaler.joblib")
pd.DataFrame(
    list(zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_)),
    columns=["param", "mean", "scale"],
)

Unnamed: 0,param,mean,scale
0,Previous Scores,69.450125,17.315514
1,Sleep Hours,6.522625,1.692775
2,Sample Question Papers Practiced,4.5815,2.870385
3,Hours Studied,4.994375,2.574266


In [13]:
scaler = load("./model/stdscaler.joblib")
pd.DataFrame(
    list(zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_)),
    columns=["param", "mean", "scale"],
)

Unnamed: 0,param,mean,scale
0,Previous Scores,69.450125,17.315514
1,Sleep Hours,6.522625,1.692775
2,Sample Question Papers Practiced,4.5815,2.870385
3,Hours Studied,4.994375,2.574266


In [14]:
X_train.to_csv("./data/x_train.csv", index=0)
X_test.to_csv("./data/x_test.csv", index=0)
y_train.to_csv("./data/y_train.csv", index=0)
y_test.to_csv("./data/y_test.csv", index=0)