In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!mkdir -p data-quality-and-feature-engineering/datasets

In [9]:
!kaggle datasets download -d spscientist/students-performance-in-exams

Dataset URL: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams
License(s): unknown
Downloading students-performance-in-exams.zip to /content
  0% 0.00/8.70k [00:00<?, ?B/s]
100% 8.70k/8.70k [00:00<00:00, 31.5MB/s]


In [10]:
!unzip -o students-performance-in-exams.zip -d data-quality-and-feature-engineering/datasets/

Archive:  students-performance-in-exams.zip
  inflating: data-quality-and-feature-engineering/datasets/StudentsPerformance.csv  


In [11]:
!ls data-quality-and-feature-engineering/datasets

StudentsPerformance.csv


In [12]:
df = pd.read_csv(
    "data-quality-and-feature-engineering/datasets/StudentsPerformance.csv"
)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [13]:
numerical_features = [
    "math score",
    "reading score",
    "writing score"
]

In [14]:
scaler = StandardScaler()
df_scaled = df.copy()

df_scaled[numerical_features] = scaler.fit_transform(
    df[numerical_features]
)
df_scaled[numerical_features].head()

Unnamed: 0,math score,reading score,writing score
0,0.390024,0.193999,0.391492
1,0.192076,1.427476,1.313269
2,1.577711,1.770109,1.642475
3,-1.259543,-0.833899,-1.583744
4,0.653954,0.605158,0.457333


In [16]:
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()

df_minmax[numerical_features] = minmax_scaler.fit_transform(
    df[numerical_features]
)
df_minmax[numerical_features].head()

Unnamed: 0,math score,reading score,writing score
0,0.72,0.662651,0.711111
1,0.69,0.879518,0.866667
2,0.9,0.939759,0.922222
3,0.47,0.481928,0.377778
4,0.76,0.73494,0.722222


In [19]:
categorical_features = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course"
]
df_encoded = pd.get_dummies(
    df,
    columns=categorical_features,
    drop_first=True
)
df_encoded.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,72,72,74,False,True,False,False,False,True,False,False,False,False,True,True
1,69,90,88,False,False,True,False,False,False,False,False,True,False,True,False
2,90,95,93,False,True,False,False,False,False,False,True,False,False,True,True
3,47,57,44,True,False,False,False,False,False,False,False,False,False,False,True
4,76,78,75,True,False,True,False,False,False,False,False,True,False,True,True


In [20]:
df_encoded.shape

(1000, 15)

## Feature Scaling and Encoding Observations

- Numerical features were scaled using StandardScaler and MinMaxScaler.
- Scaling ensures that features contribute equally to machine learning models.
- Categorical variables were transformed using one-hot encoding.
- After encoding, the dataset became fully numerical and suitable for machine learning algorithms.