In [1]:
# Install the Kaggle API credentials: create ~/.kaggle, copy kaggle.json
# (expected in the current working directory) into it, and restrict the
# copied file to owner-only read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Create the project layout (datasets / notebooks / scripts) in one call;
# -p makes this a no-op for folders that already exist.
!mkdir -p data-quality-and-feature-engineering/datasets data-quality-and-feature-engineering/notebooks data-quality-and-feature-engineering/scripts

In [3]:
# Download the spscientist/students-performance-in-exams dataset with the Kaggle CLI.
# The archive is saved as students-performance-in-exams.zip in the current working directory.
!kaggle datasets download -d spscientist/students-performance-in-exams

Dataset URL: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams
License(s): unknown
Downloading students-performance-in-exams.zip to /content
  0% 0.00/8.70k [00:00<?, ?B/s]
100% 8.70k/8.70k [00:00<00:00, 30.4MB/s]


In [4]:
# Extract the downloaded archive into the project's datasets folder.
# -o overwrites existing files without prompting, so the cell can be re-run
# (e.g. Restart & Run All) without blocking on unzip's interactive "replace?" question.
!unzip -o students-performance-in-exams.zip -d data-quality-and-feature-engineering/datasets/

Archive:  students-performance-in-exams.zip
  inflating: data-quality-and-feature-engineering/datasets/StudentsPerformance.csv  


In [5]:
# Sanity check: list the extracted files (StudentsPerformance.csv is expected here).
!ls data-quality-and-feature-engineering/datasets

StudentsPerformance.csv


In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the raw student-performance table from the extracted CSV and preview it.
csv_path = "data-quality-and-feature-engineering/datasets/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [10]:
# Aggregate feature: per-student sum of the three subject scores.
# skipna=False keeps the semantics of chained `+`: a missing subject score
# yields a missing total rather than a partial sum.
score_columns = ["math score", "reading score", "writing score"]
df["total_score"] = df[score_columns].sum(axis=1, skipna=False)
df[score_columns + ["total_score"]].head()

Unnamed: 0,math score,reading score,writing score,total_score
0,72,72,74,218
1,69,90,88,247
2,90,95,93,278
3,47,57,44,148
4,76,78,75,229


In [11]:
# Mean score per student: total_score is the sum of three subjects
# (math, reading, writing), so divide by the number of subjects.
n_subjects = 3
df["average_score"] = df["total_score"] / n_subjects

In [12]:
# Bucket each student's average score into a performance level:
#   [0, 50] -> Low, (50, 75] -> Medium, (75, 100] -> High.
# include_lowest=True closes the first bin on the left, so an average of
# exactly 0 is labelled "Low" instead of falling outside every bin and
# silently becoming NaN (pd.cut bins are right-closed, left-open by default).
df["performance_level"] = pd.cut(
    df["average_score"],
    bins=[0, 50, 75, 100],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)
df["performance_level"].value_counts()

Unnamed: 0_level_0,count
performance_level,Unnamed: 1_level_1
Medium,578
High,313
Low,109


In [13]:
# Binary flag: 1 if the student completed the test preparation course, else 0.
# A vectorized comparison replaces the per-row `apply` + lambda; any value other
# than "completed" (including a missing value) still maps to 0.
df["test_prep_completed"] = (
    df["test preparation course"] == "completed"
).astype(int)

In [15]:
# Encode gender as a numeric feature (female -> 0, male -> 1).
# Series.map leaves NaN for any value that is not a key of the mapping.
gender_codes = {"female": 0, "male": 1}
df["gender_encoded"] = df["gender"].map(gender_codes)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score,average_score,performance_level,test_prep_completed,gender_encoded
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667,Medium,0,0
1,female,group C,some college,standard,completed,69,90,88,247,82.333333,High,1,0
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667,High,0,0
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333,Low,0,1
4,male,group C,some college,standard,none,76,78,75,229,76.333333,High,0,1


## Feature Engineering Observations

- New aggregate features such as total score and average score were created.
- Student performance was categorized into Low, Medium, and High levels by binning the average score (up to 50 → Low, above 50 and up to 75 → Medium, above 75 → High).
- Categorical variables such as gender and test preparation course were encoded as binary (0/1) numerical features.
- Engineered features like these can make a model easier to interpret and give machine learning algorithms more informative inputs to learn from.