This notebook creates cleaned datasets for training models later. I have chosen datasets that are already fairly clean, for convenience.
- No feature selection (KBest, correlation checks), i.e. all provided features are used
- Each cleaned dataset is saved as a `.pkl` file in `./data/processed`

# Import

In [10]:
import pandas as pd

# Lift pandas' display limits so every column and every row is rendered
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

In [11]:
# Helper
def check_df(df):
    """Print a quick overview of a DataFrame.

    Shows the row count, the first rows, the ``df.info()`` summary and, for
    every column of dtype ``object``, the relative frequency of each value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
    """
    print("Total row count:", len(df))
    display(df.head())
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only added a stray "None" to the cell output.
    df.info()

    # Find columns with dtype 'object'
    object_columns = df.select_dtypes(include=['object']).columns

    print("\nUnique values for object columns:")
    for col in object_columns:
        # Normalised counts: share of rows taken by each distinct value
        proportions = df[col].value_counts(normalize=True)
        print(f"{col}: {proportions.to_dict()}")

# Dataset
- [accident.csv - binary classification](./data/raw/accident.csv)
- [iris_synthetic_data.csv - multiclass classification](./data/raw/iris_synthetic_data.csv)
- [Student_Performance.csv - regression](./data/raw/Student_Performance.csv)

## Accident (Binary)

In [22]:
# Load the raw accident CSV and print an overview of it.
# NOTE: `dataset_path` and `df_raw` are reused by the other dataset sections,
# so run this cell immediately before the accident cleaning cell.
dataset_path = "./data/raw/accident.csv"
df_raw = pd.read_csv(dataset_path)
check_df(df_raw)

Total row count: 200


Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used,Survived
0,56,Female,27.0,No,No,1
1,69,Female,46.0,No,Yes,1
2,46,Male,46.0,Yes,Yes,0
3,32,Male,117.0,No,Yes,0
4,60,Female,40.0,Yes,Yes,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              200 non-null    int64  
 1   Gender           199 non-null    object 
 2   Speed_of_Impact  197 non-null    float64
 3   Helmet_Used      200 non-null    object 
 4   Seatbelt_Used    200 non-null    object 
 5   Survived         200 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 9.5+ KB


None


Unique values for object columns:
Gender: {'Female': 0.5527638190954773, 'Male': 0.4472361809045226}
Helmet_Used: {'Yes': 0.555, 'No': 0.445}
Seatbelt_Used: {'Yes': 0.555, 'No': 0.445}


In [23]:
# Work on a copy so the raw frame stays available for re-runs
df = df_raw.copy()

# Remove rows with missing values; per the raw info() output only Gender and
# Speed_of_Impact contain nulls
df = df.dropna(subset=['Gender', 'Speed_of_Impact'])

# Encode categorical features as 0/1 integers
df['Gender - isMale'] = (df['Gender'] == 'Male').astype(int)
df['Helmet_Used'] = (df['Helmet_Used'] == 'Yes').astype(int)
df['Seatbelt_Used'] = (df['Seatbelt_Used'] == 'Yes').astype(int)

# Drop the original text column now that it is encoded; reassign instead of
# inplace=True so the step reads as a plain transformation
df = df.drop(columns=["Gender"])

# Inspect and save to pickle. DataFrame.info() prints its report and returns
# None, so it is called directly instead of being wrapped in display().
df.info()
display(df.head())
df.to_pickle('./data/processed/accident.pkl')

<class 'pandas.core.frame.DataFrame'>
Index: 196 entries, 0 to 199
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              196 non-null    int64  
 1   Speed_of_Impact  196 non-null    float64
 2   Helmet_Used      196 non-null    int64  
 3   Seatbelt_Used    196 non-null    int64  
 4   Survived         196 non-null    int64  
 5   Gender - isMale  196 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 10.7 KB


None

Unnamed: 0,Age,Speed_of_Impact,Helmet_Used,Seatbelt_Used,Survived,Gender - isMale
0,56,27.0,0,0,1,0
1,69,46.0,0,1,1,0
2,46,46.0,1,1,0,1
3,32,117.0,0,1,0,1
4,60,40.0,1,1,0,0


## Iris (multi-class)

In [24]:
# Load the raw synthetic iris CSV and print an overview of it.
# NOTE: this overwrites the `dataset_path` and `df_raw` set by the previous
# dataset section; run it immediately before the iris cleaning cell.
dataset_path = "./data/raw/iris_synthetic_data.csv"
df_raw = pd.read_csv(dataset_path)
check_df(df_raw)

Total row count: 3000


Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.2,3.8,1.5,0.3,Iris-setosa
1,5.3,4.1,1.5,0.1,Iris-setosa
2,4.8,3.1,1.5,0.2,Iris-setosa
3,5.2,3.7,1.5,0.2,Iris-setosa
4,4.9,3.0,1.5,0.3,Iris-setosa


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  3000 non-null   float64
 1   sepal width   3000 non-null   float64
 2   petal length  3000 non-null   float64
 3   petal width   3000 non-null   float64
 4   label         3000 non-null   object 
dtypes: float64(4), object(1)
memory usage: 117.3+ KB


None


Unique values for object columns:
label: {'Iris-setosa': 0.3333333333333333, 'Iris-versicolor': 0.3333333333333333, 'Iris-virginica': 0.3333333333333333}


In [25]:
# Work on a copy so the raw frame stays available for re-runs
df = df_raw.copy()

# Encode the class label as an integer code
label_order = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
df['label'] = df['label'].map(label_order)

# Series.map() yields NaN for any value missing from the mapping; fail loudly
# rather than silently pickling a partially encoded target column
assert df['label'].notna().all(), "Found iris labels not covered by label_order"

# Inspect and save to pickle. DataFrame.info() prints its report and returns
# None, so it is called directly instead of being wrapped in display().
df.info()
display(df.head())
df.to_pickle('./data/processed/iris.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  3000 non-null   float64
 1   sepal width   3000 non-null   float64
 2   petal length  3000 non-null   float64
 3   petal width   3000 non-null   float64
 4   label         3000 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 117.3 KB


None

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.2,3.8,1.5,0.3,0
1,5.3,4.1,1.5,0.1,0
2,4.8,3.1,1.5,0.2,0
3,5.2,3.7,1.5,0.2,0
4,4.9,3.0,1.5,0.3,0


## Student Performance (Regression)

In [16]:
# Load the raw student performance CSV and print an overview of it.
# NOTE: this overwrites the `dataset_path` and `df_raw` set by the previous
# dataset section; run it immediately before the student cleaning cell.
dataset_path = "./data/raw/Student_Performance.csv"
df_raw = pd.read_csv(dataset_path)
check_df(df_raw)

Total row count: 10000


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


None


Unique values for object columns:
Extracurricular Activities: {'No': 0.5052, 'Yes': 0.4948}


In [17]:
# Work on a copy so the raw frame stays available for re-runs
df = df_raw.copy()

# Encode the Yes/No column as a 0/1 integer (1 = 'Yes', anything else = 0)
df['Extracurricular Activities'] = (df['Extracurricular Activities'] == 'Yes').astype(int)

# Inspect and save to pickle. DataFrame.info() prints its report and returns
# None, so it is called directly instead of being wrapped in display().
df.info()
display(df.head())
df.to_pickle('./data/processed/student_performance.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int64  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


None

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
