# Encoding

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw student dataset that every encoding step below operates on.
DATA_PATH = "/notebooks/Datasets/Cleaning/data4.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to see which columns are categorical.
data.head(5)

Unnamed: 0,StudentID,Gender,Age,Country,Lessons Completed,Quiz Score,Time Per Week,Result
0,S001,Male,20.0,India,25,78.5,10,Pass
1,S002,Female,22.0,USA,30,82.0,12,Pass
2,S003,Male,19.0,Sri Lanka,18,45.0,8,Fail
3,S004,Female,21.0,India,28,88.0,9,Pass
4,S005,Female,18.0,USA,15,35.0,5,Fail


## Ordinal Encoding

In [4]:
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# List the distinct labels in Result so an explicit category order can be defined.
pd.unique(data['Result'])

array(['Pass', 'Fail'], dtype=object)

In [6]:
# One inner list per column to encode, ordered from lowest to highest rank:
# the position in the list becomes the encoded value ('Fail' -> 0, 'Pass' -> 1).
results_order = [
    ['Fail', 'Pass'],
]

In [7]:
# Encode Result with the explicit order defined in results_order rather than
# the encoder's default (sorted) category order.
ordinal_encoder = OrdinalEncoder(categories=results_order)

# The encoder expects a 2-D input, hence the double brackets around 'Result'.
result_codes = ordinal_encoder.fit_transform(data[['Result']])
data['Result_encoded'] = result_codes

In [8]:
# Check that Result_encoded was added next to the original Result column.
data.head(5)

Unnamed: 0,StudentID,Gender,Age,Country,Lessons Completed,Quiz Score,Time Per Week,Result,Result_encoded
0,S001,Male,20.0,India,25,78.5,10,Pass,1.0
1,S002,Female,22.0,USA,30,82.0,12,Pass,1.0
2,S003,Male,19.0,Sri Lanka,18,45.0,8,Fail,0.0
3,S004,Female,21.0,India,28,88.0,9,Pass,1.0
4,S005,Female,18.0,USA,15,35.0,5,Fail,0.0


In [9]:
# Sanity check: the encoded column should contain only the two expected codes.
pd.unique(data['Result_encoded'])

array([1., 0.])

## One-Hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Dense one-hot encoder:
#   sparse_output=False  -> return a plain NumPy array instead of a sparse matrix
#   handle_unknown       -> categories unseen during fit are ignored instead of raising
#   drop=None            -> keep one indicator column for every category level
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop=None,
)

In [12]:
# Learn the category levels of both nominal columns and encode them in one pass.
nominal_columns = data[['Gender', 'Country']]
onehot_encoded = onehot_encoder.fit_transform(nominal_columns)

In [13]:
# Wrap the encoded array in a DataFrame with readable column names
# (e.g. Gender_Female, Country_USA).
# index=data.index: the column-wise concat with `data` aligns rows by index
# label, so reuse data's index instead of a fresh 0..n-1 range. Without it,
# any non-default index on `data` would misalign rows and introduce NaNs.
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(['Gender', 'Country']),
    index=data.index,
)

In [14]:
# Append the one-hot indicator columns to the right of the original columns.
# Rows are matched by index label.
encoded_df = pd.concat(
    [data, onehot_encoded_df],
    axis="columns",
)

In [15]:
# Preview the combined frame: original columns followed by the indicator columns.
encoded_df.head(5)

Unnamed: 0,StudentID,Gender,Age,Country,Lessons Completed,Quiz Score,Time Per Week,Result,Result_encoded,Gender_Female,Gender_Male,Country_India,Country_Sri Lanka,Country_USA
0,S001,Male,20.0,India,25,78.5,10,Pass,1.0,0.0,1.0,1.0,0.0,0.0
1,S002,Female,22.0,USA,30,82.0,12,Pass,1.0,1.0,0.0,0.0,0.0,1.0
2,S003,Male,19.0,Sri Lanka,18,45.0,8,Fail,0.0,0.0,1.0,0.0,1.0,0.0
3,S004,Female,21.0,India,28,88.0,9,Pass,1.0,1.0,0.0,1.0,0.0,0.0
4,S005,Female,18.0,USA,15,35.0,5,Fail,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
# Preview only: drop() returns a new frame and the result is not assigned,
# so encoded_df itself still contains these four columns after this cell.
encoded_df.drop(columns=['StudentID', 'Gender', 'Country', 'Result'])

Unnamed: 0,Age,Lessons Completed,Quiz Score,Time Per Week,Result_encoded,Gender_Female,Gender_Male,Country_India,Country_Sri Lanka,Country_USA
0,20.0,25,78.5,10,1.0,0.0,1.0,1.0,0.0,0.0
1,22.0,30,82.0,12,1.0,1.0,0.0,0.0,0.0,1.0
2,19.0,18,45.0,8,0.0,0.0,1.0,0.0,1.0,0.0
3,21.0,28,88.0,9,1.0,1.0,0.0,1.0,0.0,0.0
4,18.0,15,35.0,5,0.0,1.0,0.0,0.0,0.0,1.0
5,,10,25.0,4,0.0,0.0,1.0,0.0,1.0,0.0
6,24.0,32,90.0,14,1.0,0.0,1.0,1.0,0.0,0.0
7,23.0,20,,11,1.0,1.0,0.0,0.0,0.0,1.0
8,25.0,29,77.0,10,1.0,0.0,1.0,1.0,0.0,0.0
9,20.0,14,30.0,6,0.0,1.0,0.0,0.0,1.0,0.0


In [17]:
# Persist the model-ready frame.
# The drop() in the previous cell is not assigned, so encoded_df still carries
# the raw identifier/categorical columns alongside their encoded versions.
# Drop them here so the saved file holds only the numeric and encoded columns
# shown in that preview. encoded_df itself is left untouched, so this cell can
# be re-run safely.
# index=False keeps the row index out of the CSV.
encoded_df.drop(columns=['StudentID', 'Gender', 'Country', 'Result']).to_csv(
    "encoded_df.csv", index=False
)