# LAB 2 : Data Preprocessing Tools

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import Dataset

In [2]:
# Load the student-performance data into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
dataset = pd.read_csv("Students_Performance_mv.csv")

## EDA Steps

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
dataset.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# List the column labels of the loaded frame.
dataset.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [5]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(1000, 8)

In [6]:
# Per-column dtype and non-null count; a non-null count below the number of
# rows means that column contains missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               989 non-null    object
 2   parental level of education  979 non-null    object
 3   lunch                        988 non-null    object
 4   test preparation course      996 non-null    object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numerical features
dataset.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [8]:
# Summary statistics (count, unique, top, freq) for the categorical (object-dtype) features
dataset.describe(include = 'object')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
count,1000,989,979,988,996
unique,2,5,6,2,2
top,female,group C,associate's degree,standard,none
freq,518,315,219,637,639


## Preprocessing Steps

### Step 1 : Divide dataframe into input and output Features

In [11]:
# Inputs (X): every column except the last one.
# Output (Y): the last column only, i.e. the value to be predicted.
X, Y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [12]:
# Inspect the input features.
print(X)

     gender race/ethnicity parental level of education         lunch  \
0    female        group B           bachelor's degree      standard   
1    female        group C                some college      standard   
2    female        group B             master's degree      standard   
3      male        group A          associate's degree  free/reduced   
4      male        group C                some college      standard   
..      ...            ...                         ...           ...   
995  female        group E             master's degree      standard   
996    male        group C                 high school  free/reduced   
997  female        group C                 high school  free/reduced   
998  female        group D                some college      standard   
999  female        group D                some college  free/reduced   

    test preparation course  math score  reading score  
0                      none          72             72  
1                 com

In [13]:
# Inspect the output (target) column.
print(Y)

0      74
1      88
2      93
3      44
4      75
       ..
995    95
996    55
997    65
998    77
999    86
Name: writing score, Length: 1000, dtype: int64


### Step 2: Handle the missing values in Dataset

In [None]:
from sklearn.impute import SimpleImputer

# Impute the input features X (the frame the later steps consume).
# Work on an explicit copy so the column assignments below never write into
# a slice of `dataset`.
X = X.copy()

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Impute missing values in numerical columns using mean
# (the imputer returns floats, so integer columns become float columns)
num_imputer = SimpleImputer(strategy='mean')
X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])

# Impute missing values in categorical columns using mode (most frequent value)
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# NOTE(review): both imputers are fitted on the full data before the
# train/test split, so test rows influence the fill values. Fit on the
# training split only if strict leakage-free evaluation is required.

# Check if missing values are handled: every per-column count must be zero
missing_values_after_imputation = X.isnull().sum()
assert missing_values_after_imputation.sum() == 0, "X still contains missing values"
missing_values_after_imputation

In [None]:
# Show X again following the missing-value step above.
print(X)

### Step 3 : Encoding categorical data

#### A. Encoding the Independent Variable (i/p feature/X)

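In X, the categorical features are gender, race/ethnicity, parental level of education, lunch, and test preparation course.

Hence, we encode all five of them with a one-hot encoder; the numerical columns are passed through unchanged.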

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Positions (within X) of the categorical columns: gender, race/ethnicity,
# parental level of education, lunch, test preparation course.
categorical_columns = [0, 1, 2, 3, 4]

# Apply One-Hot Encoding using ColumnTransformer.
# remainder='passthrough' keeps the non-listed (numerical) columns and appends
# them after the encoded ones.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix, and np.array() of a sparse
# matrix yields a 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Transform X (from here on X is a NumPy array, no longer a DataFrame)
X = np.array(ct.fit_transform(X))

In [None]:
# Show X after the encoding cell above (X is now a NumPy array).
print(X)

### Step 4 : Splitting Data into Training and Testing

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,
                                                Y,
                                        test_size =0.3,
                                        random_state = 1)

In [None]:
# Show the training split of the input features.
print(X_train)

### Step 5 : Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# The encoding step used remainder='passthrough', so the one-hot columns come
# first and the numerical features (math score, reading score) are the
# trailing columns of the matrix.
n_numeric_features = 2

# astype(float) returns a copy, so the arrays produced by train_test_split
# are not modified in place, and guarantees a float dtype for the scaled values.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# Fit the scaler on the training split only, then apply the same
# mean/std to the test split so no test information leaks into training.
# The one-hot columns are already 0/1 and are left unscaled.
scaler = StandardScaler()
X_train[:, -n_numeric_features:] = scaler.fit_transform(X_train[:, -n_numeric_features:])
X_test[:, -n_numeric_features:] = scaler.transform(X_test[:, -n_numeric_features:])

# NOTE: this cell overwrites X_train / X_test; re-run the split cell before
# re-running this one, otherwise the data is scaled twice.

In [None]:
# Show the training features after the step above.
print(X_train)