# Importing required libraries

In [33]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Reading Dataframe

In [34]:
df = pd.read_csv('StudentsPerformance.csv') 

In [35]:
df.head(10) # Checking top 10 rows

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72.0,74.0
1,female,group C,some college,standard,completed,69,90.0,88.0
2,female,group B,master's degree,standard,none,90,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47,57.0,44.0
4,male,group C,some college,standard,none,76,78.0,75.0
5,female,group B,associate's degree,standard,none,71,83.0,78.0
6,female,group B,some college,standard,completed,88,95.0,92.0
7,male,group B,some college,free/reduced,none,40,43.0,39.0
8,male,group D,high school,free/reduced,completed,64,64.0,67.0
9,female,group B,high school,free/reduced,none,38,60.0,50.0


# Data Preprocessing

In [36]:
df.isna() # Checking if value is null

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


# Calculating total null values per column

In [37]:
df.isna().sum() 

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     8
reading score                  6
writing score                  9
dtype: int64

In [38]:
df.describe()

Unnamed: 0,reading score,writing score
count,994.0,991.0
mean,68.008048,69.487386
std,16.60227,29.563757
min,3.0,10.0
25%,58.0,57.0
50%,69.5,69.0
75%,79.0,79.0
max,100.0,567.0


# Checking datatypes

In [39]:
df.dtypes

gender                          object
race/ethnicity                  object
parental level of education     object
lunch                           object
test preparation course         object
math score                      object
reading score                  float64
writing score                  float64
dtype: object

# Separating Numerical and Object data

In [40]:
# Build the "is this column object-typed?" mask once and use it for both groups.
is_object_column = df.dtypes == 'object'
categorical_var = df.columns[is_object_column]
numerical_var = df.columns[~is_object_column]
# Show the numerical column names first, then the object-typed ones.
for column_group in (numerical_var, categorical_var):
    print(column_group)

Index(['reading score', 'writing score'], dtype='object')
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score'],
      dtype='object')


# Printing Numerical Data

In [41]:
df[numerical_var]

Unnamed: 0,reading score,writing score
0,72.0,74.0
1,90.0,88.0
2,95.0,93.0
3,57.0,44.0
4,78.0,75.0
...,...,...
995,99.0,95.0
996,55.0,55.0
997,71.0,65.0
998,78.0,77.0


# Printing Categorical Data

In [42]:
df[categorical_var]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,female,group B,bachelor's degree,standard,none,72
1,female,group C,some college,standard,completed,69
2,female,group B,master's degree,standard,none,90
3,male,group A,associate's degree,free/reduced,none,47
4,male,group C,some college,standard,none,76
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88
996,male,group C,high school,free/reduced,none,62
997,female,group C,high school,free/reduced,completed,59
998,female,group D,some college,standard,completed,68


# Shape of dataframe in [rows, columns]

In [43]:
df.shape

(1000, 8)

# Size of dataframe (Number of cells)

In [44]:
df.size

8000

# Data Formatting

In [45]:
df1 = df.copy()

# Dropping rows with null values from the copy

In [46]:
df1 = df1.dropna()

# Checking null values

In [47]:
df1.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [48]:
# "math score" is object-typed (it holds "?" placeholders next to the digits),
# so coerce it to numbers before filtering: comparing the raw strings with 0
# raises a TypeError. Values that cannot be parsed become NaN.
math_numeric = pd.to_numeric(df1["math score"], errors="coerce")
# Keep only rows whose math score parsed as a non-negative number; this removes
# the "?" placeholders and any negative entries in one step. The explicit copy
# gives later column assignments on df1 an independent frame to write to.
df1 = df1[math_numeric.notna() & (math_numeric >= 0)].copy()

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   math score                   992 non-null    object 
 6   reading score                994 non-null    float64
 7   writing score                991 non-null    float64
dtypes: float64(2), object(6)
memory usage: 62.6+ KB


# Converting math score type into int64

In [54]:
# Cast only the "math score" column to int64; all other columns keep their dtype.
df1 = df1.astype({"math score": "int64"})

In [55]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 942 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       942 non-null    object 
 1   race/ethnicity               942 non-null    object 
 2   parental level of education  942 non-null    object 
 3   lunch                        942 non-null    object 
 4   test preparation course      942 non-null    object 
 5   math score                   942 non-null    int64  
 6   reading score                942 non-null    float64
 7   writing score                942 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 66.2+ KB


In [56]:
df1.describe()

Unnamed: 0,math score,reading score,writing score
count,942.0,942.0,942.0
mean,66.43949,68.468153,69.629512
std,17.667794,15.988856,30.124568
min,0.0,3.0,10.0
25%,57.0,59.0,58.0
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,334.0,100.0,567.0


# Label Encoding for test preparation course

In [60]:
# Encode the course status as 0 ('none') / 1 ('completed').
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object and is not guaranteed to update `df`
# (it has no effect under pandas Copy-on-Write).
df['test preparation course'] = df['test preparation course'].replace(
    {'none': 0, 'completed': 1}
)

In [61]:
df[categorical_var]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,female,group B,bachelor's degree,standard,0,72
1,female,group C,some college,standard,1,69
2,female,group B,master's degree,standard,0,90
3,male,group A,associate's degree,free/reduced,0,47
4,male,group C,some college,standard,0,76
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,1,88
996,male,group C,high school,free/reduced,0,62
997,female,group C,high school,free/reduced,1,59
998,female,group D,some college,standard,1,68


# Data Normalization - MinMax

In [73]:
# Take an independent copy of the two score columns that will be normalised.
df2 = df.loc[:, ['reading score', 'writing score']].copy()

In [74]:
def min_max_scaling(data):
    """Rescale values linearly so the smallest maps to 0 and the largest to 1.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, or iterable of numbers
        A DataFrame is scaled column by column; a Series or a plain
        iterable is scaled as a single sequence.

    Returns
    -------
    pandas.DataFrame, pandas.Series, or list
        A pandas object of the same kind as the input, or a list of floats
        for any other iterable. When all values are equal (zero range),
        every value is mapped to 0.0 instead of dividing by zero.
    """
    if isinstance(data, pd.DataFrame):
        # Scale each column independently via the Series branch below.
        return data.apply(min_max_scaling)

    if isinstance(data, pd.Series):
        min_value = data.min()  # pandas skips NaN by default
        value_range = data.max() - min_value
        shifted = (data - min_value).astype(float)
        # A constant series has zero range; the shifted values are already 0.
        return shifted if value_range == 0 else shifted / value_range

    values = list(data)  # materialise once so generators can be read twice
    if not values:
        return []
    min_value = min(values)
    value_range = max(values) - min_value
    if value_range == 0:
        return [0.0 for _ in values]
    return [(x - min_value) / value_range for x in values]

In [75]:
# `NormalizeData` is not defined in the visible notebook, so the original call
# raised a NameError. Scale each column to the [0, 1] range with a column-wise
# min-max transform: (value - column minimum) / (column maximum - column minimum).
normal_df = (df2 - df2.min()) / (df2.max() - df2.min())

In [76]:
print("Normalized Dataframe : \n", normal_df)

Normalized Dataframe : 
      reading score  writing score
0         0.711340       0.114901
1         0.896907       0.140036
2         0.948454       0.149013
3         0.556701       0.061041
4         0.773196       0.116697
..             ...            ...
995       0.989691       0.152603
996       0.536082       0.080790
997       0.701031       0.098743
998       0.773196       0.120287
999       0.855670       0.136445

[1000 rows x 2 columns]
