In [83]:
import numpy as np
import pandas as pd

Handling Null Values

In [84]:
# Load the survey export into a DataFrame and preview its first three rows.
survey_csv = 'studentlifedata.csv'
student_table = pd.read_csv(survey_csv)
student_table.head(n=3)

Unnamed: 0,Timestamp,Username,University,Degree,Avg_Attendance,Avg_Study_time_per_day,Extracurricular_Activities,Avg_sleep_time,Household_Size,Workout,Free_time_Activity,University_society_member,Parents_Highest_Qaulification,GPA
0,2023/11/22 9:12:38 PM GMT+5,rayyanlaeeq@gmail.com,Fast,AI,90% - 100%,,Sports,5-7 hours,5-6 members,"Yes, regularly",Reading,No,Master's Degree,3.0 - 3.9
1,2023/11/23 8:25:45 PM GMT+5,,Fast NU,BS AI,90% - 100%,,Sports,5-7 hours,5-6 members,"Yes, regularly",Watching TV/Movies,Yes,Master's Degree,3.0 - 3.9
2,2023/11/23 8:25:48 PM GMT+5,,FAST-NU,BS-AI,90% - 100%,,Gaming,5-7 hours,5-6 members,"Yes, regularly",Watching TV/Movies,No,Master's Degree,3.0 - 3.9


In [85]:
# Report the (rows, columns) dimensions of the loaded table.
table_shape = student_table.shape
print(table_shape)

(209, 14)


In [86]:
# Count the missing entries in every column of the raw table.
null_mask = student_table.isnull()
null_mask.sum()

Timestamp                          0
Username                         208
University                         0
Degree                             0
Avg_Attendance                     0
Avg_Study_time_per_day             3
Extracurricular_Activities         4
Avg_sleep_time                     0
Household_Size                     0
Workout                            0
Free_time_Activity                 0
University_society_member          0
Parents_Highest_Qaulification      0
GPA                                0
dtype: int64

Dropping Unnecessary columns

In [87]:
# Remove the columns that are not carried forward, then confirm the new shape.
unneeded_columns = ['Timestamp', 'Username', 'University']
student_table = student_table.drop(columns=unneeded_columns)
student_table.shape

(209, 11)

Dropping rows that contain null values in the Avg_Study_time_per_day column

In [88]:
# Keep only the rows that have a recorded study time, printing the shape
# on either side of the filter so the number of removed rows is visible.
print("Shape of dataframe before dropping:", student_table.shape)
student_table = student_table.dropna(subset=['Avg_Study_time_per_day'])
print("Shape after dropping:", student_table.shape)

Shape of dataframe before dropping: (209, 11)
Shape after dropping: (206, 11)


In [89]:
# Re-count the missing entries per column after the row filter.
remaining_nulls = student_table.isnull()
remaining_nulls.sum()

Degree                           0
Avg_Attendance                   0
Avg_Study_time_per_day           0
Extracurricular_Activities       4
Avg_sleep_time                   0
Household_Size                   0
Workout                          0
Free_time_Activity               0
University_society_member        0
Parents_Highest_Qaulification    0
GPA                              0
dtype: int64

In [90]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
student_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206 entries, 3 to 208
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Degree                         206 non-null    object
 1   Avg_Attendance                 206 non-null    object
 2   Avg_Study_time_per_day         206 non-null    object
 3   Extracurricular_Activities     202 non-null    object
 4   Avg_sleep_time                 206 non-null    object
 5   Household_Size                 206 non-null    object
 6   Workout                        206 non-null    object
 7   Free_time_Activity             206 non-null    object
 8   University_society_member      206 non-null    object
 9   Parents_Highest_Qaulification  206 non-null    object
 10  GPA                            206 non-null    object
dtypes: object(11)
memory usage: 19.3+ KB


Standardizing the values of the Degree column, since the same degree appears under many different spellings and abbreviations

In [91]:
from rapidfuzz import process

In [92]:
# Canonical degree labels that the free-text Degree entries are matched against.
standardized_degrees = [
    'Bachelors of Artificial Intelligence',
    'Bachelors of Computer Science',
    'Bachelors of Cyber Security',
    'Bachelors of Software Engineering',
    'Bachelors of Information Technology',
    'Bachelors of Biotechnology',
    'BBA',
    'Bachelors of Electrical Engineering',
    'Bachelors of Civil Engineering',
    'Bachelors of Geography',
    'Biology',
    'Bachelors of Media Science',
    'Bachelors of Avionics',
    'Bachelors of Finance and Accounting',
    'Doctor of Pharmacy',
    'Doctor of Physical Therapy',
    'Bachelors of Physics',
    'Bachelors of Textile Engineering',
    'Bachelors of Maritime',
    'Bachelors of Industrial',
]


# Function to match degrees
def standardize_degree(value, choices, *, threshold=80, fallback='Other'):
    """Map a free-text degree entry onto its closest canonical label.

    Parameters
    ----------
    value : str or null
        Raw degree entry. Null values (as judged by pandas) are returned
        unchanged so missing data stays missing.
    choices : collection of str
        Canonical labels to match against.
    threshold : int or float, keyword-only, default 80
        Minimum score reported by ``process.extractOne`` for the best match
        to be accepted.
    fallback : str, keyword-only, default 'Other'
        Label returned when no choice reaches ``threshold`` or when there is
        nothing to match against.

    Returns
    -------
    The best-matching entry of ``choices``, ``fallback`` when the match is
    too weak or absent, or ``value`` itself when it is null.
    """
    if pd.isnull(value):
        return value

    match = process.extractOne(value, choices)  # best (label, score, ...) or None
    # extractOne yields None when it has nothing to match (e.g. empty choices);
    # indexing that result would raise TypeError, so fall back explicitly.
    if match is None:
        return fallback

    best_label, score = match[0], match[1]
    # NOTE(review): matching runs on the raw string; if short abbreviations
    # land in the fallback bucket too often, consider normalising the input
    # before matching — confirm against the raw Degree values.
    return best_label if score >= threshold else fallback

# Replace each raw Degree entry with its standardized label, then preview
# the first seven results.
student_table['Degree'] = student_table['Degree'].apply(
    standardize_degree, args=(standardized_degrees,)
)
student_table['Degree'].head(7)

3                                   Other
4    Bachelors of Artificial Intelligence
5    Bachelors of Artificial Intelligence
6    Bachelors of Artificial Intelligence
7                                   Other
8                                   Other
9                                   Other
Name: Degree, dtype: object

In [93]:
# List the distinct labels present after standardization.
degree_column = student_table['Degree']
degree_column.unique()

array(['Other', 'Bachelors of Artificial Intelligence',
       'Bachelors of Cyber Security', 'Bachelors of Computer Science',
       'Bachelors of Physics', 'BBA', 'Biology',
       'Bachelors of Software Engineering',
       'Bachelors of Information Technology',
       'Bachelors of Media Science', 'Bachelors of Geography',
       'Bachelors of Finance and Accounting', 'Doctor of Pharmacy',
       'Bachelors of Biotechnology', 'Bachelors of Textile Engineering',
       'Bachelors of Civil Engineering',
       'Bachelors of Electrical Engineering',
       'Doctor of Physical Therapy'], dtype=object)

In [94]:
# Inspect the rows whose degree fell into the 'Other' bucket.
is_other = student_table['Degree'] == 'Other'
student_table.loc[is_other]

Unnamed: 0,Degree,Avg_Attendance,Avg_Study_time_per_day,Extracurricular_Activities,Avg_sleep_time,Household_Size,Workout,Free_time_Activity,University_society_member,Parents_Highest_Qaulification,GPA
3,Other,80%-90%,5-6 hours,Academic Clubs,5-7 hours,3-4 members,Occasionally,Watching TV/Movies,Yes,Bachelor's Degree,2.0 - 2.9
7,Other,80%-90%,Less than 1 hour,Sports,5-7 hours,5-6 members,Occasionally,Playing Games,Yes,Bachelor's Degree,2.0 - 2.9
8,Other,80%-90%,1-2 hours,Sports,Less than 5 hours,5-6 members,"No, I don't",Socializing with Friends,No,High School Diploma or Below,2.0 - 2.9
9,Other,80%-90%,1-2 hours,Sports,5-7 hours,5-6 members,"No, I don't",Reading,No,High School Diploma or Below,3.0 - 3.9
10,Other,80%-90%,3-4 hours,Work on my skills but i am not consistent with...,5-7 hours,5-6 members,"No, I don't",Socializing with Friends,Yes,High School Diploma or Below,3.0 - 3.9
...,...,...,...,...,...,...,...,...,...,...,...
201,Other,80%-90%,Less than 1 hour,Sports,5-7 hours,5-6 members,Occasionally,Socializing with Friends,No,High School Diploma or Below,3.0 - 3.9
202,Other,80%-90%,3-4 hours,Arts and Culture,5-7 hours,7 or more members,Occasionally,Watching TV/Movies,No,High School Diploma or Below,2.0 - 2.9
203,Other,90% - 100%,3-4 hours,Volunteer Work,5-7 hours,7 or more members,"Yes, regularly",Playing football,No,High School Diploma or Below,2.0 - 2.9
204,Other,90% - 100%,1-2 hours,Arts and Culture;Volunteer Work,7-9 hours,7 or more members,"No, I don't",Reading,No,High School Diploma or Below,3.0 - 3.9


One-Hot Encoding Degree column

In [95]:
# Build one indicator column per standardized degree label and preview it.
degree_enc = pd.get_dummies(data=student_table['Degree'])
degree_enc.head(n=5)

Unnamed: 0,BBA,Bachelors of Artificial Intelligence,Bachelors of Biotechnology,Bachelors of Civil Engineering,Bachelors of Computer Science,Bachelors of Cyber Security,Bachelors of Electrical Engineering,Bachelors of Finance and Accounting,Bachelors of Geography,Bachelors of Information Technology,Bachelors of Media Science,Bachelors of Physics,Bachelors of Software Engineering,Bachelors of Textile Engineering,Biology,Doctor of Pharmacy,Doctor of Physical Therapy,Other
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
