In [2]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import AutoMinorLocator
import matplotlib.ticker as ticker

student = pandas.read_csv('StudentPerformanceFactors.csv', sep = ",")
student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

To optimize our analysis we will try to eliminate all duplicate. Luckily no duplicate was detected.

In [4]:
previous_length = student.shape[0]
student.drop_duplicates(inplace = True)
if(previous_length > student.shape[0]):
    print(f"We erased {previous_length - student.shape[0]} duplicates.")

Dealing with variables that can have values of two or more types is not ideal. As we can see below, three of our variables have two types.

In [8]:
def check_mutliple_type(df: pandas.DataFrame):
    column_to_change = []
    types_per_column = []
    for col in student.columns:
        unique_types = df[col].apply(type).unique()
        if len(unique_types) > 1:
            column_to_change.append(col)
            types_per_column.append(unique_types)
            print(col, unique_types)
    return column_to_change
column_to_change = check_mutliple_type(student)

Teacher_Quality [<class 'str'> <class 'float'>]
Parental_Education_Level [<class 'str'> <class 'float'>]
Distance_from_Home [<class 'str'> <class 'float'>]


These variables are the same 3 that does not have 6607 non-null values. I will assume that all float values are only null values. I could just change the type of the variables to string and continue the analysis but there will still be some missing values. To continue the analysis I have to replace those missing values. I am not sure I will find the accurate values that can replace the missing values. So I will remove the lines with missing values but I will keep them in a separate file in order to test the AI we will build on these missing data.

In [11]:
#extraction of the values
lines_to_extract = pandas.DataFrame()
for col in column_to_change:
    lines_to_extract = pandas.concat([lines_to_extract, student.loc[student[col].isnull()]])
lines_to_extract.drop_duplicates()
lines_to_extract.to_csv('StudentPerformanceFactore_NAN_values.csv')
lines_to_extract.shape


(235, 20)

In [13]:
#remove extracted data
student = student.merge(lines_to_extract, how = "outer", 
                        indicator=True).query('_merge=="left_only"').drop(columns='_merge')
student.to_csv("StudentPerformanceFactors_Cleaned.csv")
student.shape

(6378, 20)

Now we can see that by extracting the row with missing values, all columns have a unique type. Therefore we can continue our analysis correctly.

In [16]:
column_to_change = check_mutliple_type(student)

The next thing I will do to clean the dataset is trying to find some extraordinary values that might distorting my analysis. For now I only check the quantitative variables. I only found 1 extraordinary value, that is not so extraordinary. I will assume that having 101 at an exam is possible if the teacher give bonus points. So I will keep that line in the dataset.

In [19]:
student.loc[student['Hours_Studied'] > 24*7]
print("----------------------------------------")
student.loc[student['Attendance'] > 100]
print("----------------------------------------")
student.loc[student['Sleep_Hours'] > 15]
print("----------------------------------------")
student.loc[student['Tutoring_Sessions'] > 30]
print("----------------------------------------")
student.loc[student['Physical_Activity'] > 24*7]
print("----------------------------------------")
student.loc[student['Previous_Scores'] > 100]
print("----------------------------------------")
student.loc[student['Exam_Score'] > 100]

----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
5920,27,98,Low,Medium,Yes,6,93,Low,No,5,High,High,Public,Positive,3,No,High School,Moderate,Female,101


To finish the dataset cleaning we will check the extraordinary values on qualitative variables based on the information given on the kaggle data card of the dataset. As we can see below, there is no extraordinary values so we are good to go !

In [22]:
print(student[~student['Parental_Involvement'].isin(['Low', 'Medium', 'High'])])
print("----------------------------------------")
print(student[~student['Access_to_Resources'].isin(['Low', 'Medium', 'High'])])
print("----------------------------------------")
print(student[~student['Extracurricular_Activities'].isin(['Yes', 'No'])])
print("----------------------------------------")
print(student[~student['Motivation_Level'].isin(['Low', 'Medium', 'High'])])
print("----------------------------------------")
print(student[~student['Internet_Access'].isin(['Yes', 'No'])])
print("----------------------------------------")
print(student[~student['Family_Income'].isin(['Low', 'Medium', 'High'])])
print("----------------------------------------")
print(student[~student['Teacher_Quality'].isin(['Low', 'Medium', 'High'])])
print("----------------------------------------")
print(student[~student['School_Type'].isin(['Public', 'Private'])])
print("----------------------------------------")
print(student[~student['Learning_Disabilities'].isin(['Yes', 'No'])])
print("----------------------------------------")
print(student[~student['Gender'].isin(['Male', 'Female'])])
print("----------------------------------------")
print(student[~student['Peer_Influence'].isin(['Positive', 'Neutral', 'Negative'])])
print("----------------------------------------")
print(student[~student['Parental_Education_Level'].isin(['High School', 'College', 'Postgraduate'])])
print("----------------------------------------")
print(student[~student['Distance_from_Home'].isin(['Near', 'Moderate', 'Far'])])
print("----------------------------------------")

Empty DataFrame
Columns: [Hours_Studied, Attendance, Parental_Involvement, Access_to_Resources, Extracurricular_Activities, Sleep_Hours, Previous_Scores, Motivation_Level, Internet_Access, Tutoring_Sessions, Family_Income, Teacher_Quality, School_Type, Peer_Influence, Physical_Activity, Learning_Disabilities, Parental_Education_Level, Distance_from_Home, Gender, Exam_Score]
Index: []
----------------------------------------
Empty DataFrame
Columns: [Hours_Studied, Attendance, Parental_Involvement, Access_to_Resources, Extracurricular_Activities, Sleep_Hours, Previous_Scores, Motivation_Level, Internet_Access, Tutoring_Sessions, Family_Income, Teacher_Quality, School_Type, Peer_Influence, Physical_Activity, Learning_Disabilities, Parental_Education_Level, Distance_from_Home, Gender, Exam_Score]
Index: []
----------------------------------------
Empty DataFrame
Columns: [Hours_Studied, Attendance, Parental_Involvement, Access_to_Resources, Extracurricular_Activities, Sleep_Hours, Previou