# Libraries

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [25]:
import warnings
warnings.filterwarnings('ignore')

# Phase 1 : Data Loading and Inspection

## Load data

In [9]:
# Read the student-performance CSV into the raw frame and preview its first rows.
csv_path = "data/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Data basic info 

In [10]:
# (rows, columns) of the raw frame. Left as the bare last expression so the
# notebook shows it as the cell result instead of routing it through print().
df.shape

(1000, 8)


In [11]:
# Column labels of the raw frame, shown as the cell result rather than printed.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [12]:
# Per-column summary of the raw frame: non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [17]:
# Tally rows flagged as duplicates (True) against rows that are not (False).
# Left as the bare last expression so it is shown as the cell result.
df.duplicated().value_counts()

False    1000
Name: count, dtype: int64


`No duplicated records`

In [21]:
# Per column: is at least one value missing?
df.isna().any()

gender                         False
race/ethnicity                 False
parental level of education    False
lunch                          False
test preparation course        False
math score                     False
reading score                  False
writing score                  False
dtype: bool

In [22]:
# Per column: how many values are missing?
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

`No null values`

In [24]:
# Display the raw frame itself; the rendered output is abbreviated with a "..." row.
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## phase 1 outputs
**Let's talk about the columns:**
1. gender : the gender of the student
2. race/ethnicity : the group the student belongs to (group A to group E)
3. parental level of education : the education level of the student's parents
4. lunch : the type of lunch the student gets (standard or free/reduced)
5. test preparation course : whether the student completed the test preparation course before the test (completed or none)
6. math score : the math exam score
7. reading score : the reading exam score
8. writing score : the writing exam score

Columns `1 -> 5` are categorical columns; columns `6 -> 8` are numeric columns (the score of each student in each subject).

# Phase 2 : Data Cleaning

## Rename columns

In [26]:
# Column labels before renaming, shown as the cell result rather than printed.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [32]:
# Old header -> short header. Columns that are not listed keep their label.
short_names = {
    'race/ethnicity': 'group',
    'parental level of education': 'education',
    'test preparation course': 'course',
    'math score': 'math-score',
    'reading score': 'reading-score',
    'writing score': 'writing-score',
}

# The result is bound to a new name, so `df` keeps its original headers.
df_renamed = df.rename(columns=short_names)

In [33]:
# Column labels after renaming, shown as the cell result rather than printed.
df_renamed.columns

Index(['gender', 'group', 'education', 'lunch', 'course', 'math-score',
       'reading-score', 'writing-score'],
      dtype='object')


In [34]:
# Take an independent copy: a plain assignment would only create a second name
# for the same object, so any later in-place change to clean_df would also
# alter df_renamed.
clean_df = df_renamed.copy()

In [36]:
# Per-column summary of the cleaned frame: non-null counts and dtypes, plus memory usage.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         1000 non-null   object
 1   group          1000 non-null   object
 2   education      1000 non-null   object
 3   lunch          1000 non-null   object
 4   course         1000 non-null   object
 5   math-score     1000 non-null   int64 
 6   reading-score  1000 non-null   int64 
 7   writing-score  1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


## phase 2 outputs 

`clean_df` : the cleaned DataFrame. Several columns were renamed to shorter, friendlier names, and the numeric score columns were confirmed to have an integer dtype (`int64`).