# Libraries

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [25]:
import warnings
warnings.filterwarnings('ignore')

# Phase 1 : Data Loading and Basic Info

## Load data

In [9]:
# Read the raw student-performance dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/StudentsPerformance.csv")

# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Data basic info 

In [10]:
# (number of rows, number of columns) of the raw frame.
print(df.shape)

(1000, 8)


In [11]:
# Original column labels, exactly as they appear in the CSV header.
print(df.columns)

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [12]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [17]:
# duplicated() flags rows that repeat an earlier row; value_counts() tallies the True/False flags.
print(df.duplicated().value_counts())

False    1000
Name: count, dtype: int64


`No duplicated records.`

In [21]:
# True for any column that contains at least one missing value.
df.isna().any()

gender                         False
race/ethnicity                 False
parental level of education    False
lunch                          False
test preparation course        False
math score                     False
reading score                  False
writing score                  False
dtype: bool

In [22]:
# Number of missing values in each column.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

`No null values.`

In [24]:
# Display the whole frame; the notebook renders a truncated view (first and last rows).
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## phase 1 outputs
**Let's talk about the columns:**
1. gender : the gender of the student
2. race/ethnicity : the group the student belongs to
3. parental level of education : the education level of the student's parents
4. lunch : the student's lunch type (`standard` or `free/reduced`)
5. test preparation course : whether the student completed the test preparation course before the test (`completed` or `none`)
6. math score : the math exam score
7. reading score : the reading exam score
8. writing score : the writing exam score

Columns `1 -> 5` are categorical; columns `6 -> 8` are numeric (each student's score in the three subjects).

# Phase 2 : Data Cleaning

## Rename columns

In [26]:
# Column labels before renaming, for comparison with the renamed frame.
print(df.columns) 

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [114]:
# Replace the verbose CSV headers with short snake_case names.
# `gender` and `lunch` are not in the mapping, so they keep their labels.
df_renamed = df.rename(
    columns={
        "race/ethnicity": "group",
        "parental level of education": "parent_education",
        "test preparation course": "course",
        "math score": "math_score",
        "reading score": "reading_score",
        "writing score": "writing_score",
    }
)

In [115]:
# Confirm the new column labels were applied.
print(df_renamed.columns) 

Index(['gender', 'group', 'parent_education', 'lunch', 'course', 'math_score',
       'reading_score', 'writing_score'],
      dtype='object')


In [116]:
# Take an independent copy: a plain assignment would only alias `df_renamed`,
# so any later in-place change to `clean_df` would silently alter `df_renamed` too.
clean_df = df_renamed.copy()

In [117]:
# Check dtypes and non-null counts of the cleaned frame.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            1000 non-null   object
 1   group             1000 non-null   object
 2   parent_education  1000 non-null   object
 3   lunch             1000 non-null   object
 4   course            1000 non-null   object
 5   math_score        1000 non-null   int64 
 6   reading_score     1000 non-null   int64 
 7   writing_score     1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


## phase 2 outputs 

`clean_df` : the cleaned DataFrame. Some columns were renamed to friendlier names, and the numeric score columns were confirmed to have an integer dtype.  

# Phase 3 : Descriptive Statistics

In [118]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric score columns.
clean_df.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [122]:
# Compute the three medians in one pass, then report each one.
score_medians = clean_df[["math_score", "reading_score", "writing_score"]].median()

print(f" math score median = {score_medians['math_score']}")
print(f" reading score median = {score_medians['reading_score']}")
print(f" writing score median = {score_medians['writing_score']}")

 math score median = 66.0
 reading score median = 70.0
 writing score median = 69.0


In [None]:
# Split the cleaned frame by gender; nothing is aggregated in this cell.
grouped_df = clean_df.groupby("gender")

In [None]:
# Per-gender summary statistics for the three score columns.
# The score columns are named with underscores (`math_score`, ...) after the rename step;
# selecting the hyphenated names ('math-score', ...) raises a KeyError on a fresh run.
score_columns = ['math_score', 'reading_score', 'writing_score']
summary_stats = ['mean', 'median', 'std', 'min', 'max']

# Rows after the transpose are (score column, statistic) pairs; columns are the genders.
result = grouped_df[score_columns].agg(summary_stats).round(3).T

# Flatten the multi-index row labels.
# Replacing '_score' with '__' keeps the existing label format, e.g. 'math___mean'.
result.index = pd.Index(
    [f"{column.replace('_score', '__')}_{stat}" for column, stat in result.index]
)

display(result)

gender,female,male
math___mean,63.633,68.728
math___median,65.0,69.0
math___std,15.491,14.356
math___min,0.0,27.0
math___max,100.0,100.0
reading___mean,72.608,65.473
reading___median,73.0,66.0
reading___std,14.378,13.932
reading___min,17.0,23.0
reading___max,100.0,100.0


`Here we see that female students have a higher mean in two subjects (reading and writing), while male students have a higher mean in math.`

In [120]:
# Frequency of each parental education level, most common first.
clean_df["parent_education"].value_counts()

parent_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

`Here we see the most common parent education is 'some college'`