# Analyzing how some factors affect students' grades

# Hypothesis


1. Students from private schools have better test grades than students from public schools - **TRUE**
2. Students who have internet access at home have better test grades - **TRUE**
3. Male students have better test grades than female students - **TRUE**
4. Students living in Vitória (the capital) have better test grades than students living in other cities - **TRUE**
5. White students have better test grades than students of other races - **TRUE**
6. Richer students have better test grades than poorer students - **TRUE**
7. Students whose parents have a higher level of education have better test grades - **TRUE**
8. More people living in the same house produce worse grades - **FALSE**

OBS:

8 - It seems that "traditional families" (3 to 4 people) produce better grades.


# 1.0 Imports

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# 2.0 Data Selection - Espírito Santo

In [None]:
# Read the semicolon-separated ENEM 2019 extract for Espírito Santo and preview it.
df_raw = pd.read_csv(
    filepath_or_buffer="/kaggle/input/microdados-enem-2019-por-estado/Selec_ES.txt",
    sep=";",
)
df_raw.head()

# 2.1 Data Selection - Features of interest

In [None]:
# Questionnaire columns and the readable names used in the rest of the notebook.
# The "Q025 " key keeps its trailing space exactly as it is written in this notebook.
questionnaire_names = {
    "Q001": "EDU_MOM",
    "Q002": "EDU_DAD",
    "Q005": "POP_RES",
    "Q006": "FAM_WEALTH",
    "Q025 ": "INTERNET",
}

# Columns of interest: identification, demographics, grades, then the questionnaire answers.
features = [
    "ID", "TP_SEXO", "NU_IDADE", "TP_COR_RACA", "NO_MUNICIPIO_RESIDENCIA", "TP_ESCOLA",
    "NU_NOTA_CN", "NU_NOTA_CH", "NU_NOTA_LC", "NU_NOTA_MT", "NU_NOTA_REDACAO", "TP_LINGUA",
    *questionnaire_names,
]

df = df_raw[features].rename(columns=questionnaire_names)
df.head()

# 3.0 Data Cleaning/ Preparation

I'll drop every row that has a missing value in any of the selected columns, because I want to analyse only the students who took all four tests.

In [None]:
# Keep only rows with no missing value in any selected column, then add
# TESTS_GRADE, the sum of the four test scores (CN + CH + LC + MT).
# `assign` builds the column on a new frame instead of writing into the result
# of `dropna()`, which can trigger pandas' SettingWithCopyWarning.
df = df.dropna().assign(
    TESTS_GRADE=lambda rows: (
        rows["NU_NOTA_CN"] + rows["NU_NOTA_CH"] + rows["NU_NOTA_LC"] + rows["NU_NOTA_MT"]
    )
)
df.head()


# 4.0 Exploring - Gender 

In [None]:
# Number of students per gender.
# Counting the ID column directly avoids running `describe()` over every numeric
# column just to read a single statistic; the result keeps the "count" column name.
gender_count = df.groupby("TP_SEXO")["ID"].count().rename("count").reset_index()

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=gender_count, x="count", y="TP_SEXO", orient="h", ax=ax)
ax.set(title="Students by gender", xlabel="Number of students", ylabel="Gender (TP_SEXO)");



# 4.1 Exploring - Age 

In [None]:
# Age distribution of the students, followed by its minimum, median and maximum.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases — confirm
# the installed version before upgrading.
fig, ax = plt.subplots(figsize=(14, 6))
f = sns.distplot(df["NU_IDADE"], ax=ax)
age_summary = df["NU_IDADE"].describe()
age_summary[["min", "50%", "max"]]




# Hypothesis 1.0

In [None]:
# Mean TESTS_GRADE for public (TP_ESCOLA == 2) and private (TP_ESCOLA == 3) schools.
# The two school types are selected by label rather than by position, so the
# rows cannot be mislabelled if another TP_ESCOLA code is absent from the data.
# A missing code 2 or 3 raises a KeyError instead of silently plotting the wrong group.
school_labels = {2: "Public", 3: "Private"}
mean_grade_by_school = (
    df.groupby("TP_ESCOLA")["TESTS_GRADE"]
    .mean()
    .loc[list(school_labels)]
    .rename(index=school_labels)
    .rename("mean")  # keep the column name the plot expects
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_grade_by_school, x="mean", y="TP_ESCOLA", orient="h", ax=ax)
ax.set(title="Mean tests grade by school type", xlabel="Mean TESTS_GRADE", ylabel="School type");

In [None]:
# Overlay the TESTS_GRADE distributions of public (TP_ESCOLA == 2) and
# private (TP_ESCOLA == 3) school students on a single axis.
fig, ax = plt.subplots(figsize=(14, 6))
is_public = df["TP_ESCOLA"] == 2
is_private = df["TP_ESCOLA"] == 3
public_grades = df.loc[is_public, "TESTS_GRADE"]
private_grades = df.loc[is_private, "TESTS_GRADE"]
for grades, label in ((public_grades, "Public"), (private_grades, "Private")):
    sns.distplot(grades, label=label, ax=ax)
ax.legend();

# Hypothesis 2.0

In [None]:
# Mean TESTS_GRADE for students without ("A") and with ("B") internet at home.
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
# Labels are mapped from the answer code (whitespace stripped) instead of being
# assigned by position, so they stay correct whatever groups are present.
internet_labels = {"A": "NO", "B": "YES"}
mean_internet_grades = df.groupby("INTERNET")["TESTS_GRADE"].mean().reset_index()
mean_internet_grades["INTERNET"] = (
    mean_internet_grades["INTERNET"].str.strip().replace(internet_labels)
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_internet_grades, x="TESTS_GRADE", y="INTERNET", orient="h", ax=ax)
ax.set(title="Mean tests grade by internet access", xlabel="Mean TESTS_GRADE", ylabel="Internet at home");

In [None]:
# Overlay the TESTS_GRADE distributions of students whose INTERNET answer is
# "A " (plotted as "NO INTERNET") and "B " (plotted as "INTERNET").
fig, ax = plt.subplots(figsize=(14, 6))
has_no_internet = df["INTERNET"] == "A "
has_internet = df["INTERNET"] == "B "
no_internet_grades = df.loc[has_no_internet, "TESTS_GRADE"]
yes_internet_grades = df.loc[has_internet, "TESTS_GRADE"]
for grades, label in ((no_internet_grades, "NO INTERNET"), (yes_internet_grades, "INTERNET")):
    sns.distplot(grades, label=label, ax=ax)
ax.legend();


# Hypothesis 3.0

In [None]:
# Mean TESTS_GRADE per gender.
# The mean is computed directly on TESTS_GRADE instead of through `describe()`,
# which would compute every summary statistic for every numeric column first.
mean_grade_by_gender = (
    df.groupby("TP_SEXO")["TESTS_GRADE"]
    .mean()
    .rename("mean")  # keep the column name the plot expects
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_grade_by_gender, x="mean", y="TP_SEXO", orient="h", ax=ax)
ax.set(title="Mean tests grade by gender", xlabel="Mean TESTS_GRADE", ylabel="Gender (TP_SEXO)");

In [None]:
# Overlay the TESTS_GRADE distributions of female ("F") and male ("M") students.
fig, ax = plt.subplots(figsize=(14, 6))
is_female = df["TP_SEXO"] == "F"
is_male = df["TP_SEXO"] == "M"
female_grades = df.loc[is_female, "TESTS_GRADE"]
male_grades = df.loc[is_male, "TESTS_GRADE"]
for grades, label in ((female_grades, "Female"), (male_grades, "Male")):
    sns.distplot(grades, label=label, ax=ax)
ax.legend();

# Hypothesis 4.0

In [None]:
# Mean TESTS_GRADE per city of residence, highest first.
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
mean_grade_by_city = (
    df.groupby("NO_MUNICIPIO_RESIDENCIA")["TESTS_GRADE"]
    .mean()
    .reset_index()
    .sort_values(by="TESTS_GRADE", ascending=False)
)

# Tall figure so that every city gets a readable bar.
fig, ax = plt.subplots(figsize=(20, 30))
sns.barplot(data=mean_grade_by_city, x="TESTS_GRADE", y="NO_MUNICIPIO_RESIDENCIA", orient="h", ax=ax)
ax.set(title="Mean tests grade by city of residence", xlabel="Mean TESTS_GRADE", ylabel="City");

# Hypothesis 5.0

In [None]:
# Mean TESTS_GRADE per declared race, highest first, excluding "Not declared".
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
race_labels = {0: "Not declared", 1: "White", 2: "Black", 3: "Brown", 4: "Yellow", 5: "Indigene"}
mean_grade_by_race = (
    df.groupby("TP_COR_RACA")["TESTS_GRADE"]
    .mean()
    .rename(index=race_labels)
    .reset_index()
)
# Drop the "Not declared" group by label rather than by position, so a real
# category is never removed when code 0 does not occur in the data.
mean_grade_by_race = mean_grade_by_race[
    mean_grade_by_race["TP_COR_RACA"] != "Not declared"
].sort_values(by="TESTS_GRADE", ascending=False)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_grade_by_race, x="TESTS_GRADE", y="TP_COR_RACA", orient="h", ax=ax)
ax.set(title="Mean tests grade by race", xlabel="Mean TESTS_GRADE", ylabel="Race (TP_COR_RACA)");

# Hypothesis 6.0

In [None]:
# Mean TESTS_GRADE per family-income answer (FAM_WEALTH, renamed from Q006).
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
mean_graded_by_wealth = df.groupby("FAM_WEALTH")["TESTS_GRADE"].mean().reset_index()

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=mean_graded_by_wealth, x="TESTS_GRADE", y="FAM_WEALTH", orient="h", ax=ax)
ax.set(title="Mean tests grade by family income answer", xlabel="Mean TESTS_GRADE", ylabel="Income answer (Q006)");

# Hypothesis 7.0

In [None]:
# Mean TESTS_GRADE per mother's education answer (EDU_MOM, renamed from Q001).
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
# The last answer category is left out of the plot.
# NOTE(review): presumably that is the "don't know" answer — confirm against the codebook.
mean_grade_by_mom_education = (
    df.groupby("EDU_MOM")["TESTS_GRADE"]
    .mean()
    .iloc[:-1]
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_grade_by_mom_education, x="TESTS_GRADE", y="EDU_MOM", orient="h", ax=ax)
ax.set(title="Mean tests grade by mother's education", xlabel="Mean TESTS_GRADE", ylabel="Education answer (Q001)");

In [None]:
# Mean TESTS_GRADE per father's education answer (EDU_DAD, renamed from Q002).
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
# The last answer category is left out of the plot.
# NOTE(review): presumably that is the "don't know" answer — confirm against the codebook.
mean_grade_by_dad_education = (
    df.groupby("EDU_DAD")["TESTS_GRADE"]
    .mean()
    .iloc[:-1]
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=mean_grade_by_dad_education, x="TESTS_GRADE", y="EDU_DAD", orient="h", ax=ax)
ax.set(title="Mean tests grade by father's education", xlabel="Mean TESTS_GRADE", ylabel="Education answer (Q002)");

# Hypothesis 8.0

In [None]:
# Number of students per household size (POP_RES, renamed from Q005).
# Counting the ID column directly avoids running `describe()` over every numeric column.
dist_grade_pop_res = df.groupby("POP_RES")["ID"].count().rename("count")

# Mean TESTS_GRADE per household size, limited to the first seven sizes.
# Only TESTS_GRADE is aggregated: averaging the whole frame also hits the string
# columns, which raises a TypeError on pandas 2.0+.
mean_grade_by_pop_res = df.groupby("POP_RES")["TESTS_GRADE"].mean().reset_index().iloc[0:7]

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=mean_grade_by_pop_res, x="TESTS_GRADE", y="POP_RES", orient="h", ax=ax)
ax.set(title="Mean tests grade by household size", xlabel="Mean TESTS_GRADE", ylabel="People in the household (Q005)");

# Show the per-size student counts next to the plot.
dist_grade_pop_res