In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one path per line.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the students-performance CSV inside the Kaggle input directory.
file_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"

# Load the whole file into a DataFrame; every later cell reads from it.
students_perf_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (the default of head()) to see the columns.
students_perf_data.head()

In [None]:
# Let's verify the amount of missing data inside.
# NOTE: comparing `students_perf_data.size` with `students_perf_data.isnull().size`
# cannot detect anything -- isnull() returns a boolean frame of the same shape,
# so both numbers are always rows * columns. Count the True cells instead.
missing_per_column = students_perf_data.isnull().sum()
print(missing_per_column)
print("total cells:", students_perf_data.size)
print("total missing cells:", int(missing_per_column.sum()))
# A total of 0 missing cells means the data set is complete and needs no imputation.

In [None]:
# Compare the scores of students who completed the test preparation course
# with the scores of those who did not.
# Observation recorded by the original analysis: students who finished the
# course have, in general, higher scores; the difference is felt more in the
# writing category and less in the math one.
score_columns = ["math score", "reading score", "writing score"]

# The aggregations are given by name. Passing the builtins min/max to agg()
# is deprecated in recent pandas (FutureWarning); the string names produce the
# same "mean" / "min" / "max" columns.
df_test_preparation = (
    students_perf_data
    .groupby(["test preparation course"])[score_columns]
    .agg(["mean", "min", "max"])
)
print(df_test_preparation)

# One histogram per score, split by course completion status.
for score_column in score_columns:
    sns.displot(data=students_perf_data, x=score_column, hue="test preparation course", kde=False)
    plt.show()


In [None]:
# Do the parents' education levels have any influence on the scores of their children?
# The levels are ordered by the amount of time needed to reach them,
# for example: some high school < high school < some college.
education_levels_ordered = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]

# Mean of each score per education level, rows rearranged into the order above.
mean_scores_by_education = (
    students_perf_data
    .groupby("parental level of education")[["math score", "reading score", "writing score"]]
    .mean()
)
df_parents_data = mean_scores_by_education.reindex(education_levels_ordered)
print(df_parents_data.head(10))
# Original author's reading of the table: the more education the parents have,
# the higher the scores, though the effect doesn't seem that big.

sns.heatmap(data=df_parents_data, annot=True, linewidth=5)

In [None]:
# Does the parents' education relate to whether the child completed the
# test preparation course?

# Denominator: number of students per parental education level.
df_parents_course_aux = (
    students_perf_data
    .groupby(["parental level of education"])["test preparation course"]
    .count()
)

# Numerator: number of students per (education level, course status) pair,
# obtained by counting the non-null "gender" entries in each pair.
df_parents_course = (
    students_perf_data
    .groupby(["parental level of education", "test preparation course"])["gender"]
    .count()
)

# Share of each course status within its education level, as a percentage.
course_share_percent = (df_parents_course / df_parents_course_aux) * 100
print(course_share_percent)
# Original author's conclusion: there is no connection between the parental
# education level and the course completion of the child.

In [None]:
# Is gender linked to the scores?
gender_score_columns = ["math score", "reading score", "writing score"]

# Mean of each score per gender.
gender_score_data = students_perf_data.groupby("gender")[gender_score_columns].mean()

# One bar plot per score, in the same order as the columns above.
for gender_score_column in gender_score_columns:
    sns.barplot(data=students_perf_data, x="gender", y=gender_score_column)
    plt.show()

In [None]:
# Does the race/ethnicity group have any influence on the scores?
# Mean math, writing and reading score per group, shown as an annotated heatmap.
# Original author's reading: group E has clearly higher grades than the other
# groups, while group A has lower ones.
race_data = (
    students_perf_data
    .groupby("race/ethnicity")[["math score", "writing score", "reading score"]]
    .mean()
)
sns.heatmap(data=race_data, annot=True, linewidth=10)

In [None]:
# How are the parental education levels distributed within each race/ethnicity group?
# Counts the non-null "gender" entries per (group, education level) pair.
ethnicity_education_groups = students_perf_data.groupby(
    ["race/ethnicity", "parental level of education"]
)
ethnicity = ethnicity_education_groups["gender"].count()
print(ethnicity)


In [None]:
# Finally, check whether the scores correlate with each other --
# for example, whether a person who is good at math is also good at writing.

scores_df = students_perf_data[["math score", "reading score", "writing score"]]

# Pairwise Pearson correlation between the three selected score columns (3x3).
corelated_scores = scores_df.corr(method="pearson")

# Boolean mask that is True on the diagonal and above it; heatmap leaves
# masked cells blank, so only the lower triangle is drawn.
all_cells = np.ones_like(corelated_scores, dtype=bool)
mask = np.triu(all_cells)

# The mask already has the shape of the 3x3 correlation matrix, so it is
# passed as is.
sns.heatmap(corelated_scores, mask=mask, linewidth=10, cmap="autumn")
# Original author's reading: reading and writing scores are correlated more
# strongly with each other than either of them is with the math score.

