In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This dataset contains reading, writing, and math test scores from 1000 fictional students. I have provided visualizations for what could be several contributing factors to students' exam scores.

In [None]:
# Load the student exam-score table; the later cells all read from `df`.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to check the column names and value formats.
df.head(5)

In [None]:
import matplotlib.pyplot as plt

Notice the discrepancies between males and females in reading and math scores: males score higher in math, while females score higher in reading.

In [None]:
# Split the students by gender; `males` and `females` are reused by the next plot.
males = df[df['gender'] == 'male']
females = df[df['gender'] == 'female']

# Reading vs. math score, one colour per gender (males drawn first, then females).
for group, colour, name in ((males, 'blue', 'Males'), (females, 'pink', 'Females')):
    plt.scatter(group['reading score'], group['math score'], color=colour, label=name)
plt.xlabel('Reading Score')
plt.ylabel('Math Score')
plt.legend()
plt.title('Reading & Math Scores - Male vs Female');


Notice reading and writing scores are highly correlated - therefore I only compare math and reading scores for the rest of my analysis because comparing writing scores as well seems redundant.

In [None]:
# Reading vs. writing score, using the gender subsets built in the previous cell.
for group, colour, name in ((males, 'blue', 'Males'), (females, 'pink', 'Females')):
    plt.scatter(group['reading score'], group['writing score'], color=colour, label=name)
plt.xlabel('Reading Score')
plt.ylabel('Writing Score')
plt.legend()
plt.title('Reading & Writing Scores - Male vs Female');

The correlation matrix shows a high correlation between reading and writing scores and a moderate correlation between math and reading/writing scores.

In [None]:
# Pairwise correlation matrix of the three exam scores.
score_columns = ['reading score', 'writing score', 'math score']
df[score_columns].corr()

Visually, students who complete a test preparation course appear to have a slight advantage in test scores.

In [None]:
# Split the students by whether they completed the test preparation course.
prep_status = df['test preparation course']
course_taken = df[prep_status == 'completed']
course_not_taken = df[prep_status == 'none']

# Draw the "not completed" group first so the colour order matches the legend order.
for group, name in ((course_not_taken, 'Course not Completed'), (course_taken, 'Course Completed')):
    plt.scatter(group['reading score'], group['math score'], label=name)
plt.xlabel('Reading Score')
plt.ylabel('Math Score')
plt.legend()
plt.title('Reading & Math Scores - Course Completed vs Course not Completed');

Students who take the test preparation course have higher average test scores. It is important, however, to realize that correlation does not imply causation. It is possible that taking the test preparation course did not lead to improved average test scores, but rather that the students who took the course were already more motivated to succeed and would have scored higher regardless of whether they participated in it. Determining causation requires more analysis and data.

In [None]:
# Average exam scores for each test-preparation group.
# numeric_only=True restricts the mean to the numeric score columns; without it,
# pandas >= 2.0 tries to average the remaining string columns and raises a TypeError.
df.groupby('test preparation course').mean(numeric_only=True)

Students who receive free/reduced lunches perform worse than those who get the standard lunch. The reasoning behind this finding is that the lunch variable is a proxy for the student's socioeconomic status. Students who receive free/reduced lunches possibly come from a poorer family, and family wealth is a determining factor in a student's academic success.

In [None]:
# Split the students by lunch type.
lunch_type = df['lunch']
lunch = df[lunch_type == 'standard']
free_lunch = df[lunch_type == 'free/reduced']

# Reading vs. math score; the standard-lunch group is drawn first.
for group, name in ((lunch, 'Standard Lunch'), (free_lunch, 'Free/Reduced Lunch')):
    plt.scatter(group['reading score'], group['math score'], label=name)
plt.xlabel('Reading Score')
plt.ylabel('Math Score')
plt.legend()
plt.title('Reading & Math Scores - Standard Lunch vs Free/Reduced Lunch');

In [None]:
# Average exam scores for each lunch type.
# numeric_only=True restricts the mean to the numeric score columns; without it,
# pandas >= 2.0 tries to average the remaining string columns and raises a TypeError.
df.groupby('lunch').mean(numeric_only=True)

An attempt to visualize the effect of parental level of education results in a messy scatter plot, so I will try to visualize some other effects now using density plots.

In [None]:
# Distinct parental education levels, in order of first appearance in the data.
# `education_levels` is reused by the density-plot cell further down.
education_levels = df['parental level of education'].unique().tolist()

# One scatter series per education level, reading vs. math score.
for level in education_levels:
    level_rows = df[df['parental level of education'] == level]
    plt.scatter(level_rows['reading score'], level_rows['math score'], label=level)
plt.legend()
plt.xlabel('Reading Score')
plt.ylabel('Math Score')
plt.title('Reading & Math Scores - Parental Level of Education');

In [None]:
import seaborn as sns

A visualization of test scores shows a strong advantage for students whose parents obtained master's degrees, but the other effects are not obvious because the plot is still crowded. An analysis of the average test scores for students of these groups shows that students whose parents attended at least some college have an advantage over those students whose parents never attended college. Although this finding is not obvious from the visualization, it appears from the fat tails that this result stems from underperforming students who skew their density curve to the left.

In [None]:
# Side-by-side density estimates of math and reading scores, one curve per
# parental education level (levels come from `education_levels` defined earlier).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# (axes, score column, x-axis label, panel title) for each panel.
panels = (
    (ax1, 'math score', 'Math Score',
     'Density Curve Estimation of Parental Education levels effect on Math Scores'),
    (ax2, 'reading score', 'Reading Score',
     'Density Curve Estimation of Parental Education levels effect on Reading Scores'),
)

for level in education_levels:
    level_rows = df[df['parental level of education'] == level]
    for axis, column, _, _ in panels:
        sns.kdeplot(level_rows[column], fill=True, ax=axis, label=level)

for axis, _, x_label, panel_title in panels:
    axis.legend(loc='upper left')
    axis.set(xlabel=x_label, ylabel='Density', title=panel_title)

In [None]:
# Average exam scores for each parental education level.
# numeric_only=True restricts the mean to the numeric score columns; without it,
# pandas >= 2.0 tries to average the remaining string columns and raises a TypeError.
df.groupby('parental level of education').mean(numeric_only=True)

A visualization of race shows a clear distinction of groups D and E performing better on math scores, while the effect is visually less noticeable for reading scores. I included the writing scores plot because it shows again groups D and E outperforming the other groups with a more condensed distribution around a higher score.

In [None]:
# Distinct race/ethnicity groups, in order of first appearance in the data.
races = df['race/ethnicity'].unique().tolist()

# Three side-by-side density estimates (math, reading, writing), one curve per group.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(25, 5))

# (axes, score column, x-axis label, panel title) for each panel.
panels = (
    (ax1, 'math score', 'Math Score',
     'Density Curve Estimation of Race/Ethnicity effect on Math Scores'),
    (ax2, 'reading score', 'Reading Score',
     'Density Curve Estimation of Race/Ethnicity effect on Reading Scores'),
    (ax3, 'writing score', 'Writing Score',
     'Density Curve Estimation of Race/Ethnicity effect on Writing Scores'),
)

for race in races:
    race_rows = df[df['race/ethnicity'] == race]
    for axis, column, _, _ in panels:
        sns.kdeplot(race_rows[column], fill=True, ax=axis, label=race)

for axis, _, x_label, panel_title in panels:
    axis.legend(loc='upper left')
    axis.set(xlabel=x_label, ylabel='Density', title=panel_title)

In [None]:
# Average exam scores for each race/ethnicity group.
# numeric_only=True restricts the mean to the numeric score columns; without it,
# pandas >= 2.0 tries to average the remaining string columns and raises a TypeError.
df.groupby('race/ethnicity').mean(numeric_only=True)