# Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
import matplotlib.pyplot as plt # data visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Acquire

In [None]:
# Read the students' performance CSV into the notebook-wide DataFrame `df`
# and display its first rows as a quick sanity check.
path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(path)
df.head()

In [None]:
# Report the dataset dimensions: rows are observations, columns are variables
n_observations, n_variables = df.shape
print(f"There are {n_observations} observations and {n_variables} variables in the dataset.")

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
# The original author notes that the result is 0 (no duplicates).
df.duplicated().sum()

In [None]:
# For each column, report whether it contains at least one missing value.
# The original author notes that every column is False (no missing values).
df.isna().any()

* **CATEGORICAL VARIABLES** (object)
    * **gender** : female/male
    * **race/ethnicity** : group A/B/C/D/E
    * **parental level of education**
    * **lunch** : standard - free/reduced
    * **test preparation course** : none / completed
* **NUMERICAL VARIABLES** (int)
    * **math score** 
    * **reading score** 
    * **writing score** 

# Prepare

In [None]:
# Normalise the column names: upper-case them and replace spaces with "_"
df.columns = df.columns.map(lambda name: name.upper().replace(" ", "_"))
df.head()

In [None]:
# Convert the descriptive (object) columns to the pandas "category" dtype
for column in df.loc[:, "GENDER":"TEST_PREPARATION_COURSE"].columns:
    df[column] = df[column].astype("category")

# Parental level of education is ordinal, so list the levels from lowest to highest.
# "some high school" (high school not completed) ranks below "high school".
education_levels = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
df["PARENTAL_LEVEL_OF_EDUCATION"] = pd.Categorical(
    df["PARENTAL_LEVEL_OF_EDUCATION"], categories=education_levels, ordered=True
)

# Sanity checks: the column is an ordered categorical, and no value was silently
# turned into NaN because it was missing from `education_levels`.
assert df["PARENTAL_LEVEL_OF_EDUCATION"].dtype == "category"
assert df["PARENTAL_LEVEL_OF_EDUCATION"].cat.ordered
assert df["PARENTAL_LEVEL_OF_EDUCATION"].notna().all(), "unexpected education level in the data"

In [None]:
# Add "TOTAL_SCORE": the mean of the three exam scores, rounded to 2 decimals
exam_total = df[["MATH_SCORE", "READING_SCORE", "WRITING_SCORE"]].sum(axis=1)
df["TOTAL_SCORE"] = exam_total.div(3).round(2)
df.head()

In [None]:
# Map the average score to a letter grade using fixed score thresholds.
# Bins are right-closed: (44, 52] -> "FD", (52, 60] -> "DD", ..., (92, inf) -> "AA".
# include_lowest=True also closes the first bin on the left ([0, 44] -> "FF"),
# so an average of exactly 0 is graded "FF" instead of becoming NaN.
lg_scores = [0, 44, 52, 60, 68, 76, 84, 92, np.inf]
lg_labels = ["FF", "FD", "DD", "CC", "CB", "BB", "BA", "AA"]
df["LETTER_GRADE"] = pd.cut(df["TOTAL_SCORE"], bins=lg_scores, labels=lg_labels, include_lowest=True)

In [None]:
# Title-case the labels of the descriptive columns (e.g. "female" -> "Female").
# The result is stored as plain strings (object dtype), which the later
# describe(include = "object") cell relies on.
for column in df.loc[:, "GENDER":"TEST_PREPARATION_COURSE"].columns:
    df[column] = df[column].astype(object).str.title()
df.head()

# Explore

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

* The average score is between 66 and 70 for all three tests.
* The mean and median are close to each other.
* No student scored 0 on any test except math.

In [None]:
# Summaries (count, unique, top, freq) of the string (object dtype) columns.
# LETTER_GRADE comes from pd.cut and is a "category" column, so it is not included here.
df.describe(include = "object")

* The number of **female** students is higher than the number of **male** students.
* **Group C** has the most students.
* The most common parental level of education is **some college**.
* Students usually pay the **standard** rate for lunch.
* The number of students who did **not** take the **test preparation course** (`None`) is higher than the number of students who completed it.

In [None]:
# Mean of every numeric column per gender.
# numeric_only=True is required on pandas >= 2.0, where averaging the string and
# categorical columns raises a TypeError instead of silently dropping them.
df.groupby("GENDER").mean(numeric_only=True)

* **Male** students score higher in **math** than **female** students, while female students do better in **reading**, **writing** and on the **total** score.

In [None]:
# Median and mean TOTAL_SCORE for each (parental education, test preparation) pair.
# The aggregations are given by name: passing the NumPy callables (np.median, np.mean)
# triggers a FutureWarning on recent pandas versions.
# margins=True adds an "All" row with the overall median / mean.
df.pivot_table(values="TOTAL_SCORE",
               index=["PARENTAL_LEVEL_OF_EDUCATION", "TEST_PREPARATION_COURSE"],
               aggfunc=["median", "mean"],
               margins=True)

## Distributions of Scores

In [None]:
# Histograms (with KDE curves) of the three exam scores, side by side on a shared y-axis
sns.set_palette("dark")
sns.set_style("whitegrid")
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(16, 9))
for axis, score_column in zip(ax, ["MATH_SCORE", "READING_SCORE", "WRITING_SCORE"]):
    sns.histplot(df[score_column], ax=axis, kde=True)
# The title sits on the middle panel so it appears centred above the figure
ax[1].set_title("Distribution of Scores", y=1.03)
plt.show()

In [None]:
# Distribution of the average score: histogram, KDE curve and a rug of single observations
g = sns.displot(data=df, x="TOTAL_SCORE", kde=True, rug=True)
g.fig.suptitle("Distribution of Total Score", y=1.03)
plt.show()

## Relationships Between Scores

In [None]:
# Pairwise correlations between the numeric score columns.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr() raises
# on the string / categorical columns instead of silently ignoring them.
score_correlations = df.corr(numeric_only=True)
g = sns.heatmap(score_correlations)
g.set_title("Correlation Heatmap", y=1.03)
plt.show()

In [None]:
# Regression plots for every pair of exam scores, one pair per panel.
# The palette is set before the axes are created so that the cell does not
# depend on whichever palette an earlier cell left active.
sns.set_palette("dark")
fig, (ax0, ax1, ax2) = plt.subplots(1, 3, sharey=True, figsize=(16, 9))

# MATH_SCORE - READING_SCORE
sns.regplot(x="MATH_SCORE", y="READING_SCORE", data=df, ax=ax0)

# MATH_SCORE - WRITING_SCORE
sns.regplot(x="MATH_SCORE", y="WRITING_SCORE", data=df, ax=ax1)
ax1.set_title("Regression Plots of Scores", y=1.03)

# READING_SCORE - WRITING_SCORE
# (the original repeated the MATH_SCORE - READING_SCORE pair in this panel)
sns.regplot(x="READING_SCORE", y="WRITING_SCORE", data=df, ax=ax2)
plt.show()

### Scores - Test Preparation Course

In [None]:
# One strip plot per exam score, split by test preparation course status
fig, axes = plt.subplots(1, 3, figsize=(16, 9))
sns.set_palette(["red", "green"])
for axis, score_column in zip(axes, ["MATH_SCORE", "READING_SCORE", "WRITING_SCORE"]):
    sns.stripplot(data=df, x="TEST_PREPARATION_COURSE", y=score_column, ax=axis)
# The title sits on the middle panel so it appears centred above the figure
axes[1].set_title("Strip Plots of Scores - Test Preparation Course", y=1.03)
plt.show()

## Number of Categories of Variables That May Affect Grades

### Gender

In [None]:
# Number of students per gender: printed as a table, then drawn as a bar chart
gender_counts = df["GENDER"].value_counts()
print(gender_counts)
gender_colors = {"Female": "pink", "Male": "cyan"}
g = sns.countplot(data=df, x="GENDER", palette=gender_colors)
g.set_title("Counts of Gender", y=1.02)
plt.show()

### Race/Ethnicity

In [None]:
# Number of students per race/ethnicity group, with the bars in alphabetical group order
fig, ax = plt.subplots(figsize=(10, 6))
custom_palette = ["r", "g", "orange", "b", "y", "c"]
sns.set_palette(custom_palette)
print(df["RACE/ETHNICITY"].value_counts())
students_by_group = df.sort_values(by="RACE/ETHNICITY")
g = sns.countplot(data=students_by_group, x="RACE/ETHNICITY", ax=ax)
g.set_title("Counts of Race/Ethnicity", y=1.02)
plt.show()

### Parental Level of Education

In [None]:
# Number of students per parental education level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.set_palette("Blues")
print(df["PARENTAL_LEVEL_OF_EDUCATION"].value_counts())
# The variable is ordinal, so the bars are drawn from the lowest to the highest
# level instead of alphabetically. The labels are the title-cased values stored in
# the column (str.title() capitalises the "S" after the apostrophe).
education_order = ["Some High School", "High School", "Some College",
                   "Associate'S Degree", "Bachelor'S Degree", "Master'S Degree"]
g = sns.countplot(x="PARENTAL_LEVEL_OF_EDUCATION", data=df, order=education_order, ax=ax)
g.set_title("Counts of Parental Education Level", y=1.02)
g.set(xlabel="Education Level", ylabel="Count")
plt.xticks(rotation=90)
plt.show()

### Lunch

In [None]:
# Number of students per lunch type, with the bars in alphabetical order
sns.set_palette(["green", "red"])
print(df["LUNCH"].value_counts())
students_by_lunch = df.sort_values(by="LUNCH")
g = sns.countplot(data=students_by_lunch, x="LUNCH")
g.set_title("Lunch", y=1.02)
g.set(xlabel="Lunch", ylabel="Count")
plt.show()

### Test Preparation Course

In [None]:
# Number of students per test preparation status, with the bars in alphabetical order
sns.set_palette(["black", "gray"])
print(df["TEST_PREPARATION_COURSE"].value_counts())
students_by_course = df.sort_values(by="TEST_PREPARATION_COURSE")
g = sns.countplot(data=students_by_course, x="TEST_PREPARATION_COURSE")
g.set_title("Test Preparation Course", y=1.02)
g.set(xlabel="Test Preparation Course", ylabel="Count")
plt.show()

## Factors That Can Affect Scores

### Parental Level of Education

In [None]:
# Box plot of the average score for each parental education level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.set_palette("Blues")
# Education levels from lowest to highest; "Some High School" (not completed) ranks
# below "High School". The labels are the title-cased values stored in the column
# (str.title() capitalises the "S" after the apostrophe).
# `order` is reused by the following parental-education plots.
order = ["Some High School", "High School", "Some College",
         "Associate'S Degree", "Bachelor'S Degree", "Master'S Degree"]
g = sns.boxplot(x="TOTAL_SCORE", y="PARENTAL_LEVEL_OF_EDUCATION", data=df, order=order, ax=ax)
g.set_title("Basic Statistics of Parental Level of Education", y=1.02)
plt.show()

In [None]:
# Mean average score per parental education level (bars follow the shared `order` list)
fig, ax = plt.subplots(figsize=(10, 6))
sns.set_palette("Blues")
g = sns.barplot(
    data=df,
    x="TOTAL_SCORE",
    y="PARENTAL_LEVEL_OF_EDUCATION",
    order=order,
    ax=ax,
)
g.set_title("Basic Statistics of Parental Level of Education", y=1.02)
plt.show()

### Parental Level of Education - Gender

In [None]:
# Mean average score per parental education level, split by gender
fig, ax = plt.subplots(figsize=(16, 9))
sns.set_palette(["pink", "cyan"])
g = sns.barplot(
    data=df,
    x="PARENTAL_LEVEL_OF_EDUCATION",
    y="TOTAL_SCORE",
    hue="GENDER",
    order=order,
    ax=ax,
)
g.set_title("Basic Statistics of Parental Level of Education - Gender", y=1.02)
plt.show()

### Parental Level of Education - Test Preparation Course

In [None]:
# Mean average score per parental education level, split by test preparation status
fig, ax = plt.subplots(figsize=(16, 9))
sns.set_palette("RdBu")
g = sns.barplot(
    data=df,
    x="PARENTAL_LEVEL_OF_EDUCATION",
    y="TOTAL_SCORE",
    hue="TEST_PREPARATION_COURSE",
    order=order,
    ax=ax,
)
g.set_title("Basic Statistics of Parental Level of Education - Test Preparation Course", y=1.02)
plt.show()

### Race/Ethnicity - Gender

In [None]:
# Mean average score per race/ethnicity group, one line per gender.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))
# Draw the groups in alphabetical order: without an explicit order they
# appear in the order they occur in the data, which makes the connecting lines misleading.
group_order = sorted(df["RACE/ETHNICITY"].unique())
# Explicit colour per gender so the mapping does not depend on row order.
gender_colors = {"Female": "pink", "Male": "cyan"}
g = sns.pointplot(x="RACE/ETHNICITY", y="TOTAL_SCORE", hue="GENDER", data=df,
                  order=group_order, palette=gender_colors, ax=ax)
g.set_title("Basic Statistics of Race/Ethnicity - Gender", y=1.02)
plt.show()

### Lunch - Race/Ethnicity

In [None]:
# Mean average score per lunch type, split by race/ethnicity group
fig, ax = plt.subplots(figsize=(14, 7))
sns.set_palette("GnBu")
g = sns.barplot(
    data=df,
    x="LUNCH",
    y="TOTAL_SCORE",
    hue="RACE/ETHNICITY",
    ax=ax,
)
g.set_title("Basic Statistics of Lunch - Race/Ethnicity", y=1.02)
plt.show()

### Race/Ethnicity

In [None]:
# Distribution of the average score for each race/ethnicity group (A to E)
sns.set_palette("GnBu")
order2 = ["Group A", "Group B", "Group C", "Group D", "Group E"]
g = sns.violinplot(
    data=df,
    x="TOTAL_SCORE",
    y="RACE/ETHNICITY",
    order=order2,
)
g.set_title("Basic Statistics of Race/Ethnicity", y=1.02)
plt.show()

### Letter Grade: AA

In [None]:
# Gender split of the students whose letter grade is "AA".
aa_students = df[df["LETTER_GRADE"] == "AA"]
# Share of each gender among the "AA" students
print(aa_students["GENDER"].value_counts(normalize=True))
# Fixed bar order and an explicit colour per gender (same mapping as the gender
# count plot), so the colours do not depend on which gender appears first.
g = sns.countplot(x="GENDER", data=aa_students, order=["Female", "Male"],
                  palette={"Female": "pink", "Male": "cyan"})
g.set_title("Counts of Letter Grade (AA) - Gender", y=1.02)
plt.show()

### Letter Grade: FF

In [None]:
# Gender split of the students whose letter grade is "FF".
ff_students = df[df["LETTER_GRADE"] == "FF"]
# Number of students of each gender among the "FF" students
print(ff_students["GENDER"].value_counts())
# Fixed bar order and an explicit colour per gender (same mapping as the gender
# count plot), instead of flipping the global palette to match the row order.
g = sns.countplot(x="GENDER", data=ff_students, order=["Female", "Male"],
                  palette={"Female": "pink", "Male": "cyan"})
g.set_title("Counts of Letter Grade (FF) - Gender", y=1.02)
plt.show()

### Scores - Gender / Scatter - KDE

In [None]:
# Pairwise scatter plots (with per-gender densities on the diagonal) of the three exam scores
pairplot_columns = ["READING_SCORE", "WRITING_SCORE", "MATH_SCORE"]
sns.pairplot(
    df,
    vars=pairplot_columns,
    hue="GENDER",
    palette="husl",
    plot_kws={"alpha": 0.5},
)
plt.show()

# Result
* Ranking of the tests by average score, with very small differences between them:  **READING_SCORE** > **WRITING_SCORE** > **MATH_SCORE**
* Students tend to perform similarly across the tests: a student who does well on one test usually does well on the others.
* Female students are more successful than male students overall; **Female** > **Male**
* There is also a ranking of success among the groups;  **E** > **D** > **C** > **B** > **A**
* Students' grades also vary according to the education level of their parents. The higher the parents' education level, the higher the student's chance of success.
* Taking the lunch type as an indicator of the student's financial situation;  **Standard** > **Free/Reduced**
* Students who attend the test preparation course are usually one step ahead of other students;  **Completed** > **None**

In [None]:
# The 20 students with the highest TOTAL_SCORE (the rounded mean of the three
# exam scores), sorted from highest to lowest
df.sort_values("TOTAL_SCORE", ascending = False).head(20)