In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#  location of the raw CSV, relative to the notebook
DATA_PATH = '../input/StudentsPerformance.csv'

#  load the dataset and preview its first rows
students_df = pd.read_csv(DATA_PATH)
students_df.head()

# Basic Metadata checks

#### Now let's check the dtypes of the columns.

In [None]:
#  show the dtype pandas assigned to each column when reading the CSV
students_df.dtypes

Some columns (e.g. gender, race/ethnicity) have dtype=object even though they hold categories. Let's change their dtype to category, and rename the categorical columns as well so that they are easy to tell apart later on.

In [None]:
#  columns that hold a small set of labels rather than free text
categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

#  old name -> new name; the new name carries a '(categorical)' suffix
col_name_pairs = {column: column + '(categorical)' for column in categorical_columns}
new_column_names = list(col_name_pairs.values())

#  cast each of those columns to the category dtype ...
for column in categorical_columns:
    students_df[column] = students_df[column].astype('category')

#  ... and relabel them on the same DataFrame object, leaving the other columns untouched
students_df.columns = [col_name_pairs.get(name, name) for name in students_df.columns]

students_df.dtypes

# Data Wrangling/Cleaning:

#### Now let's check the unique values of each column, and the null counts as well.

In [None]:
#  print every column name followed by the distinct values found in that column
for column, values in students_df.items():
    line_template = column + ':    {}\n'
    print(line_template.format(values.unique()))

In [None]:
#  number of null entries in each column
students_df.isnull().sum()

#### So far, so good. No data imputation/wrangling/cleaning needed.

#### Before we go on and plot some visuals, let's define some helper functions that'll come in handy.

In [None]:
#  draws the frequency of each bar (and its share of the total, in percent) inside the bars of the bar_plot passed to it
def draw_freq_on_bars(bar_plot, category_freqs):
    """Annotate every bar of `bar_plot` with its count and its percentage of the total.

    Parameters
    ----------
    bar_plot : Axes-like object
        Must provide `text(...)` and `get_ylim()`.
    category_freqs : pandas.Series (or any iterable with a `.sum()` method)
        Bar heights, in the same order as the bars are drawn: the i-th value is
        written at x position i.
    """
    total = category_freqs.sum()
    #  the label is written below the top of each bar, lowered by 16% of the y-axis upper limit
    label_drop = 0.16 * bar_plot.get_ylim()[1]

    for barIndex, freq in enumerate(category_freqs):
        #  int(...) keeps the label as e.g. '52%' even when round() hands back a float;
        #  the guard avoids dividing by zero when every frequency is 0
        percentage = int(round((freq / total) * 100)) if total else 0
        #  the keyword is spelled in lowercase: Matplotlib no longer accepts mixed-case property names
        bar_plot.text(x=barIndex, y=freq - label_drop, s=str(freq) + '\n(' + str(percentage) + '%)', color='white', horizontalalignment='center', fontsize=15)
        
        
#  formats xtickLabels of the plot passed to it so that they don't overlap with each other when shown.
def format_xtickLabels(plot, df, x, show_count=False):
    """Wrap the x tick labels of `plot` onto several lines, optionally appending group sizes.

    Every space in a tick label is replaced by a newline. With `show_count=True`
    a '(n=<count>)' line is appended, where <count> is the number of rows of
    `df` whose value in column `x` equals that tick label.

    Parameters
    ----------
    plot : Axes-like object providing `get_xticklabels()` and `set_xticklabels()`.
    df : pandas.DataFrame that the plot was drawn from (only read when `show_count` is True).
    x : name of the column of `df` shown on the x-axis.
    show_count : whether to append the per-category row count to each label.
    """
    xtickLabels = plot.get_xticklabels()

    counts_by_label = {}
    if show_count:
        #  value_counts() is sorted by frequency while the tick labels follow the axis order,
        #  so counts are looked up by label text instead of being paired up positionally
        counts_by_label = {str(category): int(count) for category, count in df[x].value_counts().items()}

    for xtickLabel in xtickLabels:
        original_text = xtickLabel.get_text()
        new_text = original_text.replace(' ', '\n')
        if show_count:
            #  a label with no matching rows in df[x] is reported as n=0
            new_text += '\n(n=' + str(counts_by_label.get(original_text, 0)) + ')'
        xtickLabel.set_text(new_text)

    plot.set_xticklabels(xtickLabels)

# EDA

In [None]:
#  switch seaborn's color palette to 'tab10' for the plots that follow
sns.set(palette='tab10')

#### Now, let's have a look at the number of students by gender, race/ethnicity, parental level of education, lunch, and test preparation course.

In [None]:
#  bar chart of the number of students per gender, annotated with counts and percentages
plt.figure()
category_freqs = students_df['gender(categorical)'].value_counts()
#  np.asarray() replaces Index.get_values(), which current pandas releases no longer provide;
#  a plain array of labels keeps the bars in value_counts() order, which is the order
#  draw_freq_on_bars() assumes when it writes the i-th frequency on the i-th bar
bar_plot = sns.barplot(x=np.asarray(category_freqs.index), y=category_freqs)

bar_plot.set_ylabel('count (by gender)')

draw_freq_on_bars(bar_plot, category_freqs)

In [None]:
#  bar chart of the number of students per race/ethnicity group, annotated with counts and percentages
plt.figure(figsize=(7,4))
category_freqs = students_df['race/ethnicity(categorical)'].value_counts()
#  np.asarray() replaces Index.get_values(), which current pandas releases no longer provide;
#  a plain array of labels keeps the bars in value_counts() order, which is the order
#  draw_freq_on_bars() assumes when it writes the i-th frequency on the i-th bar
bar_plot = sns.barplot(x=np.asarray(category_freqs.index), y=category_freqs)

bar_plot.set_ylabel('count (by race/ethnicity)')

draw_freq_on_bars(bar_plot, category_freqs)

In [None]:
#  bar chart of the number of students per parental level of education, annotated with counts and percentages
plt.figure(figsize=(9,4))
category_freqs = students_df['parental level of education(categorical)'].value_counts()
#  np.asarray() replaces Index.get_values(), which current pandas releases no longer provide;
#  a plain array of labels keeps the bars in value_counts() order, which is the order
#  draw_freq_on_bars() assumes when it writes the i-th frequency on the i-th bar
bar_plot = sns.barplot(x=np.asarray(category_freqs.index), y=category_freqs)

bar_plot.set_ylabel('count (by parental level of education)')

#  the labels are long here, so break them onto several lines
format_xtickLabels(bar_plot, students_df, 'parental level of education(categorical)')

draw_freq_on_bars(bar_plot, category_freqs)

In [None]:
#  bar chart of the number of students per lunch type, annotated with counts and percentages
plt.figure()
category_freqs = students_df['lunch(categorical)'].value_counts()
#  np.asarray() replaces Index.get_values(), which current pandas releases no longer provide;
#  a plain array of labels keeps the bars in value_counts() order, which is the order
#  draw_freq_on_bars() assumes when it writes the i-th frequency on the i-th bar
bar_plot = sns.barplot(x=np.asarray(category_freqs.index), y=category_freqs)

bar_plot.set_ylabel('count (by lunch)')

draw_freq_on_bars(bar_plot, category_freqs)

In [None]:
#  bar chart of the number of students per test-preparation status, annotated with counts and percentages
plt.figure()
category_freqs = students_df['test preparation course(categorical)'].value_counts()
#  np.asarray() replaces Index.get_values(), which current pandas releases no longer provide;
#  a plain array of labels keeps the bars in value_counts() order, which is the order
#  draw_freq_on_bars() assumes when it writes the i-th frequency on the i-th bar
bar_plot = sns.barplot(x=np.asarray(category_freqs.index), y=category_freqs)

bar_plot.set_ylabel('count (by test preparation course)')

draw_freq_on_bars(bar_plot, category_freqs)

#### Now, let's have a look at the overall distribution of the scores.

In [None]:
#  pairwise plots of the score columns; diag_kws is forwarded to the plots on the diagonal
#  (20 bins, white edge color)
sns.pairplot(students_df, diag_kws={'bins':20, 'ec':'white'})

Points to be noted:
<ul>
    <li>The distributions of the scores, shown on the diagonal, make it easy to see that most students scored above 50 (most of each histogram lies above 50), though that majority is somewhat smaller for the math score.
    </li>
    <li>The scatter plots show that the scores have a strong linear relationship with each other; and have a strong positive correlation as well, especially reading and writing scores.
    </li>
</ul>

In [None]:
#  writes, next to each violin of the passed violin_plot, the median of column y
#  for the matching group/category of column x in DataFrame df
def draw_median_on_violinplot(violin_plot, df, x, y):
    """Label every violin with the median of `y` within its `x` group.

    The medians come from `df.groupby([x])[y].median()`; the k-th median is
    written at x position k + 0.06 and y position (median - 2).
    """
    per_group_median = df.groupby([x])[y].median()
    positions = range(len(per_group_median))
    for position, value in zip(positions, per_group_median.values):
        violin_plot.text(position + 0.06, value - 2, str(value), color='black', fontsize='small')

In [None]:
#  names of the three score columns; the violin-plot cells below loop over this list
#  to draw one subplot per score
score_columns = ['math score', 'reading score', 'writing score']

#### Now, let's have a closer look at the scores, broken down by each categorical column.

## By gender(categorical):

In [None]:
#  pairwise score plots colored by gender, with KDE curves on the diagonal
sns.pairplot(students_df, hue='gender(categorical)', diag_kind='kde')

The scatter plots show that the **scores of both male and female students are overlapping a good deal.**

In [None]:
#  one violin plot per score column, side by side, grouped by gender
group_column = 'gender(categorical)'
count_of_subplots = len(score_columns)

fig, axes = plt.subplots(1, count_of_subplots, figsize=(12, 4), squeeze=False)
fig.subplots_adjust(wspace=1, bottom=0.2)

for ax, column in zip(axes[0], score_columns):
    violin_plot = sns.violinplot(data=students_df, x=group_column, y=column, ax=ax)

    #  wrap the tick labels and append the group sizes
    format_xtickLabels(violin_plot, students_df, group_column, True)

    #  write each group's median next to its violin
    draw_median_on_violinplot(violin_plot=violin_plot, df=students_df, x=group_column, y=column)

Female students seem to be in the lead, as the bulk of their violin plots lies higher on the y-axis than that of male students; the exception is math, where male students are in the lead even though they are outnumbered by female students.

## By race/ethnicity(categorical):

In [None]:
#  pairwise score plots colored by race/ethnicity group, with KDE curves on the diagonal
sns.pairplot(students_df, hue='race/ethnicity(categorical)', diag_kind='kde')

Yet again, we see quite an overlap between different categories/classes in the scatter plots.

In [None]:
#  one violin plot per score column, side by side, grouped by race/ethnicity
group_column = 'race/ethnicity(categorical)'
count_of_subplots = len(score_columns)

fig, axes = plt.subplots(1, count_of_subplots, figsize=(20, 5), squeeze=False)
fig.subplots_adjust(wspace=0.3, bottom=0.2)

for ax, column in zip(axes[0], score_columns):
    violin_plot = sns.violinplot(data=students_df, x=group_column, y=column, ax=ax)

    #  wrap the tick labels and append the group sizes
    format_xtickLabels(violin_plot, students_df, group_column, True)

    #  write each group's median next to its violin
    draw_median_on_violinplot(violin_plot=violin_plot, df=students_df, x=group_column, y=column)

Scores of all categories/classes are distributed fairly evenly; group E is somewhat in the lead in math and reading scores, while group D is in the lead in writing score.

## By parental level of education(categorical):

In [None]:
#  pairwise score plots colored by parental level of education, with KDE curves on the diagonal
sns.pairplot(students_df, hue='parental level of education(categorical)', diag_kind='kde')

Different categories in view, but same situation as before. Students with different parental level of education have quite an overlap in their scores as seen in the scatter plots.

In [None]:
#  one violin plot per score column, side by side, grouped by parental level of education
group_column = 'parental level of education(categorical)'
count_of_subplots = len(score_columns)

fig, axes = plt.subplots(1, count_of_subplots, figsize=(25, 5), squeeze=False)
fig.subplots_adjust(wspace=0.3, bottom=0.2)

for ax, column in zip(axes[0], score_columns):
    violin_plot = sns.violinplot(data=students_df, x=group_column, y=column, ax=ax)

    #  wrap the tick labels and append the group sizes
    format_xtickLabels(violin_plot, students_df, group_column, True)

    #  write each group's median next to its violin
    draw_median_on_violinplot(violin_plot=violin_plot, df=students_df, x=group_column, y=column)

Overall, students with parents having a master's degree seem to be performing better than other students. But considering the shapes of the violins of all categories, I guess we can say that all students-groups are fairly close to each other in performance.

## By lunch(categorical):

In [None]:
#  pairwise score plots colored by lunch type, with KDE curves on the diagonal
sns.pairplot(students_df, hue='lunch(categorical)', diag_kind='kde')

Again, same kind of overlap in the scatter plots of the categories/classes.

In [None]:
#  one violin plot per score column, side by side, grouped by lunch type
group_column = 'lunch(categorical)'
count_of_subplots = len(score_columns)

fig, axes = plt.subplots(1, count_of_subplots, figsize=(12, 5), squeeze=False)
fig.subplots_adjust(wspace=1, bottom=0.2)

for ax, column in zip(axes[0], score_columns):
    violin_plot = sns.violinplot(data=students_df, x=group_column, y=column, ax=ax)

    #  wrap the tick labels and append the group sizes
    format_xtickLabels(violin_plot, students_df, group_column, True)

    #  write each group's median next to its violin
    draw_median_on_violinplot(violin_plot=violin_plot, df=students_df, x=group_column, y=column)

Still, it seems that students with standard lunch are performing better than those with free/reduced lunch, as the bulk of their scores lies around a higher part of the y-axis.

## By test preparation course(categorical):

In [None]:
#  pairwise score plots colored by test-preparation status, with KDE curves on the diagonal
sns.pairplot(students_df, hue='test preparation course(categorical)', diag_kind='kde')

Again, same kind of overlap visible in the scatter plots.

In [None]:
#  one violin plot per score column, side by side, grouped by test-preparation status
group_column = 'test preparation course(categorical)'
count_of_subplots = len(score_columns)

fig, axes = plt.subplots(1, count_of_subplots, figsize=(12, 5), squeeze=False)
fig.subplots_adjust(wspace=1, bottom=0.2)

for ax, column in zip(axes[0], score_columns):
    violin_plot = sns.violinplot(data=students_df, x=group_column, y=column, ax=ax)

    #  wrap the tick labels and append the group sizes
    format_xtickLabels(violin_plot, students_df, group_column, True)

    #  write each group's median next to its violin
    draw_median_on_violinplot(violin_plot=violin_plot, df=students_df, x=group_column, y=column)

Still, it seems that students who completed the test preparation course have performed better, as the bulk of their violins lies around a higher part of the y-axis compared to students who didn't take the course.

#### Now, let's add numeric versions of the categorical columns, so that we can use them to find the correlation of the categorical columns with the other (numeric) columns.

In [None]:
#  Adds a '<name>(numeric)' integer column next to every '<name>(categorical)' column.
#
#  Each mapping is written out as an explicit {category: numeric label} dict. Assigning a
#  positional list to `.cat.categories` is no longer supported by current pandas releases,
#  and it also depended on the (alphabetical) order in which pandas stores the categories.
numeric_labels = {
    #  (0, 'some high school'), (1, 'high school'), (2, 'some college'), (3, "associate's degree"),
    #  (4, "bachelor's degree"), (5, "master's degree")
    'parental level of education': {
        'some high school': 0,
        'high school': 1,
        'some college': 2,
        "associate's degree": 3,
        "bachelor's degree": 4,
        "master's degree": 5,
    },
    #  (0, 'female'), (1, 'male')
    'gender': {'female': 0, 'male': 1},
    #  (0, 'group A'), (1, 'group B'), (2, 'group C'), (3, 'group D'), (4, 'group E')
    'race/ethnicity': {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4},
    #  (0, 'free/reduced'), (1, 'standard')
    'lunch': {'free/reduced': 0, 'standard': 1},
    #  (0, 'none'), (1, 'completed')
    'test preparation course': {'none': 0, 'completed': 1},
}

for column, labels in numeric_labels.items():
    #  a category missing from `labels` maps to NaN, which makes astype('int') raise
    #  instead of silently producing a wrong label
    students_df[column + '(numeric)'] = students_df[column + '(categorical)'].map(labels).astype('int')

students_df.head()

In [None]:
#  heatmap of the pairwise correlation coefficients of the numeric columns
plt.figure()
#  restrict to numeric columns explicitly: the '(categorical)' columns hold text labels,
#  which current pandas releases refuse to correlate instead of silently skipping them
correlation_coeffs = students_df.select_dtypes(include='number').corr()
#  mask gets rid of the same coefficients repeated in the triangle above the diagonal
#  (True = hidden; k=1 keeps the diagonal itself visible)
mask = np.triu(np.ones(correlation_coeffs.shape, dtype=bool), k=1)

sns.heatmap(correlation_coeffs, mask=mask, annot=True, vmin=-1, vmax=1, cmap='viridis', annot_kws={'size':9})

Points to be noted:
<ul>
    <li>Math, reading, and writing scores have quite high correlation coefficients, especially for reading and writing scores i.e. 0.95
    </li>
    <li>
        The lowest correlation coefficients are between gender and reading score, and between gender and writing score, i.e. -0.24 and -0.3 respectively. Recall that we set 'female' to 0 and 'male' to 1. So, negative correlations indicate that female students are better at reading and writing.
    </li>
</ul>