In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()
# sns.set_style(style="darkgrid")
# sns.despine()

import warnings
warnings.filterwarnings(action="ignore")

# The dataset files are expected under the relative "../input/" directory.
# Printing the directory listing up front shows which file names are
# available to the read_csv call further down.

import os
print(os.listdir("../input"))

In [None]:
# Load the student performance table; the later cells all read from `data`.
data = pd.read_csv("../input/StudentsPerformance.csv")

In [None]:
# Peek at the first rows to see the raw column names and value formats.
data.head()

We change the column names a bit for our ease of use. You can give any relevant name you want.

In [None]:
# Replace the original headers with short snake_case names.  The assignment is
# positional, so the list must follow the column order of the loaded file.
short_column_names = [
    "gender",
    "race_ethnicity",
    "parent_education",
    "lunch",
    "test_preparation",
    "maths_score",
    "reading_score",
    "writing_score",
]
data.columns = short_column_names
data.columns

In [None]:
# Column dtypes and non-null counts in one overview.
data.info()

Most of our columns are categorical (object type) except for scores which are integer values.

In [None]:
# Summary statistics; with the default arguments only numeric columns are summarised.
data.describe()

By default, the `describe()` method only summarises the numerical columns: count, mean, standard deviation, min and max values, and the quartiles. The median is there too - it is the `50%` row.

In [None]:
# Number of missing (NaN/None) entries in every column.
data.isna().sum()

We don't have any missing, null or NaN values in our data, which is good.

Let's see how our categorical variables are distributed.

In [None]:
# Print the category frequencies of every categorical (non-numeric) column.
# The columns are picked by dtype rather than by stopping the loop at
# "maths_score", so the cell no longer depends on the column order.
categorical_columns = data.select_dtypes(exclude="number").columns
for column in categorical_columns:
    print(column.upper())
    print(data[column].value_counts())
    print("\n\n\n")

We can see the categories of each column and their counts. Gender has male and female. There are five categories in race_ethnicity, of which group A has the lowest count. Very few parents have a master's degree; the rest you can read off the output above.

In [None]:
# Student counts per ethnic group, split by gender; the groups are pinned to A..E order.
group_order = ["group A", "group B", "group C", "group D", "group E"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=data, x="race_ethnicity", hue="gender", order=group_order, ax=ax)

Overall, there are more females than males, as we saw before. Groups B and C follow the same trend, whereas the other groups have comparatively fewer females.

In [None]:
# Student counts per ethnic group, split by the parents' education level.
group_order = ["group A", "group B", "group C", "group D", "group E"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=data, x="race_ethnicity", hue="parent_education", order=group_order, ax=ax)

This gives us the count of parents holding each degree in the different groups. We can actually scale this data according to group size: next we will plot the percentage of each degree within a particular race.

In [None]:
# Share (in %) of every parental education level *within* each ethnic group.
# pd.crosstab(normalize="index") divides each group's counts by that group's
# total, which replaces the hand-written nested counting loop (one boolean-mask
# scan per pair).  Pairs that never occur stay in the table as 0.
# Result: long-format frame with columns race_ethnicity, parent_education, per,
# ordered by group and then by education level (both sorted alphabetically).
df = (
    pd.crosstab(data["race_ethnicity"], data["parent_education"], normalize="index")
    .mul(100)
    .round(2)
    .stack()          # wide table -> one row per (group, education) pair
    .rename("per")
    .reset_index()
)
df.head(10)

In [None]:
# Bar heights are the "per" percentages from `df`, grouped by ethnic group
# with one bar per parental education level.
group_order = ["group A", "group B", "group C", "group D", "group E"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=df, x="race_ethnicity", y="per", hue="parent_education", order=group_order, ax=ax)

Let's plot something interesting. For a particular degree, we will plot how its percentage is distributed among the different groups. I will explain it more once we create the graph.

In [None]:
# Share (in %) of every ethnic group *within* each parental education level,
# i.e. the percentages of one education level add up to ~100 across groups.
# pd.crosstab(normalize="index") replaces the hand-written nested counting
# loop; pairs that never occur stay in the table as 0.
# Result: long-format frame ordered by education level and then by group,
# with the columns in the order race_ethnicity, parent_education, per.
df = (
    pd.crosstab(data["parent_education"], data["race_ethnicity"], normalize="index")
    .mul(100)
    .round(2)
    .stack()          # wide table -> one row per (education, group) pair
    .rename("per")
    .reset_index()
    .reindex(columns=["race_ethnicity", "parent_education", "per"])
)
df.head()

In [None]:
# Bar heights are the "per" percentages from `df`, drawn per ethnic group
# with one bar for each parental education level.
group_order = ["group A", "group B", "group C", "group D", "group E"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(data=df, x="race_ethnicity", y="per", hue="parent_education", order=group_order, ax=ax)

Wow!! We see that most master's degree parents are from C and D. Group C actually has a sort of uniform contribution in every degree.

Let's draw a pointplot of the same data.

In [None]:
# Same percentages as a point plot: one connected line per education level.
group_order = ["group A", "group B", "group C", "group D", "group E"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.pointplot(data=df, x="race_ethnicity", y="per", hue="parent_education", order=group_order, ax=ax)

In [None]:
# Counts per parental education level, split by ethnic group.
fig, ax = plt.subplots(figsize=(20, 10))
g = sns.countplot(data=data, x="parent_education", hue="race_ethnicity", ax=ax)
# Re-apply the existing tick labels, rotated by 45 degrees.
tick_labels = g.get_xticklabels()
g.set_xticklabels(labels=tick_labels, rotation=45)

This shows, for a particular degree, which group has the highest count. Clearly, groups C and D compete for the top spot.

In [None]:
def get_format(x, y, data):
    """Return the percentage split of column `y` within each category of `x`.

    NOTE(review): `get_format` was called here without being defined anywhere
    in the notebook, so the cell failed with NameError on a fresh kernel.
    This definition is a reconstruction modelled on the percentage tables
    built earlier in the notebook - confirm it matches the intended output.

    Parameters
    ----------
    x : str
        Name of the column whose categories form the groups (each group's
        percentages sum to ~100).
    y : str
        Name of the column whose categories are distributed within a group.
    data : pandas.DataFrame
        Frame containing both columns.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``x``, ``y`` and ``"per"`` (percentage
        rounded to 2 decimals), one row per (x, y) category pair.
    """
    return (
        pd.crosstab(data[x], data[y], normalize="index")
        .mul(100)
        .round(2)
        .stack()          # wide table -> one row per (x, y) pair
        .rename("per")
        .reset_index()
    )


new_df = get_format(x="parent_education", y="race_ethnicity", data=data)
new_df.head()

In [None]:
# How many students fall into each lunch category.
fig, ax = plt.subplots()
sns.countplot(data=data, x="lunch", ax=ax)

The split is roughly 2/3 to 1/3.

In [None]:
# Lunch category counts, broken down by gender.
fig, ax = plt.subplots()
sns.countplot(data=data, x="lunch", hue="gender", ax=ax)

In [None]:
# Lunch category counts, broken down by ethnic group.
fig, ax = plt.subplots()
sns.countplot(data=data, x="lunch", hue="race_ethnicity", ax=ax)

In [None]:
# Lunch category counts, broken down by the parents' education level.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=data, x="lunch", hue="parent_education", ax=ax)

In [None]:
# Test-preparation status counts, broken down by ethnic group.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=data, x="test_preparation", hue="race_ethnicity", ax=ax)

We can see for each ethnic group, greater number of students have no preparation than complete preparation.

In [None]:
# Test-preparation status counts, broken down by the parents' education level.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=data, x="test_preparation", hue="parent_education", ax=ax)

Even within each parental education category, most students have no test preparation. So test preparation doesn't seem to depend on the parents' education.

In [None]:
# Maths score distribution per ethnic group, with separate boxes per gender.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="race_ethnicity", y="maths_score", hue="gender", palette="Set1", ax=ax)

In each group, the median maths score is higher for males than for females, although there is some variance in the data. So can we say boys are better at maths than girls?

In [None]:
# Same comparison as a boxen (letter-value) plot.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxenplot(data=data, x="race_ethnicity", y="maths_score", hue="gender", palette="Set1", ax=ax)

In [None]:
# Maths scores per ethnic group and gender, drawn as violins.
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="race_ethnicity", y="maths_score", hue="gender", palette="Set1", ax=ax)

In [None]:
# Reading scores per ethnic group and gender, drawn as violins.
fig, ax = plt.subplots(figsize=(15, 6))
sns.violinplot(data=data, x="race_ethnicity", y="reading_score", hue="gender", palette="Set1", ax=ax)

For reading score, girls are better than boys. Woah! The competition really got tough now.

In [None]:
# Writing score per ethnic group and gender (barplot's default estimator).
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=data, x="race_ethnicity", y="writing_score", hue="gender", palette="Set1", ax=ax)

Finally, girls are the winner I guess. :)

In [None]:
# Maths score distribution for every parental education level.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxenplot(data=data, x="parent_education", y="maths_score", palette="Set1", ax=ax)

Notice that children of parents with a master's degree have higher marks in maths. Does this trend hold for the other scores as well? Let's see.

In [None]:
# One box plot per subject, each grouped by parental education.  A single loop
# replaces three copy-pasted plotting stanzas; the 2x2 grid (fourth slot left
# empty) and the per-panel settings are unchanged.
plt.figure(figsize=(20, 20))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x="parent_education", y=score, data=data, palette="Set1")
    plt.xticks(rotation=30)  # tilt the category labels on the x axis

Amidst some variance in the values, we can say master's degree parent's children do better in maths than other degrees.

In [None]:
# Median score per parental education level, one panel per subject.  A single
# loop replaces three copy-pasted plotting stanzas; the 2x2 grid (fourth slot
# left empty) and the per-panel settings are unchanged.
plt.figure(figsize=(20, 20))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(2, 2, position)
    # estimator=np.median makes the bar height the median instead of the mean.
    sns.barplot(x="parent_education", y=score, data=data, palette="Set1", estimator=np.median)
    plt.xticks(rotation=30)  # tilt the category labels on the x axis

In [None]:
# Histogram of each subject's scores using a shared bin count.  A single loop
# replaces three copy-pasted plotting stanzas; the 2x2 grid (fourth slot left
# empty) is unchanged.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot; it is kept here so the cell does not require a newer
# seaborn than the rest of the notebook.
binsize = 15
plt.figure(figsize=(20, 20))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(2, 2, position)
    sns.distplot(a=data[score], bins=binsize, hist=True)
    plt.xticks(rotation=30)

We can see in above plot, how the scores are varied.

Let's see how that distribution looks for the different genders.

In [None]:
# NOTE(review): disabled experiment, kept for reference only.  It tries to put
# the per-gender KDE FacetGrids into plt.subplot slots - presumably abandoned
# because FacetGrid creates its own figure (confirm).  The stand-alone
# FacetGrid cells that follow draw the same three plots one at a time.
# binsize=15
# plt.figure(figsize=(20,20))
# plt.subplot(2, 2, 1)
# sns.FacetGrid(data, hue="gender", size=5).map(sns.kdeplot, "maths_score").add_legend()
# plt.xticks(rotation=30)
# plt.subplot(2, 2, 2)
# sns.FacetGrid(data, hue="gender", size=5).map(sns.kdeplot, "reading_score").add_legend()
# plt.xticks(rotation=30)
# plt.subplot(2, 2, 3)
# sns.FacetGrid(data, hue="gender", size=5).map(sns.kdeplot, "writing_score").add_legend()

We can plot the one dimensional KDE plot for other scores in different genders.

In [None]:
# KDE of the maths scores with one curve per gender.  `height` is the current
# name of the facet-size argument: the old `size` spelling was renamed in
# seaborn 0.9 and is not accepted by newer releases.
sns.FacetGrid(data, hue="gender", height=5).map(sns.kdeplot, "maths_score").add_legend()

In [None]:
# KDE of the reading scores with one curve per gender.  `height` is the current
# name of the facet-size argument: the old `size` spelling was renamed in
# seaborn 0.9 and is not accepted by newer releases.
sns.FacetGrid(data, hue="gender", height=5).map(sns.kdeplot, "reading_score").add_legend()

In [None]:
# KDE of the writing scores with one curve per gender.  `height` is the current
# name of the facet-size argument: the old `size` spelling was renamed in
# seaborn 0.9 and is not accepted by newer releases.
sns.FacetGrid(data, hue="gender", height=5).map(sns.kdeplot, "writing_score").add_legend()

We can see that for reading and writing, girls have more marks than boys and boys have more marks in maths.


There are few more plots, which we can plot to confirm the trend we have.

In [None]:
# Box plot of each subject's scores split by gender.  A single loop replaces
# three copy-pasted plotting stanzas; the 2x2 grid (fourth slot left empty)
# and the per-panel settings are unchanged.
plt.figure(figsize=(20, 20))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x="gender", y=score, data=data, palette="Set1")
    plt.xticks(rotation=30)

In [None]:
# Swarm plot (one point per student) of each subject's scores split by gender.
# A single loop replaces three copy-pasted plotting stanzas; the 2x2 grid
# (fourth slot left empty) and the per-panel settings are unchanged.
plt.figure(figsize=(20, 20))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(2, 2, position)
    sns.swarmplot(x="gender", y=score, data=data, palette="Set1")
    plt.xticks(rotation=30)

In [None]:
# Overall distribution of each subject's scores (no grouping variable), drawn
# side by side in a 1x3 grid.  A single loop replaces three copy-pasted
# plotting stanzas; the per-panel settings are unchanged.
plt.figure(figsize=(20, 10))
for position, score in enumerate(["maths_score", "reading_score", "writing_score"], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(y=score, data=data, palette="Set1")
    plt.xticks(rotation=30)

We can see that in terms of scores, girls won it by 2-1.

There are few things I have left in this kernel. For example, we could normalize the population counts and rather than showing absolute counts in different barplot or countplot,  we can show relevant percentages, which is better I guess. Also, I did not create any stacked graph which I guess would be good here. 

If you plot any of these graphs or any new variation you bring, leave a comment. I would be happy to see it.

Do let me know what you think about this kernel of mine. Any suggestion or feedback is highly welcomed.

If you want to see some more seaborn plots I have plotted them [here](https://www.kaggle.com/gadaadhaarigeek/another-eda-on-iris-dataset). You can check out seaborn's official docs, which has many awesome plots.


Happy Visualizing :)