In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# importing some libraries that will help me plot the data
import seaborn as sns
import matplotlib.pyplot as plt

# importing libraries to preprocess the data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


Reading the data:

In [None]:
# Load the student performance CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("../input/students-performance-in-exams/StudentsPerformance.csv")
# Preview the first rows; as the cell's last expression it is rendered by the notebook.
data.head()

In [None]:
# Bar chart of the number of students per gender.
sns.set_palette(['pink', 'navy'])
sns.set_context("poster", font_scale=0.8)
plt.figure(figsize=(9, 9))
# Pass the frame and column by keyword: from seaborn 0.12 on, the only
# positional argument is `data`, so `sns.countplot(data['gender'])` would be
# interpreted as a wide-form dataset instead of the x variable.
sns.countplot(data=data, x='gender')
plt.xlabel("Gender")
plt.ylabel("Count")

Here I am comparing the qualifications of the students' parents:

In [None]:
# Bar chart of the number of students per parental level of education.
sns.set_palette("GnBu_d")
sns.set_context("poster", font_scale=0.6)
plt.figure(figsize=(15, 7))
# Pass the frame and column by keyword: from seaborn 0.12 on, the only
# positional argument is `data`, so passing the Series positionally would be
# interpreted as a wide-form dataset instead of the x variable.
sns.countplot(data=data, x='parental level of education')
plt.xlabel('Parental Level of Education')
plt.ylabel('Count')

Below is a scatterplot of reading and writing scores of the students:

In [None]:
# Reading score vs. writing score, coloured by test-preparation status.
sns.set_palette(['orange', 'red'])
sns.set_context("poster", font_scale=0.8)
plt.figure(figsize=(9, 9))
sns.scatterplot(
    x='reading score',
    y='writing score',
    hue='test preparation course',
    data=data,
)

Here I see a linear relationship between the reading and writing scores, which is expected: a student who reads well usually also writes well, and vice versa. I also notice that the lowest scorers did not complete the test preparation course, although some students who did not complete it still scored high.

In [None]:
# Math score vs. writing score, coloured by gender.
sns.set_palette(['pink', 'navy'])
sns.set_context("poster", font_scale=0.8)
plt.figure(figsize=(9, 9))
sns.scatterplot(
    x='math score',
    y='writing score',
    hue='gender',
    data=data,
)

What I can interpret from this scatterplot is that female students tend to score higher in writing than in math compared to male students: the points representing females lie further toward the writing-score axis than the math-score axis.

Finding the categorical columns now so that they can be converted to numerical categories in the next step using the LabelEncoder:

In [None]:
# Collect the names of all columns stored with the generic `object` dtype;
# these are the ones that get label-encoded later.
is_object = data.dtypes == 'object'
cat_col = is_object[is_object].index.tolist()
print(cat_col)

We encode the categorical columns for the purpose of finding out their correlation and interpreting them in the form of a heatmap for better understanding.

In [None]:
# Replace every categorical column of `data` with integer codes (in place),
# reusing a single encoder that is re-fitted for each column.
enc = LabelEncoder()

for name in cat_col:
    codes = enc.fit_transform(data[name])
    data[name] = codes

data.head()

That is a pretty bad accuracy. Let's find the correlation between the features:


In [None]:
# Heatmap of the pairwise correlations between all (encoded) columns.
corr_matrix = data.corr()
plt.figure(figsize=(10, 10))
sns.set_context("poster", font_scale=0.5)
sns.heatmap(corr_matrix, cmap="Blues")

Looking at this heatmap, it is obvious that there is a high correlation between the math, reading and writing scores.
There is also a slight correlation between the math scores and the ethnicity of the student, and a small correlation between the type of lunch the students get and their math scores.

Let's plot a bar chart of the math scores by ethnicity and gender of the students:

In [None]:
# Reload the raw CSV: the LabelEncoder cell above overwrote the categorical
# columns of `data` with integer codes, and the next plot uses the original labels.
data = pd.read_csv("../input/students-performance-in-exams/StudentsPerformance.csv")

In [None]:
# Bar plot of math score per race/ethnicity group, split by gender.
sns.catplot(
    kind="bar",
    x='race/ethnicity',
    y='math score',
    hue='gender',
    data=data,
)

There is a small difference in math scores between females and males, as well as between the ethnic groups.

Let's predict the gender of the student using the data now:

In [None]:
# Build the feature matrix and target for the gender-prediction task.
#
# The encoding is applied to a copy so that `data` keeps its readable string
# labels; re-running earlier plotting cells after this one still works.
enc = LabelEncoder()
encdata = data.copy()
for col in cat_col:
    # The encoder is re-fitted for every column, so each column gets its own codes.
    encdata[col] = enc.fit_transform(encdata[col])

# Target: the encoded gender column. Features: every remaining column.
y = encdata['gender']
encdata = encdata.drop(columns=['gender'])

# 70/30 train/test split. The fixed random_state makes the split, and therefore
# the accuracy reported below, reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = tts(
    encdata, y, train_size=0.7, test_size=0.3, random_state=42
)

In [None]:
# Train a support-vector classifier to predict the encoded gender label.
#
# SVMs are not scale invariant, and the features mix exam scores with small
# integer category codes, so standardise them first. The scaler is fitted on
# the training split only, so no information from the test split leaks into it.
scaler = StandardScaler()
xtrain_scaled = scaler.fit_transform(xtrain)
xtest_scaled = scaler.transform(xtest)

model = SVC()
model.fit(xtrain_scaled, ytrain)

# `model` was trained on standardised features, so predictions need them too.
preds = model.predict(xtest_scaled)

# accuracy_score expects (y_true, y_pred) in that order.
print("The Accuracy of this model is:", accuracy_score(ytest, preds)*100,"%")

I think that an accuracy of about 88% is not bad, given that one cannot easily judge from this data whether a student is male or female; the model does better than human intuition, which I consider decent.
I hope you liked this notebook! Upvote it if it helped you :,)