# This notebook is about the EDA of the "Student performance in Exam" dataset
## Here I have used Pandas, Seaborn and Matplotlib, plus the LabelEncoder and OneHotEncoder classes from scikit-learn
## I have performed various visualizations using the seaborn library to understand the data and how different variables affect the result and have written down the inferences.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the Kaggle "Students Performance in Exams" CSV into a DataFrame,
# then preview its first five rows.
DATA_PATH = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
student_data = pd.read_csv(DATA_PATH)
student_data.head()

In [None]:
# Summary statistics for the numeric (score) columns; the min/max values
# quoted in the notes below are read from this table.
student_data.describe()

## The minimum and maximum marks for each subject are:
## Maths - 0, 100
## Reading - 17,100
## Writing - 10,100

In [None]:
# Count the missing values in every column (isna is the same check as isnull).
student_data.isna().sum()

In [None]:
# Distinct values present in the "gender" column.
student_data["gender"].unique()

In [None]:
# Distinct values present in the "race/ethnicity" column.
student_data["race/ethnicity"].unique()

In [None]:
# Distinct values present in the "parental level of education" column.
student_data["parental level of education"].unique()

In [None]:
# Distinct values present in the "lunch" column.
student_data["lunch"].unique()

In [None]:
# Distinct values present in the "test preparation course" column.
student_data["test preparation course"].unique()

## This shows the different categories of data present under each attribute.
#### For gender, 2. Female, Male.
#### For Race/Ethnicity. 5. Group A,B,C,D & E.
#### For Parental level of education. 6. Bachelor's degree, Some college, Master's degree, Associate's degree, High school, Some high school.
#### For Lunch. 2. Standard, Free/Reduced.
#### For Test preparation course. 2. None, Completed.

## Now we will visualize the data to make some inferences about it and understand it more thoroughly

In [None]:
# Stacked score histograms, coloured by gender, with one panel per
# test-preparation status. The same layout is drawn for each subject.
hist_options = dict(
    hue="gender",
    col="test preparation course",
    multiple="stack",
)
sns.displot(data=student_data, x="math score", **hist_options)
sns.displot(data=student_data, x="reading score", **hist_options)
sns.displot(data=student_data, x="writing score", **hist_options)

### It shows that most of the students did not take any Test Preparation Course for the exam.
### It also shows that, apart from math, very few of the students who took the preparation course scored less than 35.
### It also shows that the female students have performed better than male students in all subjects, irrespective of whether they took a course or not.
### It also shows that the majority of the students scoring less than 20 who have not taken any course are girls.

In [None]:
# Math score of every student, one strip of points per race/ethnicity group.
sns.catplot(
    data=student_data,
    x="race/ethnicity",
    y="math score",
)

### This shows that students from Group A, D and E scored more marks on average than Group B and C

In [None]:
# Scores per lunch category for each subject, always listing "standard"
# before "free/reduced" so the three plots are directly comparable.
lunch_order = ["standard", "free/reduced"]
sns.catplot(data=student_data, x="lunch", y="math score", order=lunch_order)
sns.catplot(data=student_data, x="lunch", y="reading score", order=lunch_order)
sns.catplot(data=student_data, x="lunch", y="writing score", order=lunch_order)

### The students with "Standard" lunch performed better than the students in the "Free/Reduced" lunch category in all subjects.

In [None]:
# Number of students in each race/ethnicity group, coloured by
# test-preparation status, with one panel per gender.
sns.catplot(
    data=student_data,
    y="race/ethnicity",
    hue="test preparation course",
    col="gender",
    kind="count",
)

In [None]:
# Reading score against writing score, one tall narrow panel per
# race/ethnicity group; colour marks gender, marker style marks test prep.
sns.relplot(
    kind="scatter",
    data=student_data,
    x="writing score",
    y="reading score",
    col="race/ethnicity",
    hue="gender",
    style="test preparation course",
    height=7,
    aspect=0.3,
)

In [None]:
# Pairwise correlation between the numeric (score) columns.
# The frame still holds text columns (gender, lunch, ...). Calling .corr() on
# it directly raises on pandas >= 2.0, so restrict to numeric columns first;
# select_dtypes also works on older pandas versions.
corr = student_data.select_dtypes(include="number").corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size': 8}, cmap='YlGnBu')
# Render the figure; an argument-less plt.plot() draws nothing and only
# leaves an empty list as the cell output.
plt.show()

### This shows that the reading score and writing score are correlated with each other.
### Basically, students who scored better in the writing section also performed better in the reading section.

In [None]:
# Boxen plot of the math-score distribution for each parental education level.
# Rows are ordered by ascending math score before being handed to seaborn.
students_by_math = student_data.sort_values("math score")
sns.catplot(
    data=students_by_math,
    x="math score",
    y="parental level of education",
    kind="boxen",
)

In [None]:
# Bar chart of math score per parental education level, with separate bars
# for each test-preparation status. A wide aspect keeps the labels readable.
sns.catplot(
    data=student_data,
    x="parental level of education",
    y="math score",
    hue="test preparation course",
    kind="bar",
    aspect=3,
)

In [None]:
# Replace the text labels in the two-valued categorical columns with integer
# codes so that these columns can be related to the scores numerically.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

categ = ['gender', 'test preparation course', 'lunch']

# The encoder is refitted on each column in turn, so every column gets its
# own label-to-integer mapping.
le = LabelEncoder()
for column_name in categ:
    student_data[column_name] = le.fit_transform(student_data[column_name])

In [None]:
# Preview the first five rows to confirm the encoded columns now hold integers.
student_data.head()

In [None]:
# Pairwise correlation between all numeric columns of the frame.
# Any column that still holds text makes a bare .corr() raise on
# pandas >= 2.0, so restrict to numeric columns first; select_dtypes also
# works on older pandas versions.
corr = student_data.select_dtypes(include="number").corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size': 8}, cmap='YlGnBu')
# Render the figure; an argument-less plt.plot() draws nothing and only
# leaves an empty list as the cell output.
plt.show()

### This shows how different variables affect the math, reading, writing scores of a student.