In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the exam-results CSV into the DataFrame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer="../input/students-performance-in-exams/StudentsPerformance.csv"
)

Quick glance at the data

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Summary statistics, with the default quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

Visual Representation of Data

In [None]:
# Count of rows for each value of the 'gender' column.
sns.countplot(data=df, x='gender')

In [None]:
# Count of rows for each value of the 'race/ethnicity' column.
sns.countplot(data=df, x='race/ethnicity')

In [None]:
# Draw the parental-education counts on an explicit 12x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='parental level of education', data=df, ax=ax)
# Use plt.show() rather than fig.show(): Figure.show() expects a GUI backend
# and only emits a warning under the inline notebook backend.
plt.show()

In [None]:
# Count of rows for each value of the 'lunch' column.
sns.countplot(data=df, x='lunch')

In [None]:
# Count of rows for each value of the 'test preparation course' column.
# The column must be passed as `x=`: recent seaborn releases take `data` as the
# first positional parameter, so a positional column name clashes with
# `data=df` and raises a TypeError.
sns.countplot(x='test preparation course', data=df)

Distribution of target variables

In [None]:
# Histogram with a KDE overlay for the maths scores.
# sns.distplot is deprecated; histplot on a density scale with kde=True is the
# documented replacement for the same histogram + KDE view.
sns.histplot(df['math score'], kde=True, stat='density')

In [None]:
# Histogram with a KDE overlay for the reading scores.
# sns.distplot is deprecated; histplot on a density scale with kde=True is the
# documented replacement for the same histogram + KDE view.
sns.histplot(df['reading score'], kde=True, stat='density')

In [None]:
# Histogram with a KDE overlay for the writing scores.
# sns.distplot is deprecated; histplot on a density scale with kde=True is the
# documented replacement for the same histogram + KDE view.
sns.histplot(df['writing score'], kde=True, stat='density')

Relation of target variables

In [None]:
# Correlation heatmap of the numeric columns.
# Select the numeric columns explicitly before calling corr(): pandas >= 2.0
# raises on non-numeric columns instead of silently dropping them, and
# select_dtypes works on older pandas versions as well.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

In [None]:
# A student who performs well in one subject is expected to score well in the remaining two subjects.

In [None]:
# Maths score against reading score, one point per row.
sns.scatterplot(data=df, x='math score', y='reading score')

In [None]:
# Reading score against writing score, one point per row.
sns.scatterplot(data=df, x='reading score', y='writing score')

In [None]:
# Maths score against writing score, one point per row.
sns.scatterplot(data=df, x='math score', y='writing score')

Comparison of marks based on various factors

In [None]:
# Mean of each numeric column per gender.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by='gender').mean(numeric_only=True)

In [None]:
# Bar chart of the mean of each numeric column per gender.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by='gender').mean(numeric_only=True).plot.bar()

Female students tend to score more marks in reading and writing while male students score more in maths.

In [None]:
# Mean of each numeric column per race/ethnicity group.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by='race/ethnicity').mean(numeric_only=True)

In [None]:
# Bar chart of the mean of each numeric column per race/ethnicity group.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by=['race/ethnicity']).mean(numeric_only=True).plot.bar()

Students of race/ethnicity of group E tend to score more marks in all subjects than students of other groups.

In [None]:
# Mean of each numeric column per parental level of education.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by='parental level of education').mean(numeric_only=True)

In [None]:
# Bar chart of the mean of each numeric column per parental level of education.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by=['parental level of education']).mean(numeric_only=True).plot.bar()

Students' scores tend to increase with their parents' level of education. Students whose parents hold a master's degree
score noticeably higher on average than students whose parents have only a high school education.

In [None]:
# Mean of each numeric column per lunch type.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by='lunch').mean(numeric_only=True)

In [None]:
# Bar chart of the mean of each numeric column per lunch type.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by=['lunch']).mean(numeric_only=True).plot.bar()

Students who pay the standard fee for their lunch score higher on average. One plausible explanation is that they may have a better study environment because of their family's financial situation. This may relate directly to the parental level of education, since higher education usually leads to a higher salary and greater financial stability.

In [None]:
# Mean of each numeric column per test-preparation-course status.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by=['test preparation course']).mean(numeric_only=True)

In [None]:
# Bar chart of the mean of each numeric column per test-preparation-course status.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# non-numeric columns automatically and raises a TypeError instead.
df.groupby(by=['test preparation course']).mean(numeric_only=True).plot.bar()

As expected, students who completed the test preparation course score higher than students who did not.

In [None]:
# End of Exploratory Data Analysis

Machine Learning using Random Forest

Importing libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error

Preparing data for Machine Learning

In [None]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column so the indicator columns are not redundant.
df1 = pd.get_dummies(df, drop_first=True)

In [None]:
# Preview the first five rows of the encoded frame.
df1.head(n=5)

Separating and Splitting the data

In [None]:
# Features are every encoded column except the three exam scores; each score
# column becomes its own regression target.
target_columns = ['math score', 'reading score', 'writing score']
X = df1.drop(columns=target_columns)
y_maths, y_reading, y_writing = (df1[name] for name in target_columns)

In [None]:
# Split the features and all three targets in a single call so every train/test
# array shares the same row partition. Previously X_train and X_test were
# reassigned three times and only lined up with each target because every call
# reused the same random_state.
(
    X_train, X_test,
    y_train_maths, y_test_maths,
    y_train_reading, y_test_reading,
    y_train_writing, y_test_writing,
) = train_test_split(
    X, y_maths, y_reading, y_writing, test_size=0.33, random_state=42
)

Creating model

In [None]:
# min_samples_leaf=20 requires at least 20 training samples in every leaf.
# random_state pins the forest's random sampling so the errors reported by the
# cells below are reproducible from run to run.
rfr = RandomForestRegressor(min_samples_leaf=20, random_state=42)

Training the model

For maths

In [None]:
# Fit on the maths target, predict the held-out rows and report the mean
# absolute error. fit() returns the estimator itself, so the calls can be chained.
pred_maths = rfr.fit(X_train, y_train_maths).predict(X_test)
mae_maths = mean_absolute_error(y_test_maths, pred_maths)
print(mae_maths)

For reading

In [None]:
# Refit the same estimator on the reading target, predict the held-out rows and
# report the mean absolute error. fit() returns the estimator itself.
pred_reading = rfr.fit(X_train, y_train_reading).predict(X_test)
mae_reading = mean_absolute_error(y_test_reading, pred_reading)
print(mae_reading)

For writing

In [None]:
# Refit the same estimator on the writing target, predict the held-out rows and
# report the mean absolute error. fit() returns the estimator itself.
pred_writing = rfr.fit(X_train, y_train_writing).predict(X_test)
mae_writing = mean_absolute_error(y_test_writing, pred_writing)
print(mae_writing)

End of Machine Learning Section