In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the student exam performance data into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/students-performance-in-exams/StudentsPerformance.csv'
)

In [None]:
# Column dtypes and non-null counts, followed by a preview of the first rows.
df.info()
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Number of students per gender.
# The column is passed by name via x=/data=: recent seaborn releases take
# `data` as the first positional parameter, so a positional Series is no
# longer interpreted as the x variable.
plt.figure(figsize=(12, 10))
sns.countplot(x='gender', data=df)
plt.title('Number of students by gender');

The split between male and female students is roughly even.

In [None]:
# Students per race/ethnicity group, split by gender.
# Columns are referenced by name (x=/hue=/data=) so the call works on
# seaborn releases where positional data vectors are no longer accepted.
plt.figure(figsize=(12, 10))
sns.countplot(x='race/ethnicity', hue='gender', data=df)
plt.title('Number of students by race/ethnicity and gender');

The numbers of male and female students are roughly even within every race/ethnicity group.

In [None]:
# How many distinct lunch types are present (missing values are not counted).
df['lunch'].nunique(dropna=True)

In [None]:
# Number of students per lunch type.
# Uses x=/data= keywords; a positional Series is not treated as the x
# variable by recent seaborn releases.
plt.figure(figsize=(12, 10))
sns.countplot(x='lunch', data=df)
plt.title('Number of students by lunch type');

Whether a student receives a free/reduced lunch or a standard lunch may play a part in overall test scores.

In [None]:
# How many distinct test-preparation statuses are present (missing values are not counted).
df['test preparation course'].nunique(dropna=True)

In [None]:
# Number of students per test-preparation status.
# Uses x=/data= keywords; a positional Series is not treated as the x
# variable by recent seaborn releases.
plt.figure(figsize=(12, 10))
sns.countplot(x='test preparation course', data=df)
plt.title('Number of students by test preparation course');

More students did not take the test preparation course than completed it.

In [None]:
# Number of students per race/ethnicity group.
# Uses x=/data= keywords; a positional Series is not treated as the x
# variable by recent seaborn releases.
plt.figure(figsize=(12, 10))
sns.countplot(x='race/ethnicity', data=df)
plt.title('Number of students by race/ethnicity');

Group C and D seem to be the top two most common races in the data set

In [None]:
# Students per race/ethnicity group, split by lunch type.
# Columns are referenced by name (x=/hue=/data=) so the call works on
# seaborn releases where positional data vectors are no longer accepted.
plt.figure(figsize=(12, 10))
sns.countplot(x='race/ethnicity', hue='lunch', data=df)
plt.title('Number of students by race/ethnicity and lunch type');

In [None]:
# Students per race/ethnicity group, split by test-preparation status.
# Columns are referenced by name (x=/hue=/data=) so the call works on
# seaborn releases where positional data vectors are no longer accepted.
plt.figure(figsize=(12, 10))
sns.countplot(x='race/ethnicity', hue='test preparation course', data=df)
plt.title('Number of students by race/ethnicity and test preparation course');

In [None]:
# Correlation heatmap of the score columns.
# At this point df still holds string columns; DataFrame.corr() raises on
# them in pandas 2.x, so restrict the frame to numeric columns first.
plt.figure(figsize=(12, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')
plt.title('Correlation between subject scores');

As expected there is a high correlation between reading and writing scores, both being subsets of the language category

In [None]:
# Pairwise relationships between the columns of df.
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call only produced an extra empty figure and
# has been removed.
sns.pairplot(df);

# Data Manipulation

In [None]:
# Creating numerical values for categorical data

In [None]:
# Re-check the column dtypes before the categorical columns are encoded below.
df.info()

In [None]:
# Names of every column whose dtype is "object" (the categorical text columns).
objects = [column for column, dtype in df.dtypes.items() if dtype == "object"]

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column. NOTE: this rebinds `df`, so the cell is not
# idempotent if re-run on its own.
df = pd.get_dummies(df, columns=objects, drop_first=True)

In [None]:
# Preview the encoded frame.
df.head(n=5)

In [None]:
# Work on a copy of df, since a separate model is built for each subject score.
math_df = df.copy(deep=True)

# Random Forest Model (Math Score Prediction)

In [None]:
# Features: every column except the math score (this includes the reading
# and writing scores). Target: the math score, kept as a pandas Series.
X = math_df.drop(columns='math score').values
y = math_df['math score']

In [None]:
# Splitting Data into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [None]:
# Scaling Data
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardising scaler with its default settings spelled out.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest regressor with 200 trees.
# random_state is fixed (same seed as the train/test split) so the reported
# error metrics are reproducible across runs.
rf = RandomForestRegressor(n_estimators=200, random_state=101)

In [None]:
# Fit the forest on the scaled training data.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predict math scores for the held-out test rows.
predictions = rf.predict(X=X_test)

In [None]:
# Evaluating Model

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Mean absolute error on the test set.
print('MAE: ', mean_absolute_error(y_true=y_test, y_pred=predictions))

In [None]:
# Mean squared error on the test set.
print('MSE: ', mean_squared_error(y_true=y_test, y_pred=predictions))

In [None]:
# Root mean squared error on the test set.
print('RMSE: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)))

In [None]:
# Distribution of the residuals (actual minus predicted math score).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the documented replacements.
plt.figure(figsize=(12, 10))
sns.distplot(a=(y_test - predictions), bins=50, color='gray')

The residuals follow an approximately normal curve, which suggests the model's errors are mostly random noise rather than a systematic bias, i.e. the model is a good fit.

# Random Forest Model (Reading Score Prediction) 

In [None]:
# Independent copy of the encoded frame for the reading-score model.
reading_df = df.copy(deep=True)

In [None]:
# Features: every column except the reading score (this includes the math
# and writing scores). Target: the reading score as a NumPy array.
X = reading_df.drop(columns="reading score").values
y = reading_df["reading score"].values

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [None]:
# Re-fit the shared StandardScaler on the reading training features, then
# apply the same transformation to the training and test features.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fresh random forest regressor (200 trees) for the reading-score model.
# random_state is fixed (same seed as the train/test split) so the reported
# error metrics are reproducible across runs.
rf = RandomForestRegressor(n_estimators=200, random_state=101)

In [None]:
# Fit the forest on the scaled training data.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predict reading scores for the held-out test rows.
predictions = rf.predict(X=X_test)

In [None]:
# Evaluating Model

In [None]:
# Test-set error metrics for the reading-score model.
print('MAE: ', mean_absolute_error(y_true=y_test, y_pred=predictions))
print('MSE: ', mean_squared_error(y_true=y_test, y_pred=predictions))
print('RMSE: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)))

In [None]:
# Distribution of the residuals (actual minus predicted reading score).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the documented replacements.
plt.figure(figsize=(12, 10))
sns.distplot(a=(y_test - predictions), bins=50, color='gray')

The model's residuals once again follow an approximately normal curve, suggesting the model is a good fit.

# Random Forest Model (Writing Score Prediction)

In [None]:
# Use a dedicated copy for the writing-score model instead of reusing
# reading_df (a copy-paste leftover from the reading section), matching how
# the math and reading sections each work on their own copy of df.
writing_df = df.copy()

# Features: every column except the writing score. Target: the writing score.
X = writing_df.drop("writing score", axis=1).values
y = writing_df["writing score"].values

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [None]:
# Re-fit the shared StandardScaler on the writing training features, then
# apply the same transformation to the training and test features.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fresh random forest regressor (200 trees) for the writing-score model.
# random_state is fixed (same seed as the train/test split) so the reported
# error metrics are reproducible across runs.
rf = RandomForestRegressor(n_estimators=200, random_state=101)

In [None]:
# Fit the forest on the scaled training data.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predict writing scores for the held-out test rows.
predictions = rf.predict(X=X_test)

In [None]:
# Evaluating the Model

In [None]:
# Test-set error metrics for the writing-score model.
print('MAE: ', mean_absolute_error(y_true=y_test, y_pred=predictions))
print('MSE: ', mean_squared_error(y_true=y_test, y_pred=predictions))
print('RMSE: ', np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions)))

In [None]:
# Distribution of the residuals (actual minus predicted writing score).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the documented replacements.
plt.figure(figsize=(12, 10))
sns.distplot(a=(y_test - predictions), bins=50, color='gray')

The residuals follow an approximately normal curve, suggesting the model is a good fit.

# Conclusion

In [None]:
# Report the minimum and maximum of each subject score, one line per value.
print(
    "Math min score was: {} ".format(df['math score'].min()),
    "Math max score was: {} ".format(df['math score'].max()),
    "Reading min score was: {} ".format(df['reading score'].min()),
    "Reading max score was: {} ".format(df['reading score'].max()),
    "Writing min score was: {} ".format(df['writing score'].min()),
    "Writing max score was: {} ".format(df['writing score'].max()),
    sep="\n",
)

The Random Forest models predicted the Math, Reading, and Writing scores quite accurately. The Writing scores were predicted most accurately, followed by the Reading scores, and lastly the Math scores. Considering the wide range of scores, an average error of around 4–6% for every subject is small. Note that each model used the other two subject scores as input features, which likely contributes to this accuracy.

# Please let me know what you think!

# Thank You!!!