In [None]:
import os

#list every file found under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# 00. Data

Data collected from: https://www.kaggle.com/spscientist/students-performance-in-exams

# 01. Importing libraries and loading the data

In [None]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

#read the students' performance CSV into a dataframe
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

#preview the first five rows of the dataset
df.head()

# 02. Basic information about the dataset

In [None]:
#number of rows and columns, returned as a (rows, columns) tuple
df.shape

In [None]:
#listing the column names of the dataset
df.columns

In [None]:
#information about the dataset (type of each column, whether there are null values)
df.info()

In [None]:
#basic descriptive statistics of the dataset (only for numeric columns)
df.describe()

In [None]:
#check median of math, reading and writing score
#numeric_only=True skips the text columns (gender, lunch, ...); without it
#pandas >= 2.0 raises a TypeError instead of silently dropping them
df.median(numeric_only=True)

In [None]:
#count the missing (NaN) values in each column
df.isna().sum()

# 03. Feature engineering

Can I create new columns? Can I change something in the columns that already exist?

In [None]:
#view the first five rows of the dataset again before adding new columns
df.head()

In [None]:
#creating a new column: mean of the three tests, rounded to the nearest whole number
score_columns = ["math score", "reading score", "writing score"]
df["mean score"] = (df[score_columns].sum(axis=1) / len(score_columns)).round()

In [None]:
#checking the new "mean score" column
df.head()

# 04. Exploratory Data Analysis (EDA)

## Getting more information about the data

In [None]:
#names of the categorical columns
categorical_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]

In [None]:
#one count plot per categorical column: how many rows fall in each category
for column in categorical_columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=column, palette="rocket")

In [None]:
#names of the numerical columns (the three tests plus the engineered mean)
numerical_columns = [
    "math score",
    "reading score",
    "writing score",
    "mean score",
]

In [None]:
#viewing the distribution for each numerical column
#sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
#histplot with kde=True and stat="density" draws the same histogram + density curve
for column in numerical_columns:
    plt.figure(figsize=(12, 10))
    sns.histplot(df[column], kde=True, stat="density")

## Hypothesis test

### H1. Women get higher grades than men.

True.
* Math score: male > female
* Reading score: female > male
* Writing score: female > male

- In general: female > male

In [None]:
#mean of every test (and of the overall mean score) for each gender
score_columns = ["math score", "reading score", "writing score", "mean score"]
df1 = df.groupby("gender")[score_columns].mean()
df1

In [None]:
#resetting the index of df1, now "gender" is a regular column that can be plotted
df1 = df1.reset_index()
df1

In [None]:
#plotting the mean of each test by gender, one panel per test
plt.figure(figsize=(14, 8))
for position, score in enumerate(["math score", "reading score", "writing score"], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=df1["gender"], y=df1[score], palette="rocket")

In [None]:
#in general: mean score by gender
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=df1["gender"], y=df1["mean score"], palette="rocket", ax=ax);

### H2. The higher the grade on the math test, the higher the grade on the reading test.

True. For male and female.

In [None]:
#scatter plot of math score against reading score, coloured by gender
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Reading score x Math score")
sns.scatterplot(data=df, x="math score", y="reading score", hue="gender", palette="rocket", ax=ax);

### H3. The higher the parents' level of education, the higher the student's grade.

True. It's possible to see that students with a higher parental level of education (bachelor's and master's) have higher scores.

In [None]:
#view the first five rows to recall the available columns
df.head()

In [None]:
#mean of every test for each parental level of education
score_columns = ["math score", "reading score", "writing score", "mean score"]
df2 = df.groupby("parental level of education")[score_columns].mean()
df2

In [None]:
#resetting the index of df2, now "parental level of education" is a regular column
df2 = df2.reset_index()
df2

In [None]:
#plotting the scores by parental level of education, one figure per test
#df2 has exactly one row per category, so there is nothing to build an error bar
#from; the undocumented ci=False (the `ci` parameter is deprecated since seaborn
#0.12) is dropped without changing the plot
for score in ["math score", "reading score", "writing score"]:
    plt.figure(figsize=(12, 8))
    sns.barplot(x=df2["parental level of education"], y=df2[score], palette="rocket")

In [None]:
#in general: mean score by parental level of education
#df2 has one row per category, so no error bar can be drawn; the deprecated
#ci=False argument is dropped without changing the plot
plt.figure(figsize=(12, 8))
plt.title("Parental level education x Mean score of students")
sns.barplot(x=df2["parental level of education"], y=df2["mean score"], palette="rocket");

### H4. Students who have completed the preparation course do better.

True. 

In [None]:
#view the first five rows to recall the available columns
df.head()

In [None]:
#seeing the unique values of the "test preparation course" column
df["test preparation course"].unique()

In [None]:
#mean of every test for each test preparation course category
score_columns = ["math score", "reading score", "writing score", "mean score"]
df3 = df.groupby("test preparation course")[score_columns].mean()
df3

In [None]:
#resetting the index, now "test preparation course" is a regular column
df3 = df3.reset_index()

In [None]:
#mean of each test by test preparation course, one panel per test
plt.figure(figsize=(14, 8))
for position, score in enumerate(["math score", "reading score", "writing score"], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=df3["test preparation course"], y=df3[score], palette="mako")

In [None]:
#in general: mean score by test preparation course
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=df3["test preparation course"], y=df3["mean score"], palette="mako", ax=ax);

### H5. Students who eat "Standard Lunch" get better grades.

True.

In [None]:
#view the first five rows to recall the available columns
df.head()

In [None]:
#mean of every test for each lunch type
score_columns = ["math score", "reading score", "writing score", "mean score"]
df4 = df.groupby("lunch")[score_columns].mean()
df4

In [None]:
#resetting the index, now "lunch" is a regular column
df4 = df4.reset_index()

In [None]:
#mean of each test by lunch type, one panel per test
plt.figure(figsize=(14, 8))
for position, score in enumerate(["math score", "reading score", "writing score"], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=df4["lunch"], y=df4[score], palette="mako")

In [None]:
#in general: mean score by lunch type
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=df4["lunch"], y=df4["mean score"], palette="mako", ax=ax);

### H6. The higher the grade in the reading test the higher the grade in the writing test.

True. For male and female.

In [None]:
#scatter plot of reading score against writing score, coloured by gender
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Reading score x Writing score")
sns.scatterplot(data=df, x="reading score", y="writing score", hue="gender", palette="rocket", ax=ax);

### H7. Race / ethnicity does not influence students' grades.

False. <br>
Group E > Group D > Group C > Group B > Group A

In [None]:
#view the first five rows to recall the available columns
df.head()

In [None]:
#mean of every test for each race/ethnicity group
score_columns = ["math score", "reading score", "writing score", "mean score"]
df5 = df.groupby("race/ethnicity")[score_columns].mean()
df5

In [None]:
#resetting the index, now "race/ethnicity" is a regular column
df5 = df5.reset_index()

In [None]:
#visualizing this dataframe, one figure per test
#df5 has exactly one row per group, so there is nothing to build an error bar
#from; the undocumented ci=False (the `ci` parameter is deprecated since seaborn
#0.12) is dropped without changing the plot
for score in ["math score", "reading score", "writing score"]:
    plt.figure(figsize=(12, 8))
    sns.barplot(x=df5["race/ethnicity"], y=df5[score], palette="viridis")

In [None]:
#in general: mean score by race/ethnicity group
#df5 has one row per group, so no error bar can be drawn; the deprecated
#ci=False argument is dropped without changing the plot
plt.figure(figsize=(12, 8))
sns.barplot(x=df5["race/ethnicity"], y=df5["mean score"], palette="viridis");

## Visualizing the correlation between the variables

In [None]:
#correlation between the scores
#only the numeric columns are kept: calling corr() on a frame that still holds
#text columns raises an error on pandas >= 2.0
corr_matrix = df.select_dtypes(include="number").corr()
corr_matrix

In [None]:
#heatmap of corr_matrix with the coefficient written in every cell
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax);

# 05. Feature encoding

In [None]:
#listing the columns again; the categorical ones are the ones to encode
df.columns

In [None]:
#features: every column except the target; target: the mean score
#NOTE(review): "mean score" was computed from the math/reading/writing scores,
#which all remain in X, so the target can be rebuilt directly from the features
#(target leakage) - confirm this is intended before trusting the metrics below
X = df.drop("mean score", axis=1)
y = df["mean score"]

In [None]:
# categorical boolean mask: True for every non-numeric column
# (checking is_numeric_dtype instead of `dtype == object` also catches columns
# stored with pandas' dedicated string dtypes, which are not `object`)
categorical_feature_mask = ~X.dtypes.map(pd.api.types.is_numeric_dtype)

# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()

categorical_cols

In [None]:
#import labelencoder
from sklearn.preprocessing import LabelEncoder

#one LabelEncoder per categorical column, all kept in a dict: a single shared
#encoder is refit on every column and only remembers the classes of the last
#one, which makes the integer codes of the other columns impossible to decode
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    X[col] = encoder.fit_transform(X[col])

#`le` stays available for later cells: it is the encoder of the last categorical column
le = label_encoders[categorical_cols[-1]] if categorical_cols else LabelEncoder()

X[categorical_cols].head(10)

In [None]:
#classes learned by le; le was last fit on the final categorical column, so
#only that column's labels are listed here
le.classes_

# 06. Preparing the data

In [None]:
#checking X (the features, with the categorical columns already encoded)
X.head()

In [None]:
#checking y (the target: "mean score")
y.head()

In [None]:
#importing train_test_split
from sklearn.model_selection import train_test_split

#creating the train and test sets (80% / 20%)
#random_state is fixed so the split, and every metric computed from it, is
#the same on each run of the notebook
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=42)

#seeing shape of X_train, X_test, y_train, y_test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 07. Fitting the model

### RandomForestRegressor

In [None]:
#importing the RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

#instantiating the model; random_state is fixed so the fitted forest (and its
#predictions) are the same on each run of the notebook
model = RandomForestRegressor(random_state=42)

#fitting the model on the training set
model.fit(X_train, y_train)

# 08. Making Prediction

### RandomForestRegressor

In [None]:
#making predictions for the held-out test set
y_preds = model.predict(X_test)

In [None]:
#seeing the first 50 predictions
y_preds[:50]

# 09. Evaluating the model

### RandomForestRegressor

In [None]:
#evaluating the model on the test set using its score() method
model.score(X_test, y_test)

In [None]:
#evaluating the model using cross-validation
from sklearn.model_selection import cross_val_score

#5-fold cross-validation over the full X and y; scoring=None keeps the estimator's default scorer
scores = cross_val_score(model, X, y, cv=5, scoring=None)

#printing the mean of all fold scores
print(scores.mean())

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#test-set metrics, printed in this order:
#r^2 (coefficient of determination), mean absolute error (MAE), mean squared error (MSE)
for metric in (r2_score, mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_preds))