#Student Grade Prediction using Random Forest Regressor

#Import libraries 

In [None]:
import pandas as pd  #data processing
import numpy as np #linear algebra
import matplotlib.pyplot as plt #data visualization
import seaborn as sns #statistical data visualization
import sklearn as sk #machine learning model

#Import dataset

In [None]:
# Location of the student-mat.csv file inside the Kaggle input directory.
DATASET_PATH = '/kaggle/input/student-grade-prediction/student-mat.csv'
df = pd.read_csv(DATASET_PATH)

#Exploratory data analysis

In [None]:
# Show the dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the dataset (pandas shows 5 rows by default).
df.head()

In [None]:
# Descriptive statistics of the dataset; with default arguments pandas
# summarises the numeric columns only.
df.describe()

####Missing values

In [None]:
# Draw df.isnull() as a heatmap so that any missing entry stands out as a
# cell of a different colour; row labels and the colour bar are hidden.
plt.figure(figsize=(12, 7))
missing_ax = sns.heatmap(
    df.isnull(),
    cmap='Blues_r',
    cbar=False,
    yticklabels=False,
    fmt='.1g',
)
missing_ax.set_title('Missing values in the dataset', size=15)

# To save and download the figure from Google Colab, uncomment:
# from google.colab import files
# plt.savefig("missingvalue.png")
# files.download("missingvalue.png")

####Outliers (numerical features)

In [None]:
# Horizontal box plots of the dataset columns, used to spot outliers.
# The theme is set after the figure is created, exactly as before, so the
# figure itself keeps the styling that was active when it was opened.
plt.figure(figsize=(15, 15))
sns.set_theme(style="ticks", font_scale=1.1)
outlier_ax = sns.boxplot(data=df, palette='Blues', orient='h')
outlier_ax.set_title('Outliers in the dataset', size=15)

#Exploratory graph analysis

In [None]:
# Overlay the kernel density estimates of the three grade columns on one
# set of axes. Each entry is (column, line-style overrides for that column).
fig, ax = plt.subplots(figsize=(12, 6))
grade_curves = (
    ('G1', {'color': '#B6D0E0'}),
    ('G2', {'color': '#B6D0E0', 'linestyle': 'dashed'}),
    ('G3', {'color': '#08306B'}),
)
for grade, line_style in grade_curves:
    # cut=0 stops each curve at the extreme observed values.
    sns.kdeplot(df[grade], linewidth=2, label=grade, cut=0, **line_style)

ax.set_xlabel('Grade distribution', fontsize=14)
ax.set_ylabel('Distribution (%)', fontsize=14)
ax.legend()
plt.show()

In [None]:
# Bar chart of how many students obtained each G3 value.
plt.figure(figsize=(10, 5))
g3_ax = sns.countplot(data=df, x='G3', color='#08306B')
g3_ax.set_xlabel('G3', fontsize=14)
g3_ax.set_ylabel('Number of students', fontsize=14)
plt.show()

In [None]:
# Bar chart of how many students there are for each value of 'age'.
plt.figure(figsize=(10, 5))
age_ax = sns.countplot(data=df, x='age', color='#08306B')
age_ax.set_xlabel('Age', fontsize=14)
age_ax.set_ylabel('Number of students', fontsize=14)
plt.show()

In [None]:
# Bar chart of how many students there are for each value of 'sex'.
plt.figure(figsize=(8, 5))
sex_ax = sns.countplot(data=df, x='sex', color='#08306B')
sex_ax.set_xlabel('Sex', fontsize=14)
sex_ax.set_ylabel('Number of students', fontsize=14)
plt.show()

In [None]:
# Box plot of the final grade (G3) grouped by the 'sex' column.
plt.figure(figsize=(7, 8))
b = sns.boxplot(x='sex', y='G3', data=df, color="#08306B")
# Optional title: b.axes.set_title('Sex vs final grade (G3)', fontsize=20)
b.set_xlabel('Sex', fontsize=14)
# Same label text seaborn would pick by default, but with the font size
# used by the other plots in this notebook.
b.set_ylabel('G3', fontsize=14)
# Render explicitly, like the sibling plot cells, instead of leaving the
# Text object returned by set_xlabel as the cell's output.
plt.show()

##Feature Engineering



In [None]:
# Check the data type of every column.
df.dtypes

In [None]:
# Names of the non-numerical columns that are converted to numerical
# labels before modelling.
categorical = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

In [None]:
# Transform non-numerical labels to numerical labels.
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder per column and keep each one in `encoders`, so the
# integer codes of any column can later be mapped back to the original
# labels with encoders[column].inverse_transform(...). A single shared
# encoder is refitted on every column and therefore only remembers the
# classes of the last column it saw.
le = LabelEncoder()
encoders = {}
for column in categorical:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# `le` is left fitted on the last categorical column, as it was previously.

df[categorical].head()

#Feature selection

####Correlation matrix (Spearman)

In [None]:
# Spearman rank correlation between every pair of columns.
corr = df.corr(method='spearman')

# Boolean mask that is True on and above the diagonal: those cells are
# hidden, leaving only the lower triangle of the matrix visible.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(32, 32))
    ax = sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        cmap="Blues",
        linewidths=.6,
        square=True,
    )
plt.title('Correlation Matrix (Spearman)', size=19)

In [None]:
# Rank every column by the absolute value of its Spearman correlation with
# G3, strongest first. The method is given explicitly because df.corr()
# defaults to Pearson, which would not match the Spearman matrix this
# section is about. G3 itself is included in the ranking.
most_correlated = (
    df.corr(method='spearman')['G3']
    .abs()
    .sort_values(ascending=False)
)
most_correlated

####Chi-2

In [None]:
# Target is the G3 column; the features are every other column of df.
y = df['G3']
X = df.drop(columns='G3')

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Chi-squared test of each column of X against y. The result is indexable;
# element [1] is what the following cell uses as the per-feature p-values.
# NOTE(review): SelectKBest is imported but not used in the visible code.
# NOTE(review): sklearn's chi2 expects non-negative feature values and
# treats y as class labels — confirm both are appropriate for this data.
chi_scores = chi2(X, y)
chi_scores

In [None]:
# Label the p-values (second element of the chi2 result) with the feature
# names and order them from smallest to largest.
p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=True)
p_values

In [None]:
# Bar chart of the sorted p-values, one bar per feature.
chi2_ax = p_values.plot.bar(cmap="Blues_r", figsize=(9, 5))
chi2_ax.set_title('Chi-square test for feature selection', size=18)


In [None]:
# Scatter plot with a fitted regression line of G3 against 'failures'.
plt.figure(figsize=(6, 6))
# x and y are passed by keyword: current seaborn releases accept only
# `data` positionally, so the positional form raises a TypeError.
sns.regplot(x='failures', y='G3', data=df)
plt.show()

#Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Work on a reduced frame: G1, G2 and absences as predictors, G3 as target.
df2 = df[['G1', 'G2', 'G3', 'absences']]

y = df2['G3']  # dependent variable
X = df2.drop(columns='G3')  # independent variables

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = split

In [None]:
# Check the (rows, columns) shape of the training and test feature sets.
X_train.shape, X_test.shape

##Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 10 trees; the fixed random_state makes repeated runs build the
# same model.
forest_params = {'n_estimators': 10, 'random_state': 0}
rf = RandomForestRegressor(**forest_params)
rf.fit(X_train, y_train)

In [None]:
# Predict G3 for the held-out test features with the fitted forest.
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 (coefficient of determination) of the predictions on the test set.
test_r2 = r2_score(y_test, y_pred)
print(test_r2)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the test set.
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)

In [None]:
# Scatter plot with a fitted regression line of the predicted values
# against the true test values.
plt.figure(figsize=(6, 6))

# x and y are passed by keyword: current seaborn releases accept only
# `data` positionally, so the positional form raises a TypeError.
sns.regplot(x=y_test, y=y_pred)
plt.xlabel('Y Test')
plt.ylabel('Y Predicted')
plt.show()

In [None]:
# Distribution plot of the true test-set target values (only y_test is
# passed to the plot).
# NOTE(review): the title refers to the Random Forest Regressor, but no
# model output (y_pred) is plotted here — confirm whether the residuals
# (y_test - y_pred) or the predictions were intended.
sns.displot(y_test)
plt.title("Random Forest Regressor plot")
plt.show()