In [None]:
import math
import os

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from catboost.utils import get_gpu_device_count
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, explained_variance_score, max_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# **Data preprocessing**

In [None]:
# Read student-mat.csv from the relative ../input directory and preview the first rows.
data = pd.read_csv('../input/student-grade-prediction/student-mat.csv')
data.head()

Let's see what types of data are in the dataset:

In [None]:
# Summary of the columns: dtypes and non-null counts.
data.info()

We can see that Medu and Fedu (mother's education and father's education) were interpreted as numeric features, although they are categorical. So we need to change their type:

In [None]:
# Store both parental-education columns as 'object' so that they are picked up
# as categorical features further down.
data = data.astype({'Medu': 'object', 'Fedu': 'object'})

Let's create a list with categorical features' names

In [None]:
# Names of all object-typed columns, in the order they appear in `data`.
# (Going through set() gave an order that changed between kernel restarts,
# which made the printed output below non-reproducible.)
cat_features = data.select_dtypes(include='object').columns.tolist()
cat_features

In [None]:
# For every categorical feature: its name, the frequency of each level, then a blank line.
for cat_feat in cat_features:
    print(cat_feat, data[cat_feat].value_counts(), '', sep='\n')

# **Visualisation**

In [None]:
# Correlation matrix.
# Only numeric columns are used: the frame also holds object (categorical)
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them.
corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(15,15))
sns.heatmap(corr, annot=True)

In [None]:
# Walc and Dalc are strongly correlated with each other, so we remove one of them from the dataset.
# errors='ignore' keeps this cell re-runnable after Walc has already been dropped.
data = data.drop(columns=['Walc'], errors='ignore')

We also remove observations where the grade is equal to 0, because it might mean that the student simply didn't turn up for the examination:

In [None]:
# Independent copy of the rows whose first-period grade is above zero.
data0 = data.loc[data['G1'] > 0].copy()

Below are some helper functions that make it easier to draw the graphs.

In [None]:
def get_grades(data):
  """Stack the G1, G2 and G3 columns into one long-format frame.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'G1', 'G2', 'G3', 'sex' and 'school'.

  Returns
  -------
  pandas.DataFrame
      Columns 'grade', 'sex', 'school' and 'semester'. The rows of G1 come
      first, then G2, then G3; 'semester' holds the string '1', '2' or '3'.
      The original index labels are kept, so each label appears three times.
  """
  shared_cols = ['sex', 'school']
  per_semester = []
  for number, grade_col in enumerate(['G1', 'G2', 'G3'], start=1):
    frame = data[[grade_col] + shared_cols].rename(columns={grade_col: 'grade'})
    per_semester.append(frame.assign(semester=str(number)))
  return pd.concat(per_semester)

In [None]:
# Human-readable axis / legend titles, keyed by column name.
_AXIS_TITLES = {
  # object variables
  'reason': 'Reason to choose this school',
  'sex': 'Sex',
  'address': "Student's home address type",
  'school': 'School',
  'famsize': 'Family size',
  'Pstatus': "Parent's cohabitation status",
  'Mjob': "Mother's job",
  'Fjob': "Father's job",
  'guardian': "student's guardian",
  'famsup': 'Family educational support ',
  'paid': 'Extra paid classes within the course subject',
  'schoolsup': 'Extra educational support',
  'nursery': 'Attended nursery school',
  'activities': 'Extra-curricular activities',
  'semester': 'Semester',
  'Medu': "Mother's education",
  'Fedu': "Father's education",
  'higher': 'Wants to take higher education',
  'internet': 'Internet access at home',
  'romantic': 'With a romantic relationship',

  # numeric variables
  'age': 'Age',
  'traveltime': 'Home to school travel time ',
  'studytime': 'Weekly study time',
  'failures': 'Number of past class failures',
  'famrel': 'Quality of family relationships',
  'freetime': 'Free time after school',
  'goout': 'Going out with friends',
  'Dalc': 'Workday alcohol consumption',
  'Walc': 'Weekend alcohol consumption',
  'health': 'Current health status',
  'absences': 'Number of school absences',

  # resultive variables
  'G1': 'First period grade',
  'G2': 'Second period grade',
  'G3': 'Final grade',
  'grade': 'Grade',
}


def name_of_axe(var):
  """Return the display title for a column name.

  Parameters
  ----------
  var : str
      Column name, e.g. 'sex', 'traveltime' or 'G3'.

  Returns
  -------
  str
      The title to show on an axis or legend for that column.

  Raises
  ------
  ValueError
      If ``var`` has no known title. (Previously this case printed a message
      and then failed with an UnboundLocalError on the return statement.)
  """
  try:
    return _AXIS_TITLES[var]
  except (KeyError, TypeError):
    # TypeError covers unhashable arguments, which cannot be dictionary keys.
    raise ValueError(
        'ERROR: there is no variable with given name: {!r}'.format(var)
    ) from None

In [None]:
def make_barplot(data, X, Y, HUE, PALETTE, type):
  """Draw a seaborn bar plot of ``Y`` against ``X``, optionally split by ``HUE``.

  Parameters
  ----------
  data : pandas.DataFrame
      Source data.
  X, Y : str
      Column names for the x and y axes. In 'changes in grades' mode they
      refer to columns of the frame returned by ``get_grades(data)``
      ('grade', 'sex', 'school', 'semester').
  HUE : str or None
      Column used to split the bars; ``None`` draws a single series and no legend.
  PALETTE : str
      Palette name passed to ``sns.set_theme``.
  type : {'typical', 'changes in grades'}
      'typical' plots ``data`` directly and uses the title of ``Y`` as the
      plot title; 'changes in grades' plots the long-format grades frame.

  Raises
  ------
  ValueError
      If ``type`` is not one of the two supported modes (previously the
      function returned without drawing anything).
  """
  if type == 'typical':
    plot_data = data
    title = name_of_axe(Y)
    y_label = "Mean"
  elif type == 'changes in grades':
    plot_data = get_grades(data)
    title = 'Average score changing'
    y_label = "Average score"
  else:
    raise ValueError(
        "type must be 'typical' or 'changes in grades', got {!r}".format(type)
    )

  x_label = name_of_axe(X)
  # Resolve the legend title up front so that a bad HUE fails before anything is drawn.
  hue_label = name_of_axe(HUE) if HUE is not None else None

  sns.set_theme(style="whitegrid", palette=PALETTE)
  ax = sns.barplot(x=X, y=Y, hue=HUE, data=plot_data)
  ax.set_title(title, fontsize=16)
  ax.set_xlabel(x_label, fontsize=14)
  ax.set_ylabel(y_label, fontsize=14)

  if HUE is not None:
    # Anchor the legend in figure coordinates, at the figure's top-right corner.
    legend = ax.legend(bbox_to_anchor=(1, 1),
                       bbox_transform=plt.gcf().transFigure,
                       loc='upper left')
    legend.set_title(hue_label)

  plt.show()


In [None]:
def make_countplot(data, X, HUE, PALETTE):
  """Draw a seaborn count plot of column ``X``, optionally split by ``HUE``.

  Parameters
  ----------
  data : pandas.DataFrame
      Source data.
  X : str
      Column whose values are counted.
  HUE : str or None
      Column used to split the bars; ``None`` draws a single series and no legend.
  PALETTE : str
      Palette name passed to ``sns.set_theme``.
  """
  x_label = name_of_axe(X)
  with_hue = HUE is not None
  if with_hue:
    legend_title = name_of_axe(HUE)

  sns.set_theme(style="whitegrid", palette=PALETTE)
  chart = sns.countplot(x=X, hue=HUE, data=data)
  chart.set_xlabel(x_label, fontsize=14)
  chart.set_ylabel("Count", fontsize=14)

  if with_hue:
    # Anchor the legend in figure coordinates, at the figure's top-right corner.
    legend = plt.gca().legend(bbox_to_anchor=(1, 1),
                              bbox_transform=plt.gcf().transFigure,
                              loc='upper left')
    legend.set_title(legend_title)

  plt.show()

In [None]:
# Palette names used for the graphs and charts below; cells pick one by index.
# Extend the list with other palette names as needed.
PALETTES = ['gray_r', 'crest', 'magma', 'viridis', 'rocket_r', 'rocket', 'mako', 'gray', 'Greens']

In [None]:
# Bar plot of the final grade (G3) by sex, split by romantic-relationship status.
X = 'sex'
Y = 'G3'
HUE = 'romantic'
PALETTE = PALETTES[3]

make_barplot(data, X, Y, HUE, PALETTE, 'typical')

In [None]:
# Number of students per travel-time value, split by home address type.
X = 'traveltime'
HUE = 'address' 
PALETTE = PALETTES[5] 

make_countplot(data, X, HUE, PALETTE)

In [None]:
# Bar plot of the grade per semester (G1, G2, G3 stacked by get_grades), split by school.
X = 'semester'
Y = 'grade'
HUE = 'school' 
PALETTE = PALETTES[2] 

make_barplot(data, X, Y, HUE, PALETTE, 'changes in grades')

# **Helper functions for model evaluation**

In [None]:
def calculate_metrics(y_true, y_pred, type):
  '''
  Compute regression metrics and print a header naming the evaluated split.

  y_true, y_pred -- true and predicted target values
  type -- 'train', 'val' or 'test'; selects the header that is printed
          (any other value prints nothing)

  Returns a dict with the keys 'R^2', 'Explained Variance Score',
  'Max Error' and 'Mean Absolute Error'.
  '''
  scorers = (
      ('R^2', r2_score),
      ('Explained Variance Score', explained_variance_score),
      ('Max Error', max_error),
      ('Mean Absolute Error', mean_absolute_error),
  )
  metrics = {label: scorer(y_true, y_pred) for label, scorer in scorers}

  banners = (
      ('train', "METRICS ON TRAIN SET"),
      ('val', "METRICS ON VALIDATION SET"),
      ('test', "METRICS ON TEST SET"),
  )
  for split_name, banner in banners:
    if type == split_name:
      print(banner)
      break

  return metrics
    

In [None]:
def train_val_test_split(X, y, test_size, random_state, val_size=None):
  '''
  Split a dataset into train, validation and test samples.

  The test sample is split off first; the validation sample is then split
  off from what remains. Both splits use the same random_state.

  X, y -- features and target
  test_size -- passed as test_size to the first train_test_split call
  random_state -- seed used for both splits
  val_size -- passed as test_size to the second train_test_split call, so it
              is relative to the data left after removing the test sample.
              Defaults to test_size: with test_size=0.15 the validation
              sample is 15% of the remaining 85%, i.e. about 12.75% of all
              rows.

  Returns X_train, X_val, X_test, y_train, y_val, y_test.
  '''
  if val_size is None:
    val_size = test_size

  X_rest, X_test, y_rest, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  X_train, X_val, y_train, y_val = train_test_split(
      X_rest, y_rest, test_size=val_size, random_state=random_state)

  return X_train, X_val, X_test, y_train, y_val, y_test

# **Predicting grades**

There are a lot of categorical features in the data. The most popular method for encoding them is One-Hot Encoding (OHE). But OHE has its disadvantages so we suggest using CatBoost regressor for predicting students' grades. CatBoost is an algorithm for gradient boosting on decision trees that does not require pre-processing data. It also allows using GPU to train models.

# **Predicting grades for the 1st semester**

In [None]:
PALETTE = PALETTES[0]

# Distribution of first-period grades.
# The theme is set before drawing: it only affects axes created afterwards,
# so setting it after countplot() could not style this plot.
sns.set_theme(style="whitegrid")
b = sns.countplot(x=data0['G1'], palette=PALETTE)
b.set_xlabel('Grade', fontsize = 14)
b.set_ylabel('Number of students', fontsize = 14)
plt.show()

In [None]:
# G2 and G3 are removed so that only G1 remains as the target.
data1 = data0.drop(columns=['G2', 'G3'])

y1 = data1['G1']
X1 = data1.drop(columns=['G1'])

(X_train1, X_val1, X_test1,
 y_train1, y_val1, y_test1) = train_val_test_split(X1, y1, test_size=0.15, random_state=8)

In [None]:
# Number of rows in the train, validation and test samples.
len(X_train1), len(X_val1), len(X_test1)

In [None]:
# G1 model. Train on the GPU when CatBoost can see one; otherwise fall back to
# the CPU so that the notebook still runs end-to-end on machines without a GPU.
model1 = CatBoostRegressor(iterations=100,
                          learning_rate=0.1,
                          depth=3,
                          l2_leaf_reg=5,
                          cat_features=cat_features,
                          task_type='GPU' if get_gpu_device_count() > 0 else 'CPU',
                          random_state=8,
                          verbose=0)

# The trailing semicolon suppresses the cell output.
model1.fit(X_train1, y_train1);

In [None]:
# G1 model on the rows it was fitted on.
y_pred_train1 = model1.predict(X_train1)
calculate_metrics(y_train1, y_pred_train1, 'train')

In [None]:
# G1 model on the validation sample.
y_pred_val1 = model1.predict(X_val1)
calculate_metrics(y_val1, y_pred_val1, 'val')

In [None]:
# G1 model on the held-out test sample.
y_pred1 = model1.predict(X_test1)
calculate_metrics(y_test1, y_pred1, 'test')

# **Predicting grades for the 2nd semester (G2)**

In [None]:
# Keep only the rows whose second-period grade is above zero.
# NOTE: this rebinds data0, so the G1 cells above see the narrower frame if they are re-run afterwards.
data0 = data0.loc[data0['G2'] > 0]

In [None]:
PALETTE = PALETTES[0]

# Distribution of second-period grades.
# The theme is set before drawing: it only affects axes created afterwards,
# so setting it after countplot() could not style this plot.
sns.set_theme(style="whitegrid")
b = sns.countplot(x=data0['G2'], palette=PALETTE)
b.set_xlabel('Grade', fontsize = 14)
b.set_ylabel('Number of students', fontsize = 14)
plt.show()

In [None]:
# G3 is removed; G1 stays in the frame and is used as a feature for the G2 model.
data2 = data0.drop(columns=['G3'])

y2 = data2['G2']
X2 = data2.drop(columns=['G2'])

(X_train2, X_val2, X_test2,
 y_train2, y_val2, y_test2) = train_val_test_split(X2, y2, test_size=0.15, random_state=8)

In [None]:
# G2 model. Train on the GPU when CatBoost can see one; otherwise fall back to
# the CPU so that the notebook still runs end-to-end on machines without a GPU.
model2 = CatBoostRegressor(iterations=100,
                          learning_rate=0.1,
                          depth=3,
                          l2_leaf_reg=13,
                          cat_features=cat_features,
                          task_type='GPU' if get_gpu_device_count() > 0 else 'CPU',
                          random_state=8,
                          verbose=0)

# The trailing semicolon suppresses the cell output.
model2.fit(X_train2, y_train2);

In [None]:
# G2 model on the rows it was fitted on.
y_pred_train2 = model2.predict(X_train2)
calculate_metrics(y_train2, y_pred_train2, 'train')

In [None]:
# G2 model on the validation sample.
y_pred_val2 = model2.predict(X_val2)
calculate_metrics(y_val2, y_pred_val2, 'val')

In [None]:
# G2 model on the held-out test sample.
y_pred2 = model2.predict(X_test2)
calculate_metrics(y_test2, y_pred2, 'test')

# **Predicting final grades (G3)**

In [None]:
# Keep only the rows whose final grade is above zero.
# NOTE: this rebinds data0, so the cells above see the narrower frame if they are re-run afterwards.
data0 = data0.loc[data0['G3'] > 0]

In [None]:
PALETTE = PALETTES[0]

# Distribution of final grades.
# The theme is set before drawing: it only affects axes created afterwards,
# so setting it after countplot() could not style this plot.
sns.set_theme(style="whitegrid")
b = sns.countplot(x=data0['G3'], palette=PALETTE)
b.set_xlabel('Grade', fontsize = 14)
b.set_ylabel('Number of students', fontsize = 14)
plt.show()

In [None]:
# G3 is the target; every other column, including G1 and G2, is used as a feature.
y3 = data0['G3']
X3 = data0.drop(columns=['G3'])

(X_train3, X_val3, X_test3,
 y_train3, y_val3, y_test3) = train_val_test_split(X3, y3, test_size=0.15, random_state=8)

In [None]:
# G3 model. Train on the GPU when CatBoost can see one; otherwise fall back to
# the CPU so that the notebook still runs end-to-end on machines without a GPU.
model3 = CatBoostRegressor(iterations=100,
                          learning_rate=0.1,
                          depth=3,
                          l2_leaf_reg=13,
                          cat_features=cat_features,
                          task_type='GPU' if get_gpu_device_count() > 0 else 'CPU',
                          random_state=8,
                          verbose=0)

# The trailing semicolon suppresses the cell output.
model3.fit(X_train3, y_train3);

In [None]:
# G3 model on the rows it was fitted on.
y_pred_train3 = model3.predict(X_train3)
calculate_metrics(y_train3, y_pred_train3, 'train')

In [None]:
# G3 model on the validation sample.
y_pred_val3 = model3.predict(X_val3)
calculate_metrics(y_val3, y_pred_val3, 'val')

In [None]:
# G3 model on the held-out test sample.
y_pred3 = model3.predict(X_test3)
calculate_metrics(y_test3, y_pred3, 'test')

# **Conclusion**

Testing the developed model showed it is quite difficult to predict a student's grade in the first semester without having data on students' previous grades. However, as the model includes the scores from previous semesters, the quality of the model increases: the coefficient of determination was about 0.93 for the final score.

The developed model makes it possible to predict students' grades and to use this forecast for early identification of the "risk group" that needs additional help in mastering the educational program.