In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This dataset was published by:
P. Cortez and A. Silva. Using Data Mining to Predict Secondary School Student Performance. In A. Brito and J. Teixeira Eds., Proceedings of 5th FUture BUsiness TEChnology Conference (FUBUTEC 2008) pp. 5-12, Porto, Portugal, April, 2008, EUROSIS, ISBN 978-9077381-39-7.

In [None]:
# Read student-por.csv from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/student-performance-data-set/student-por.csv")
# Preview the first rows.
data.head()

In [None]:
# Column labels of the loaded frame.
data.columns

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

# Attribute Information:


* 1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
* 2 sex - student's sex (binary: 'F' - female or 'M' - male)
* 3 age - student's age (numeric: from 15 to 22)
* 4 address - student's home address type (binary: 'U' - urban or 'R' - rural)
* 5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
* 6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
* 7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
* 8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
* 9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
* 10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
* 11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
* 12 guardian - student's guardian (nominal: 'mother', 'father' or 'other')
* 13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
* 14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
* 15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
* 16 schoolsup - extra educational support (binary: yes or no)
* 17 famsup - family educational support (binary: yes or no)
* 18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
* 19 activities - extra-curricular activities (binary: yes or no)
* 20 nursery - attended nursery school (binary: yes or no)
* 21 higher - wants to take higher education (binary: yes or no)
* 22 internet - Internet access at home (binary: yes or no)
* 23 romantic - with a romantic relationship (binary: yes or no)
* 24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
* 25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
* 26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
* 27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
* 28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
* 29 health - current health status (numeric: from 1 - very bad to 5 - very good)
* 30 absences - number of school absences (numeric: from 0 to 93)

 
* 31 G1 - first period grade (numeric: from 0 to 20)
* 32 G2 - second period grade (numeric: from 0 to 20)
* 33 G3 - final grade (numeric: from 0 to 20, output target)



In [None]:
# Per-column dtype and non-null count summary.
data.info()

There are no missing values, that makes the preparation of this dataset much easier. But still there are a lot of features that have to be properly encoded. Before I start to learn more about this dataset I'm going to put the test set aside.

In [None]:
# Number of rows per value of the 'school' column.
data["school"].value_counts()

We have a small dataset, so I think that purely random sampling is not such a good idea. I will use the 'school' feature to separate the dataset into two strata and perform stratified sampling.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A fixed seed makes the split reproducible: without it every kernel restart
# draws a different test set, so results cannot be compared between runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so select with .iloc. This is identical
# to .loc for the default RangeIndex, but stays correct for any other index.
train_index, test_index = next(split.split(data, data["school"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

As we can see below the proportions of students from both schools are almost identical in the full set and in the test set. That means that our sample is representative.

In [None]:
# Share of each school within the test split.
strat_test_set["school"].value_counts()/len(strat_test_set)

In [None]:
# Share of each school within the full dataset, for comparison with the test split.
data["school"].value_counts()/len(data)

Now I am going to work on the training set.

In [None]:
# Work on a copy so the stratified training split itself is left untouched.
train = strat_train_set.copy()

In [None]:
# Preview the first rows of the training copy.
train.head()

In [None]:
# Summary statistics of the training copy.
train.describe()

In [None]:
# Columns the attribute list describes as numeric (including the three grades).
numeric = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime','failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

# One histogram per numeric column.
train[numeric].hist(figsize=(20,17))

Most of the so-called numerical variables in this dataset are actually ordinal. The only truly numerical ones are 'age', 'absences', 'G1', 'G2' and 'G3'. If I'm going to use Gradient Descent to find the parameters, then I have to standardize the numerical values to make it converge faster.

In [None]:
# Pairwise correlation heatmap of the training data.
# `train` still contains string columns at this point. pandas >= 2.0 raises on
# them in .corr() instead of silently skipping them, so restrict the frame to
# numeric dtypes explicitly; this reproduces the older behaviour on every version.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(train.select_dtypes(include="number").corr(), annot=True, vmin=-1, ax=ax)

G1 and G2 are highly correlated with G3. Ideally we should be able to predict G3 without G1 and G2, so I guess I have to test combinations of features to find more information, so that I don't need G1 and G2 that much to predict G3.

At first I want to encode all non-numerical features and look if there is more correlation. 

In [None]:
# Every training column that is not listed in `numeric` is treated as categorical.
categorical = [name for name in train.columns if name not in numeric]

categorical

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Ordinal-encode the categorical columns; remainder='passthrough' keeps the other columns as they are.
cat = ColumnTransformer([('check', OrdinalEncoder(), categorical)],remainder='passthrough')

In [None]:
# Fit the encoder on the training frame and transform it in one step.
train_tr = cat.fit_transform(train)
# Re-wrap the result as a DataFrame. The labels assume the output holds the
# encoded categorical columns first, followed by the passthrough columns in the
# same order as `numeric` — TODO confirm against ColumnTransformer's output ordering.
train_tr = pd.DataFrame(train_tr, columns=(categorical + numeric))

In [None]:
# Correlation heatmap over all columns of the encoded training frame.
fig, ax = plt.subplots(figsize=(25, 25))  
sns.heatmap(train_tr.corr(), annot=True, vmin=-1, ax=ax)

So we can get additional information from our categorical variables. G3 correlates most with interest in higher education, school, sex, address, internet access and mother's job. I will prepare a full pipeline that encodes nominal variables and binary variables separately, with OneHotEncoder and OrdinalEncoder respectively. Otherwise we would introduce an artificial order into the nominal categories, which can mislead our model.

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Categorical features with more than two unordered values (to be one-hot encoded).
nominal = ['Mjob', 'Fjob', 'reason', 'guardian']
# The remaining categorical features are the two-valued ones.
# A comprehension keeps its loop variable local. A plain `for cat in ...` loop
# would rebind the notebook-level name `cat` (the ColumnTransformer) to a column
# name, which breaks the encoding cell when it is run again.
binary = [col for col in categorical if col not in nominal]
binary

In [None]:
# Columns to standardize; the target G3 is not among them.
to_scale = ['age', 'absences', 'G1', 'G2']

# Separate the features from the target column G3.
train_x = train.drop(['G3'], axis=1)
train_labels = train['G3'].copy()

The full pipeline includes:
* OrdinalEncoder to encode binary categorical variables
* OneHotEncoder to encode nominal categorical variables
* StandardScaler to standardize numerical variables, not encoded ordinal variables though

In [None]:
# One transformer per column group; columns not named in any group pass through unchanged.
full_pipeline = ColumnTransformer([("bin", OrdinalEncoder(), binary),
                                   ("cat", OneHotEncoder(), nominal),
                                  ("num", StandardScaler(), to_scale)], remainder='passthrough')

# Fit the pipeline on the training features and transform them in one step.
train_encoded = full_pipeline.fit_transform(train_x)

In [None]:
# Wrap the encoded matrix in a DataFrame to inspect per-column summary statistics.
train_enc = pd.DataFrame(train_encoded)
train_enc.describe()

I want to compare different approaches to find parameters that minimize the cost function. The first approach is normal equation. We calculate the best possible vector of parameters directly for the global minimum (only minimum) of the cost function.


In [None]:
X = train_encoded
m = X.shape[0]  # number of training rows

# Prepend a column of ones so the first parameter acts as the intercept.
X_b = np.c_[np.ones((m, 1)), X]
X_b

In [None]:
y = train_labels
# Least-squares parameters via the pseudoinverse of X_b, used instead of
# explicitly inverting X_b.T.dot(X_b).
best_theta = np.linalg.pinv(X_b).dot(y)
best_theta

Let's make some predictions and look how good such a simple model can be.

In [None]:
test = strat_test_set

test_x = train.drop(['G3'], axis=1)
test_labels = train['G3'].copy()

In [None]:
test_encoded = full_pipeline.transform(test_x)
test_encoded

In [None]:
X_bT = np.c_[np.ones((m, 1)), test_encoded]
X_bT

In [None]:
# Predictions of the pseudoinverse solution: each row of X_bT dotted with best_theta.
y_predict = X_bT.dot(best_theta)
y_predict

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the pseudoinverse predictions against test_labels.
mse = mean_squared_error(test_labels, y_predict)
rmse = np.sqrt(mse)
rmse

Next I've used the LinearRegression model from sklearn, which solves the least-squares problem using SVD.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit scikit-learn's LinearRegression on the encoded training matrix (no manual bias column).
lin = LinearRegression()
lin.fit(train_encoded, train_labels)

In [None]:
# Fitted intercept and per-column coefficients.
lin.intercept_, lin.coef_

In [None]:
# Predictions of the fitted sklearn model for the encoded evaluation matrix.
predictions = lin.predict(test_encoded)

In [None]:
# Root mean squared error of the sklearn model's predictions against test_labels.
mse_lin = mean_squared_error(test_labels, predictions)
rmse_lin = np.sqrt(mse_lin)
rmse_lin

The RMSE of both models is almost identical. Now I want to try out Batch Gradient Descent. Its result should be the same as the previous two.

In [None]:
alpha = 0.01 # learning rate
n_iterations = 20000  # number of gradient-descent updates
n = X_b.shape[1]  # number of parameters: columns of X_b, bias column included

In [None]:
# Training targets as a column vector. Built from `train_labels` rather than
# from `y` itself so the cell can be re-run, and with -1 so the row count is
# inferred instead of being hard-coded to one particular split size.
y = train_labels.to_numpy().reshape(-1, 1)

def gradient_descent(alpha, x, y, numIterations):
    """Fit linear-regression parameters with batch gradient descent.

    Parameters
    ----------
    alpha : float
        Learning rate (step size of each update).
    x : ndarray of shape (m, n)
        Design matrix; include a column of ones if an intercept is wanted.
    y : array-like of shape (m,) or (m, 1)
        Target values.
    numIterations : int
        Number of full-batch updates to perform.

    Returns
    -------
    ndarray of shape (n, 1)
        Parameter vector after the last update.
    """
    m = x.shape[0]  # number of samples
    # Take the parameter count from x itself rather than from a notebook global.
    n_params = x.shape[1]
    # Force a column vector: with a 1-D y, `hypothesis - y` would broadcast to
    # an (m, m) matrix and silently produce a wrong gradient.
    y = np.asarray(y).reshape(-1, 1)
    theta = np.random.randn(n_params, 1)  # random starting point
    x_transpose = x.transpose()
    for _ in range(numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # Gradient of the halved mean squared error with respect to theta.
        gradient = np.dot(x_transpose, loss) / m
        theta = theta - alpha * gradient  # update
    return theta

In [None]:
# Run batch gradient descent on the training design matrix (bias column included).
theta = gradient_descent(alpha, X_b, y, n_iterations)
theta

In [None]:
# Predictions of the gradient-descent parameters for the evaluation matrix.
y_predict = X_bT.dot(theta)

# Root mean squared error of those predictions against test_labels.
mse_gd = mean_squared_error(test_labels, y_predict)
rmse_gd = np.sqrt(mse_gd)
rmse_gd

In the case of Linear Regression all three approaches give us the same result, though the LinearRegression model from sklearn is a lot easier to use.

I hope this notebook can be interesting to anyone, I'm happy to hear how I could improve this project and get more interesting insights:)