In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

sns.set_theme()
%matplotlib inline

# Dataset

This dataset covers student achievement in secondary education at two Portuguese schools. The data attributes include student grades and demographic, social and school-related features, and were collected using school reports and questionnaires. Two datasets are provided regarding the performance in two distinct subjects: Mathematics (mat) and Portuguese language (por). In [Cortez and Silva, 2008], the two datasets were modeled under binary/five-level classification and regression tasks.

### Attribute Information:

- sex - student's sex (binary: 'F' - female or 'M' - male)
- age - student's age (numeric: from 15 to 22)
- famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
- Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
- Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
- Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
- Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
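- traveltime - home to school travel time (numeric: 1 - less than 15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - more than 1 hour)
- studytime - weekly study time (numeric: 1 - less than 2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - more than 10 hours)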
- failures - number of past class failures (numeric: n if 1<=n<3, else 4)
- schoolsup - extra educational support (binary: yes or no)
- famsup - family educational support (binary: yes or no)
- paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
- activities - extra-curricular activities (binary: yes or no)
- internet - Internet access at home (binary: yes or no)
- romantic - with a romantic relationship (binary: yes or no)
- goout - going out with friends (numeric: from 1 - very low to 5 - very high)

In [None]:
# Load the Mathematics dataset (student-mat.csv); the path is relative to the
# notebook's working directory.
df = pd.read_csv("../input/student-grade-prediction/student-mat.csv")

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# List every column name available before encoding and feature selection.
df.columns

# Categorical Encoding

Linear regression requires the attribute values to be numerical. Therefore, columns with categorical data need to be encoded to a suitable numeric format. Attributes with 2 categories are encoded using `binary encoding`, which converts the values to either `1` or `0`. Attributes with more than 2 categories are encoded using `one-hot encoding`.

In [None]:
# Two-valued categorical columns; each one is converted to integer codes
# by `binary_encoder`.
binary = [
    "sex",
    "famsize",
    "Pstatus",
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "internet",
    "romantic",
]

In [None]:
# Columns with more than two categories; these are expanded into one
# indicator column per category with `pd.get_dummies`.
multiple = [
    "Medu",
    "Fedu",
    "Fjob",
    "Mjob",
    "reason",
]

In [None]:
def binary_encoder(dataset, col):
    """Replace a categorical column with its integer category codes, in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that owns the column; it is modified directly.
    col : str
        Name of the column to encode.

    Returns
    -------
    None
        The encoded values are written back to ``dataset[col]``.
    """
    # Category codes follow the order of the column's categories.
    category_codes = dataset[col].astype('category').cat.codes
    dataset[col] = category_codes.astype('int')

In [None]:
# One-hot encode the multi-category columns, using each source column's name
# as the prefix of its indicator columns.
# NOTE: this rebinds `df`; re-running the cell on its own fails because the
# source columns no longer exist after the first run.
df = pd.get_dummies(df, columns=multiple, prefix=multiple)

In [None]:
# Encode every two-valued column listed in `binary`; `binary_encoder`
# writes the integer codes straight back into `df`.
for col in binary:
    binary_encoder(df, col)

### Drop all unnecessary columns

In [None]:
# Keep a separate frame without the columns that are not used as features.
dataset = df.drop(
    columns=[
        "guardian",
        "nursery",
        "higher",
        "address",
        "school",
        "famrel",
        "freetime",
        "Dalc",
        "Walc",
        "health",
        "absences",
    ]
)

# Correlation between Attributes

The heatmap shows the correlation between different attributes. We can use it to find and select the attributes that are highly correlated with the target label, and to drop features that are highly correlated with other features.

In [None]:
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(15,12))

# `df` still contains raw string columns that were never encoded (for example
# "school" and "guardian"). pandas >= 2.0 raises on them inside `corr()`
# instead of silently skipping them, so correlate the numeric and boolean
# columns only.
ax = sns.heatmap(data=df.select_dtypes(include=["number", "bool"]).corr(), ax=ax, cmap="Blues")
ax.set_xlabel('Features',fontdict={"fontsize":16})
ax.set_ylabel('Features',fontdict={"fontsize":16})
ax.set_title('Correlation between different Features', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("heatmap.png", bbox_inches="tight")
plt.show()


From the above heatmap, columns `G1`, `G2` and `G3` are highly correlated to each other. The below plots show this correlation.

In [None]:
# Pairwise scatter plots of the three grade columns.
# No `hue` variable is used, so `palette` is not passed: seaborn would ignore
# it, and newer versions emit a warning for every panel.
pairplot = sns.pairplot(dataset[["G1", "G2", "G3"]])

plt.savefig("pairplot.png", bbox_inches="tight")
plt.show()

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,8))

# One panel per earlier grade, each plotted against the final grade G3.
# `palette` is omitted because no `hue` variable is set; seaborn would ignore
# it and newer versions warn about it.
for panel, grade in zip(ax, ("G1", "G2")):
    sns.lineplot(x=grade, y="G3", data=dataset, ax=panel)
    panel.set_xlabel(grade, fontdict={"fontsize":16})
    panel.set_ylabel('G3', fontdict={"fontsize":16})
    panel.set_title(f'G3 vs {grade}', loc="center", fontdict={"fontsize": 16, "fontweight":"bold"})

plt.savefig("lineplot.png", bbox_inches="tight")
plt.show()

# Prepare Data for Training

- Separate the target column from the features
- Scale the features for faster training
- Split data into training and test sets

In [None]:
# Feature column labels: every column of `dataset` except the target "G3".
x_cols = dataset.columns.drop("G3")

In [None]:
# Split the frame into the feature matrix and the target vector.
X = dataset.loc[:, x_cols]
y = dataset.loc[:, "G3"]

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features.
sc = StandardScaler(with_mean=True, with_std=True)
# Carry the original row index over so `X` stays row-aligned with `y`, which
# keeps the index of `dataset`.
X = pd.DataFrame(sc.fit_transform(X), columns=x_cols, index=X.index)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=69,
)

# Linear Regression

Linear regression is a technique where a straight line is used to model the relationship between input and output values. In more than two dimensions, this straight line may be thought of as a plane or hyperplane.

Predictions are made as a combination of the input values to predict the output value.

# Gradient Descent

Gradient Descent is the process of minimizing the cost function by following the gradients of the cost function. On every iteration, the derivative of the cost function is computed and minimized by changing the values of the parameters. This is done until the algorithm converges to the best-fit straight line.

### Notations

- $\theta$ - Weights Vector
- $h$ - Hypothesis
- $X$ - Feature Vector
- $m$ - Number of training examples
- $\alpha$ - Learning Rate

### Hypothesis Representation

1. Univariate Hypothesis

![alt](https://i.imgur.com/jp5OpXK.png)

2. Multivariate Hypothesis

![alt](https://i.imgur.com/j0FXWJV.png)

### Cost Function

The function used is the mean squared error function.

![alt](https://i.imgur.com/JUzmYVm.png)

### Gradient Descent Update

The updates to $\theta$ need to be made simultaneously

![alt](https://i.imgur.com/1PlWQoc.png)

![alt](https://i.imgur.com/1Q3IQdY.png)

In [None]:
# Vectorised Gradient Descent
def gradient_descent(X, y, m, theta, alpha, iterations):
    """Run batch gradient descent for a fixed number of iterations.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support ``np.dot`` and ``.T``.
    y : array-like
        Target values, one per row of ``X``.
    m : int
        Divisor used for the cost and the gradient (the caller passes the
        number of training rows).
    theta : numpy.ndarray
        Initial weight vector.
    alpha : float
        Learning rate.
    iterations : int
        Number of update steps to perform.

    Returns
    -------
    tuple
        ``(costs, theta)`` where ``costs`` is a list holding the cost computed
        before each update and ``theta`` is the weight vector after the last
        update.
    """
    cost_history = []
    for _ in range(iterations):
        # Prediction error under the current weights; reused for cost and gradient.
        residual = np.dot(X, theta) - y
        cost_history.append(np.sum(residual ** 2) / (2*m))
        gradient = np.dot(X.T, residual) / m
        theta = theta - alpha * gradient
    return cost_history, theta

In [None]:
def linear_regression(X, y, alpha, iterations):
    """Fit a linear model with gradient descent, starting from all-zero weights.

    A constant column named "x_0" is appended to ``X`` so that the last
    weight acts as the intercept.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Training features.
    y : array-like
        Training targets.
    alpha : float
        Learning rate forwarded to ``gradient_descent``.
    iterations : int
        Number of gradient descent steps.

    Returns
    -------
    tuple
        Whatever ``gradient_descent`` returns: the cost history and the
        fitted weights.
    """
    # Add theta_0 column
    bias = pd.Series(1, index=X.index, name="x_0")
    design = pd.concat([X, bias], axis=1)
    n_rows, n_weights = design.shape
    initial_theta = np.zeros(n_weights)
    return gradient_descent(design, y, n_rows, initial_theta, alpha, iterations)

In [None]:
def accuracy(X, y, theta):
    """Score fitted weights on a dataset as the R² coefficient times 100.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Features, without the constant column (it is appended here as "x_0").
    y : array-like
        True target values.
    theta : array-like
        Weight vector, with the intercept weight last.

    Returns
    -------
    float
        ``r2_score(y, predictions) * 100``.
    """
    bias = pd.Series(1, index=X.index, name="x_0")
    design = pd.concat([X, bias], axis=1)
    predictions = np.dot(design, theta)
    return r2_score(y, predictions) * 100

# Univariate Linear Regression

In univariate linear regression, only one feature is used to predict the target value. Below, gradient descent is run for 100 iterations.


In [None]:
# The single input feature is the element-wise sum of the scaled G1 and G2 columns.
J, theta = linear_regression(X_train["G1"] + X_train["G2"], y_train, alpha=0.3, iterations=100)
# `accuracy` reports the R^2 score on the test split, multiplied by 100.
score = accuracy(X_test["G1"] + X_test["G2"], y_test, theta)
print(f"Accuracy - {score}")

# Multivariate Linear Regression

In multivariate linear regression, more than one feature is used to predict the target value. Below, all features are used and the algorithm is run for 100 iterations. There is an improvement in accuracy after taking all the features.

In [None]:
# Fit on every feature column; `J` holds the cost recorded at each of the 100 iterations.
J, theta = linear_regression(X_train, y_train, alpha=0.3, iterations=100)
# `accuracy` reports the R^2 score on the test split, multiplied by 100.
score = accuracy(X_test, y_test, theta)
print(f"Accuracy - {score}")

### Cost vs Number of Iterations

Below is the plot of the cost against the number of iterations. After every iteration the cost decreases until it becomes constant, at which point gradient descent has converged on the best-fit straight line.

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,8))
# Only the first 20 iterations are plotted.
# `palette` is omitted because no `hue` variable is set; seaborn would ignore
# it and newer versions warn about it.
ax = sns.lineplot(x=np.arange(1, 21), y=J[:20], ax=ax)
ax.set_xlabel("Number of Iterations", fontdict={"fontsize":16})
ax.set_ylabel("Cost", fontdict={"fontsize":16})
ax.set_title("Cost vs Number of Iterations", fontdict={"fontsize":16})

plt.savefig("cost_vs_iter.png", bbox_inches="tight")

plt.show()

## Learning Rate

The learning rate controls how big of a step gradient descent takes in the direction of the minima.

- If $\alpha$ is too small, gradient descent takes small steps and a long time to converge to the minima
- If $\alpha$ is too large, gradient descent overshoots and starts diverging

The plot below shows the effect of different values of $\alpha$ on the gradient descent cost

In [None]:
# Run 20 iterations of gradient descent at each learning rate and keep the
# per-iteration cost history of every run for comparison.
cost_histories = []
for alpha in (0.01, 0.03, 0.1, 0.3):
    J_alpha, theta = linear_regression(X_train, y_train, alpha=alpha, iterations=20)
    score = accuracy(X_test, y_test, theta)
    cost_histories.append(J_alpha)

# Histories in the order the rates were tried: 0.01, 0.03, 0.1, 0.3.
J_1, J_2, J_3, J_4 = cost_histories

In [None]:
# One column of costs per learning-rate run, plus the iteration number as the
# leading column.
costs = pd.DataFrame({"J_1": J_1, "J_2": J_2, "J_3": J_3, "J_4": J_4})
costs.insert(0, "Iterations", np.arange(0,20))

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,8))
# Reshape to long form so each run becomes one hue level:
# columns are 'Iterations', 'variable' (run name) and 'value' (cost).
costs_long = costs.melt(id_vars=['Iterations'])
ax = sns.lineplot(data=costs_long, x='Iterations', y='value', hue='variable', ax=ax)
ax.set_xlabel("Number of Iterations", fontdict={"fontsize":16})
ax.set_ylabel("Cost", fontdict={"fontsize":16})
ax.set_title("Cost vs Number of Iterations", fontdict={"fontsize":16})

plt.savefig("alpha.png", bbox_inches="tight")
plt.show()

# Polynomial Linear Regression

In polynomial regression, the hypothesis function is a polynomial function of the features. This allows fitting more complex functions to the data.

Hypothesis Representation

$h = \theta _0 + \theta _1  X _1^2 + \theta _2  X _2^3$

In [None]:
# Build the polynomial features from a fresh copy of the raw feature columns.
# - Starting from `dataset` (not the `X` left by earlier cells) makes this cell
#   give the same result every time it is run; mutating `X` in place would
#   compound the powers on each re-run.
# - Raising the raw values, rather than the already-standardised ones, avoids
#   squaring mean-centred data, which would map below- and above-average
#   grades onto the same feature value.
X = dataset[x_cols].copy()
X["G1"] = X["G1"] ** 2
X["G2"] = X["G2"] ** 3

# Standardise after the polynomial transform, keeping the row index so `X`
# stays aligned with `y`.
sc = StandardScaler(with_mean=True, with_std=True)
X = pd.DataFrame(sc.fit_transform(X), columns=x_cols, index=X.index)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)
J, theta = linear_regression(X_train, y_train, alpha=0.3, iterations=100)
# `accuracy` reports the R^2 score on the test split, multiplied by 100.
score = accuracy(X_test, y_test, theta)
print(f"Accuracy - {score}")