# Machine Learning Test: Question 2

## 1. Load dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Question 2 dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data_Q2.csv")
df.head(n=5)

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_42,Feature_43,Feature_44,Feature_45,Feature_46,Feature_47,Feature_48,Feature_49,Feature_50,Target
0,0.222105,0.199895,-0.166056,-0.149451,0.753769,0.678392,-0.701898,-0.631708,-2.081954,-1.873759,...,-0.707835,0.699871,0.629884,-0.112832,-0.101549,0.105162,0.094646,-0.057244,-0.300341,131.033839
1,0.830571,0.747514,-0.155013,-0.139512,-0.825304,-0.742773,-0.068017,-0.061216,-0.05023,-0.045207,...,2.006139,-1.141367,-1.02723,-0.514554,-0.463098,0.740684,0.666616,-0.720188,-0.399939,2.080009
2,-0.963353,-0.867018,-1.11314,-1.001826,0.552067,0.49686,-0.85609,-0.770481,0.386812,0.348131,...,0.573222,-0.466127,-0.419514,-0.82158,-0.739422,-1.554154,-1.398739,0.856455,-0.13711,154.275526
3,2.149843,1.934859,-0.43722,-0.393498,1.178275,1.060447,0.626042,0.563437,-0.596114,-0.536502,...,-0.138886,0.319233,0.28731,0.504989,0.45449,-0.584365,-0.525928,-0.756766,-1.260378,-7.755839
4,-0.537291,-0.483562,-0.491454,-0.442308,-0.987731,-0.888958,0.315882,0.284294,-0.791415,-0.712274,...,-1.644922,-1.414067,-1.27266,1.412016,1.270814,1.34936,1.214424,-0.91561,0.183969,-80.671954


## 2. Data Preprocessing

### 2.1 Split features and target

In [2]:
# Separate the regression target from the predictor columns.
y = df['Target']
X = df.drop(columns='Target')
(X.shape, y.shape)

((1000, 50), (1000,))

### 2.2 Split training and test set

In [3]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 50), (200, 50), (800,), (200,))

### 2.3 Feature scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 3. Baseline (Unregularized) Model Training and Testing

### 3.1 Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Unregularized linear-regression baseline, fitted on the scaled training split.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Test-set metrics; kept in variables so they can be compared in section 6.
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(mse_lr)
print(r2_lr)

24040.366305478885
0.0801456586429572


## 4. Regularization

### 4.1 Hyperparameter Tuning: Ridge Regression

In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths, spaced one order of magnitude apart.
param_grid_Ridge = [{'alpha': [0.01, 0.1, 1.0, 10.0]}]

# 5-fold cross-validation on the training split, scored by negated MSE
# (GridSearchCV keeps the candidate with the highest score).
grid_search_Ridge = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid_Ridge,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_Ridge.fit(X_train, y_train)
grid_search_Ridge.best_params_

{'alpha': 10.0}

### 4.2 Hyperparameter Tuning: Lasso Regression

In [7]:
from sklearn.linear_model import Lasso

# Candidate L1 penalty strengths, spaced one order of magnitude apart.
param_grid_Lasso = [{'alpha': [0.01, 0.1, 1.0, 10.0]}]

# 5-fold cross-validation on the training split, scored by negated MSE.
grid_search_Lasso = GridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid_Lasso,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_Lasso.fit(X_train, y_train)
grid_search_Lasso.best_params_

{'alpha': 10.0}

### 4.3 Hyperparameter Tuning: ElasticNet

In [8]:
from sklearn.linear_model import ElasticNet

# Search jointly over the overall penalty strength (alpha) and the L1/L2 mix (l1_ratio).
param_grid_EN = [
    {
        'alpha': [0.01, 0.1, 1.0, 10.0],
        'l1_ratio': [0.1, 0.5, 0.9],
    }
]

# 5-fold cross-validation on the training split, scored by negated MSE.
grid_search_EN = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=param_grid_EN,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_EN.fit(X_train, y_train)
grid_search_EN.best_params_

{'alpha': 1.0, 'l1_ratio': 0.9}

## 5. Regularized Model Training and Testing

### 5.1 Ridge Regression

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Build the model from the hyperparameters selected by the Ridge grid search
# (section 4.1) instead of re-typing them, so this cell cannot drift out of
# sync with the tuning result if the grid or the data changes.
ridge = Ridge(**grid_search_Ridge.best_params_)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

# Test-set metrics; kept in variables so they can be compared in section 6.
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(mse_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(r2_ridge)

16875.235645213736
0.35430439900845934


### 5.2 Lasso Regression

In [10]:
from sklearn.metrics import mean_squared_error, r2_score

# Build the model from the hyperparameters selected by the Lasso grid search
# (section 4.2) instead of re-typing them, so this cell cannot drift out of
# sync with the tuning result if the grid or the data changes.
lasso = Lasso(**grid_search_Lasso.best_params_)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

# Test-set metrics; kept in variables so they can be compared in section 6.
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(r2_lasso)

16009.803339177077
0.3874183563310244


### 5.3 ElasticNet

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

# Build the model from the (alpha, l1_ratio) pair selected by the ElasticNet
# grid search (section 4.3) instead of re-typing it, so this cell cannot drift
# out of sync with the tuning result if the grid or the data changes.
EN = ElasticNet(**grid_search_EN.best_params_)
EN.fit(X_train, y_train)
y_pred_EN = EN.predict(X_test)

# Test-set metrics; kept in variables so they can be compared in section 6.
mse_EN = mean_squared_error(y_test, y_pred_EN)
print(mse_EN)
r2_EN = r2_score(y_test, y_pred_EN)
print(r2_EN)

16651.543160428955
0.3628635241334093


## 6. Result Analysis

In [12]:
# Side-by-side test-set MSE of the four models (lower is better).
for label, value in (
    ('MSE for Linear Regression:', mse_lr),
    ('MSE for Ridge:', mse_ridge),
    ('MSE for Lasso:', mse_lasso),
    ('MSE for Elastic Net:', mse_EN),
):
    print(label, value)

MSE for Linear Regression: 24040.366305478885
MSE for Ridge: 16875.235645213736
MSE for Lasso: 16009.803339177077
MSE for Elastic Net: 16651.543160428955


Here we can see that Lasso achieves the lowest MSE on the test set (about 16,010), followed by ElasticNet (about 16,652), Ridge (about 16,875), and unregularized Linear Regression (about 24,040). This means the Lasso model's predictions are, on average, closer to the actual values than those of the other models.

In [13]:
# Side-by-side test-set R2 of the four models (higher is better).
for label, value in (
    ('R2 for Linear Regression:', r2_lr),
    ('R2 for Ridge:', r2_ridge),
    ('R2 for Lasso:', r2_lasso),
    ('R2 for Elastic Net:', r2_EN),
):
    print(label, value)

R2 for Linear Regression: 0.0801456586429572
R2 for Ridge: 0.35430439900845934
R2 for Lasso: 0.3874183563310244
R2 for Elastic Net: 0.3628635241334093


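Here we can see the same ranking for R². Lasso fits the test data best, explaining the largest portion of the variability in the target (R² ≈ 0.39), compared with only about 0.08 for unregularized Linear Regression.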