In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from sklearn import svm
import matplotlib.pyplot as plt
from pathlib import Path

Load the data

In [3]:
df = pd.read_csv('data/case1Data.csv')

#Missing Data Nan values, replace with mean or median
df = df.fillna(df.mean())

#Split data into X and Y
Y = df['y']
X = df.drop(columns=['y'])

#Convert to NumPy arrays
X = X.to_numpy()      
Y = Y.to_numpy()      



print("X shape:", X.shape)
print("y shape:", Y.shape)

X shape: (100, 100)
y shape: (100,)


In [4]:
def centerData(data):
    
    mu = np.mean(data,axis=0)
    data = data - mu
    
    return data, mu

def normalize(X):
    '''
    Function for normalizing the columns (variables) of a data matrix to unit length.
    Returns the normalized data and the L2 norm of the variables 
    
    Input  (X) --------> The data matrix to be normalized 
    Output (X_pre)-----> The normalized data matrix 
    Output (d) --------> Array with the L2 norms of the variables 
    '''
    d = np.linalg.norm(X,axis=0,ord=2)  # d is the euclidian lenghts of the variables 
    d[d==0]=1                           # Avoid dividing by zero if column L2 norm is zero 
    X_pre = X / d                       # Normalize the data with the euclidian lengths
    return X_pre,d                      # Return normalized data and the euclidian lengths



Ridge regression

In [5]:
#Center data
X, mu = centerData(X)
#Normalize data
X, d = normalize(X)
#Center Y
Y, mu_y = centerData(Y)

# X, Y are numeric and you want to do regression
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Create a Ridge model
model = Ridge(alpha=1.0)

# Train model
model.fit(X_train, Y_train)

# Evaluate model (R^2 by default for .score with regressors)
score = model.score(X_test, Y_test)
print('Ridge Regression R^2 Score:', score)

error = np.mean((model.predict(X_test) - Y_test) ** 2)
print('Mean Squared Error:', error)


Ridge Regression R^2 Score: 0.5327041988555337
Mean Squared Error: 2637.4713007816817


In [6]:
#OLS Regression
model = LinearRegression()

# Train model
model.fit(X_train, Y_train)

# Evaluate model (R^2 by default for .score with regressors)
score = model.score(X_test, Y_test)
print('OLS Regression R^2 Score:', score)

error = np.mean((model.predict(X_test) - Y_test) ** 2)
print('Mean Squared Error:', error)


OLS Regression R^2 Score: 0.6336173161330498
Mean Squared Error: 2067.9060488791038


In [7]:
# Lasso Regression
model = Lasso(alpha=1.0)

# Train model
model.fit(X_train, Y_train)

# Evaluate model (R^2 by default for .score with regressors)
score = model.score(X_test, Y_test)
print('Lasso Regression R^2 Score:', score)

error = np.mean((model.predict(X_test) - Y_test) ** 2)
print('Mean Squared Error:', error)

Lasso Regression R^2 Score: 0.5849303950312543
Mean Squared Error: 2342.700636835843
