<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week4/MLE_linear_logistic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings

# Suppress warnings globally
warnings.filterwarnings("ignore", category=RuntimeWarning)

#### Linear regression: using closed-form solution

In [12]:
# Load Boston-housing dataset
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

In [13]:
# Add intercept term to X
X = np.hstack([np.ones((X.shape[0], 1)), X])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Closed-form solution: Normal Equation
def closed_form_solution(X, y):
    return np.linalg.inv(X.T @ X) @ X.T @ y

# Calculate beta
beta = closed_form_solution(X_train, y_train)
print("Closed-form solution coefficients:", beta)

# Predict on the test set
y_pred = X_test @ beta
mse = np.mean((y_pred - y_test)**2)
print("Closed-form solution MSE on test set:", mse)


Closed-form solution coefficients: [ 3.02467510e+01 -1.13055924e-01  3.01104641e-02  4.03807204e-02
  2.78443820e+00 -1.72026334e+01  4.43883520e+00 -6.29636221e-03
 -1.44786537e+00  2.62429736e-01 -1.06467863e-02 -9.15456240e-01
  1.23513347e-02 -5.08571424e-01]
Closed-form solution MSE on test set: 24.29111947497721


#### Linear regression using minimization library

In [14]:
from scipy.optimize import minimize

# Define the loss function (NLL equivalent for Linear Regression)
def linear_regression_loss(beta, X, y):
    residuals = y - X @ beta
    return np.sum(residuals**2)

# Initial guess for beta
beta_init = np.zeros(X_train.shape[1])

# Minimize the loss function
result = minimize(linear_regression_loss, beta_init, args=(X_train, y_train), method='BFGS')
beta_minimized = result.x

print("Minimized solution coefficients:", beta_minimized)

# Predict on the test set
y_pred_minimized = X_test @ beta_minimized
mse_minimized = np.mean((y_pred_minimized - y_test)**2)
print("Minimized solution MSE on test set:", mse_minimized)


Minimized solution coefficients: [ 3.02468647e+01 -1.13056177e-01  3.01108846e-02  4.03832801e-02
  2.78443277e+00 -1.72025688e+01  4.43883284e+00 -6.29650486e-03
 -1.44786643e+00  2.62435762e-01 -1.06472410e-02 -9.15454310e-01
  1.23511753e-02 -5.08571741e-01]
Minimized solution MSE on test set: 24.291070645990622


#### Logistic regression using minimization

In [15]:
from sklearn.datasets import load_iris

# Load Iris Dataset
iris = load_iris()
X = iris.data[:, 2:4]  # Use petal length and width
y = (iris.target == 0).astype(int)  # Binary classification: Setosa or not

# Add intercept term to X
X = np.hstack([np.ones((X.shape[0], 1)), X])

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Negative Log-Likelihood (NLL)
def logistic_regression_loss(beta, X, y):
    logits = X @ beta
    return -np.sum(y * np.log(sigmoid(logits)) + (1 - y) * np.log(1 - sigmoid(logits)))

# Initial guess for beta
beta_init = np.zeros(X_train.shape[1])

# Minimize the NLL
result = minimize(logistic_regression_loss, beta_init, args=(X_train, y_train), method='BFGS')
beta_minimized = result.x

print("Logistic Regression coefficients (beta):", beta_minimized)

# Predictions on the test set
logits_test = X_test @ beta_minimized
y_pred_prob = sigmoid(logits_test)
y_pred = (y_pred_prob >= 0.5).astype(int)

# Evaluate accuracy
accuracy = np.mean(y_pred == y_test)
print("Logistic Regression accuracy on test set:", accuracy)


Logistic Regression coefficients (beta): [-1916.29003898 -2845.97318767  -161.79150982]
Logistic Regression accuracy on test set: 1.0
