In [4]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt


## Load MNIST dataset:

In [5]:
# Load the MNIST dataset using fetch_openml
mnist = fetch_openml('mnist_784', version=1)

# Extract the features (pixel values) and labels from the dataset
X = mnist.data.values.astype('float32')
y = mnist.target.values.astype('int64')

# Print the shape of the data arrays
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

  warn(


X shape: (70000, 784)
y shape: (70000,)


In [6]:
# Subset data to use only class 0 and class 1
X = X[(y == 0) | (y == 1)]
y = y[(y == 0) | (y == 1)]

## Standardize Dataset:

In [7]:
# Define a small epsilon value to add to the standard deviation to avoid division by zero
eps = 1e-8

# Calculate the standard deviation of each feature and replace any zero values with eps
std_dev = np.std(X, axis=0)
std_dev[std_dev == 0] = eps

# Normalize the data by subtracting the mean and dividing by the standard deviation
X = (X - np.mean(X, axis=0)) / std_dev

## Split Dataset:

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(X , y ,test_size=0.2, random_state=42)

## Implement Logistic Regression with different optimizers and L1 values

In [9]:
class LogisticRegression:
    def __init__(self, lr=0.01, n_iters=1000 , l1=0 , batch_size = None, optimizer='sgd'):

      self.lr = lr
      self.n_iters = n_iters
      self.weights = None
      self.bias = None
      self.l1 = l1
      self.batch_size = batch_size
      self.optimizer = optimizer
      self.m_w = None  # for RMSProp and Adam optimizers
      self.v_w = None  # for Adam optimizer
      

    
    def fit(self, X, y):

      n_samples, n_features = X.shape
      
      # Initialize the weights and bias to zeros
      self.weights = np.zeros(n_features)
      self.bias = 0


      if self.optimizer == 'RMSprop':
        self.m_w = np.zeros(n_features)
      elif self.optimizer == 'Adam':
        self.m_w = np.zeros(n_features)
        self.v_w = np.zeros(n_features)
      
      if self.batch_size is None:
        self.batch_size = 1  # set batch_size to 1 for stochastic gradient descent

      n_batches = n_samples // self.batch_size

      
      # Gradient descent
      for i in range(self.n_iters):
        
        # Randomly sample a batch of examples
        indices = np.random.permutation(n_samples)
        X = X[indices]
        y = y[indices]

        for j in range(n_batches):
            start = j * self.batch_size
            end = start + self.batch_size
            x_batch = X[start:end]
            y_batch = y[start:end]

            # Calculate the linear model using dot product of weights and features, and add the bias
            linear_model = np.dot(x_batch , self.weights) + self.bias
            
            # Apply sigmoid function to get the predicted probabilities
            y_predicted = self._sigmoid(linear_model)
            
            # Calculate the gradient of the cost function using the predicted probabilities and true labels
            dw = (1/self.batch_size) * np.dot(x_batch.T, (y_predicted - y_batch))
            db = (1/self.batch_size) * np.sum(y_predicted - y_batch)
      
  
        # Add L1 regularization to the gradient
        if self.l1:
          dw += self.l1 * self.weights
        
          
        # Update the weights and bias using the selected optimizer
        if self.optimizer == 'sgd':

          self.weights -= self.lr * dw
          self.bias -= self.lr *db
        
        elif self.optimizer == 'RMSprop':
          
          # Update moving average of squared gradients
          self.m_w = 0.9 * self.m_w + 0.1 * (dw ** 2)
          
          # Update the weights and bias using RMSProp optimizer
          self.weights -= self.lr * dw / (np.sqrt(self.m_w) + 1e-8)
          self.bias -= self.lr * db

        elif self.optimizer == 'Adam':
          self.m_w = 0.9 * self.m_w + 0.1 * dw
          self.v_w = 0.999 * self.v_w + 0.001 * (dw ** 2)

          # Compute bias-corrected first and second moment estimates
          m_w_corrected = self.m_w / (1 - 0.9 ** (i+1))
          v_w_corrected = self.v_w / (1 - 0.999 ** (i+1))
          
          # Update the weights and bias using Adam optimizer
          self.weights -= self.lr * m_w_corrected / (np.sqrt(v_w_corrected) + 1e-8)
          self.bias -= self.lr * db
    
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        return np.where(y_predicted > 0.5, 1, 0)
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))


In [10]:
# Define the hyperparameters
lr = 0.001
batch_list =[32 , 64 , 128 ]
lambda_list =[1 , 0.1 , 0.001]

### Use L1 regularization with gradient descent optimizer. Try 2 values for lambda

In [None]:
for lam in lambda_list:

  # Initialize the model
  model = LogisticRegression(lr = lr, l1 = lam)
  # Train the model
  model.fit(X_train, y_train)

  # Evaluate the model on the training and testing sets
  train_acc = np.mean(model.predict(X_train) == y_train)
  test_acc = np.mean(model.predict(X_test) == y_test)

  # Print the results
  print(f"Lambda: {lam}, train accuracy: {train_acc:.4f}, test accuracy: {test_acc:.4f}")



Lambda: 1, train accuracy: 0.9966, test accuracy: 0.9976
Lambda: 0.1, train accuracy: 0.9959, test accuracy: 0.9976
Lambda: 0.001, train accuracy: 0.9964, test accuracy: 0.9970


### Use mini-batch gradient descent optimizer. Try multiple batches (at least 2)

In [None]:
for batch_size in batch_list:
  
  # Initialize the model
  model1 = LogisticRegression(lr = lr, batch_size = batch_size)
  # Train the model
  model1.fit(X_train, y_train)

  # Evaluate the model on the training and testing sets
  train_acc = np.mean(model1.predict(X_train) == y_train)
  test_acc = np.mean(model1.predict(X_test) == y_test)

  print(f"Batch size: {batch_size}, train accuracy: {train_acc:.4f}, test accuracy: {test_acc:.4f}")



Batch size: 32, train accuracy: 0.9988, test accuracy: 0.9990
Batch size: 64, train accuracy: 0.9987, test accuracy: 0.9990
Batch size: 128, train accuracy: 0.9987, test accuracy: 0.9990


### Use RMS Prop optimizer and Adam optimizer

In [None]:
for optimizer in ['RMSprop', 'Adam']:

  # Initialize the model
  model = LogisticRegression(lr = lr, optimizer = optimizer)

  # Train the model
  model.fit(X_train, y_train)

  # Evaluate the model on the training and testing sets
  train_acc = np.mean(model.predict(X_train) == y_train)
  test_acc = np.mean(model.predict(X_test) == y_test)

  # Print the results
  print(f"Optimizer: {optimizer}, train accuracy: {train_acc:.4f}, test accuracy: {test_acc:.4f}")

Optimizer: RMSprop, train accuracy: 0.9936, test accuracy: 0.9932
Optimizer: Adam, train accuracy: 0.9951, test accuracy: 0.9970


### Use RMS Prop optimizer and Adam optimizer using mini batches

In [None]:
batch_size1 = 64

for optimizer in ['RMSprop', 'Adam']:

  # Initialize the model
  model = LogisticRegression(lr = lr, optimizer = optimizer ,batch_size = batch_size1)

  # Train the model
  model.fit(X_train, y_train)

  # Evaluate the model on the training and testing sets
  train_acc = np.mean(model.predict(X_train) == y_train)
  test_acc = np.mean(model.predict(X_test) == y_test)

  # Print the results
  print(f"Optimizer: {optimizer}, train accuracy: {train_acc:.4f}, test accuracy: {test_acc:.4f}")

Optimizer: RMSprop, train accuracy: 0.9970, test accuracy: 0.9963
Optimizer: Adam, train accuracy: 0.9989, test accuracy: 0.9980


### Train and test the model using different optimizers and regularization parameters

In [None]:
# Define the hyperparameters
lambda1 = 0.01
lambda2 = 0.1

for optimizer in ['sgd', 'RMSprop', 'Adam']:
    for lam in [lambda1, lambda2]:
        # Initialize the model
        model = LogisticRegression(lr = lr, optimizer = optimizer, l1 = lam, batch_size = batch_size1)

        # Train the model
        model.fit(X_train, y_train)

        # Evaluate the model on the training and testing sets
        train_acc = np.mean(model.predict(X_train) == y_train)
        test_acc = np.mean(model.predict(X_test) == y_test)

        # Print the results
        print(f"Optimizer: {optimizer}, lambda: {lam}, train accuracy: {train_acc:.4f}, test accuracy: {test_acc:.4f}")

Optimizer: sgd, lambda: 0.01, train accuracy: 0.9967, test accuracy: 0.9976
Optimizer: sgd, lambda: 0.1, train accuracy: 0.9968, test accuracy: 0.9976
Optimizer: RMSprop, lambda: 0.01, train accuracy: 0.9989, test accuracy: 0.9997
Optimizer: RMSprop, lambda: 0.1, train accuracy: 0.9975, test accuracy: 0.9990
Optimizer: Adam, lambda: 0.01, train accuracy: 0.9990, test accuracy: 0.9997
Optimizer: Adam, lambda: 0.1, train accuracy: 0.9983, test accuracy: 0.9993


###  **Here's a summary of the accuracies for each case and a brief explanation:**

**L1 regularization with gradient descent optimizer:**

1. Lambda: 1, train accuracy: 0.9966, test accuracy: 0.9976

1. Lambda: 0.1, train accuracy: 0.9959, test accuracy: 0.9976

2. Lambda: 0.001, train accuracy: 0.9964, test accuracy: 0.9970



**Conclusion:** In L1 regularization, the hyperparameter lambda controls the strength of regularization. A higher lambda leads to stronger regularization, which can prevent overfitting but can also lead to underfitting. From the results above, we can see that a lower lambda value of 0.1 leads to a slightly better test accuracy than a lambda value of 1.



**Mini-batch gradient descent:**

1.  Batch size: 32, train accuracy: 0.9988, test accuracy: 0.9990

1.  Batch size: 64, train accuracy: 0.9987, test accuracy: 0.9990

2.  Batch size: 128, train accuracy: 0.9987, test accuracy: 0.9990








**Conclusion:** In mini-batch gradient descent, the batch size controls the number of examples used in each iteration of gradient descent. A larger batch size can provide a more accurate estimate of the gradient but can also increase the computational cost. From the results above, we can see that the batch size does not have a significant effect on the accuracy, with all batch sizes leading to high test accuracies.



**RMS Prop optimizer:**


1.  train accuracy: 0.9970, test accuracy: 0.9963

**Conclusion:** In RMS Prop optimizer, the learning rate is automatically adjusted based on the gradient magnitudes. This can lead to faster convergence and better performance, especially when the gradient magnitudes are highly variable. From the results above, we can see that RMS Prop performs worse than other optimizers in terms of accuracy.

**Adam optimizer:**

1.  train accuracy: 0.9989, test accuracy: 0.9980

**Conclusion:** In Adam optimizer, both the learning rate and the momentum are automatically adjusted based on the gradient magnitudes. This can provide even faster convergence and better performance than RMS Prop. From the results above, we can see that Adam performs better than other optimizers in terms of accuracy.