In [15]:
import pandas as pd
import numpy as np

4A.i For the best performing model in Q 3 (Model from 3c), does regularization improve
the performance?

In [27]:
def sigmoid(z):
    """Compute the sigmoid function."""
    return 1 / (1 + np.exp(-z))

def logistic_regression_with_regularization(X, y, num_iterations, learning_rate,lambd):
    """
    Train a logistic regression model on the given dataset.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        y: A numpy array of shape (m, 1) containing the target labels.
        num_iterations: An integer specifying the number of iterations to run gradient descent.
        learning_rate: A float specifying the learning rate for gradient descent.

    Returns:
        w: A numpy array of shape (n, 1) containing the learned weights.
        b: A float containing the learned bias term.
    """
    # Initialize the parameters
    m, n = X.shape
    w = np.zeros((n, 1))
    b = 0

    # Run gradient descent
    for i in range(num_iterations):
        # Forward propagation
        z = np.dot(X, w) + b
        a = sigmoid(z)

        # Compute the cost function with regularization
        cost = (-1 / m) * np.sum(y * np.log(a) + (1 - y) * np.log(1 - a)) + (lambd / (2 * m)) * np.sum(w ** 2)

        # Backward propagation
        dz = a - y
        dw = (1 / m) * np.dot(X.T, dz)
        db = (1 / m) * np.sum(dz)

        # Update the parameters
        w = w - learning_rate * dw
        b =b - learning_rate * db

    return w, b


In [3]:
def predict(X,y,w, b):
    """
    Predict the target labels for a given set of input features using the learned weights and bias.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        w: A numpy array of shape (n, 1) containing the learned weights.
        b: A float containing the learned bias term.

    Returns:
        y_pred: A numpy array of shape (m, 1) containing the predicted target labels.
    """
    # Compute the linear combination of the input features and the learned weights
    z = np.dot(X, w) + b

    # Compute the sigmoid of z
    a = sigmoid(z)

    # Threshold the predicted values at 0.5
    y_pred = (a >= 0.5).astype(int)
    accuracy = np.mean(y_pred == y)

    return accuracy


In [20]:
# Read the Flu dataset
df = pd.read_csv('flu_data.csv')
df.dropna(inplace=True)
#X= df[['Vaccin','HndWshQual','HndWshFreq','SociDist','NoFaceContact','RespEttiqu','PersnDist','HandSanit','Risk','Inefficacy','KnowlTrans','KnowlMgmt']].to_numpy()
X= df[['Risk','HandSanit','HndWshFreq','SociDist','KnowlTrans','Vaccin','RespEttiqu','Inefficacy']].to_numpy()

y=df[['Flu']].to_numpy()
#fit logistic regression model to data
w,b= logistic_regression_with_regularization(X, y, num_iterations=3000, learning_rate=0.1,lambd=100)
accuracy=predict(X,y,w,b)
print(accuracy)


0.8319088319088319


4A. ii Does Feature Scaling improve the performance for the model in Q 4a?

In [9]:
def standardize(X):
    """
    Standardize the input features using standardization.

    Args:
        X: A numpy array of shape (m, n) containing the input features.

    Returns:
        X_std: A numpy array of shape (m, n) containing the standardized input features.
        mean: A numpy array of shape (1, n) containing the mean of each input feature.
        std: A numpy array of shape (1, n) containing the standard deviation of each input feature.
    """
    # Compute the mean and standard deviation of each feature
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)

    # Standardize the features
    X_std = (X - mean) / std

    return X_std, mean, std


In [26]:
# Read the Flu dataset
df = pd.read_csv('flu_data.csv')
df.dropna(inplace=True)
#X= df[['Vaccin','HndWshQual','HndWshFreq','SociDist','NoFaceContact','RespEttiqu','PersnDist','HandSanit','Risk','Inefficacy','KnowlTrans','KnowlMgmt']].to_numpy()
X= df[['Risk','HandSanit','HndWshFreq','SociDist','KnowlTrans','Vaccin','RespEttiqu','Inefficacy']].to_numpy()
X,mean,std = standardize(X)
y=df[['Flu']].to_numpy()
#fit logistic regression model to data
w,b= logistic_regression_with_regularization(X, y, num_iterations=3000, learning_rate=0.1,lambd=100)
accuracy=predict(X,y,w,b)
print(accuracy)


0.8233618233618234


4B.i Keeping the best regularized (or not) model after the experiments from 4a, design
a Logistic Regression model to predict whether a student reported flu-like
symptoms in the past year i.e., Flu(y) by changing the cost function to the
following.


In [13]:
def logistic_regression_with_new_cf(X, y, num_iterations, learning_rate):
    """
    Train a logistic regression model on the given dataset.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        y: A numpy array of shape (m, 1) containing the target labels.
        num_iterations: An integer specifying the number of iterations to run gradient descent.
        learning_rate: A float specifying the learning rate for gradient descent.

    Returns:
        w: A numpy array of shape (n, 1) containing the learned weights.
        b: A float containing the learned bias term.
    """
    # Initialize the parameters
    m, n = X.shape
    w = np.zeros((n, 1))
    b = 0

    # Run gradient descent
    for i in range(num_iterations):
        # Forward propagation
        z = np.dot(X, w) + b
        a = sigmoid(z)

        # Compute the cost function with regularization
        cost = (1 / (2 * m)) * np.sum((a - y) ** 2)

        # Backward propagation
        dz = a - y
        dw = (1 / m) * np.dot(X.T, dz)
        db = (1 / m) * np.sum(dz)

        # Update the parameters
        w = w - learning_rate * dw
        b =b - learning_rate * db

    return w, b

In [28]:
# Read the Flu dataset
df = pd.read_csv('flu_data.csv')
df.dropna(inplace=True)
#X= df[['Vaccin','HndWshQual','HndWshFreq','SociDist','NoFaceContact','RespEttiqu','PersnDist','HandSanit','Risk','Inefficacy','KnowlTrans','KnowlMgmt']].to_numpy()
X= df[['Risk','HandSanit','HndWshFreq','SociDist','KnowlTrans','Vaccin','RespEttiqu','Inefficacy']].to_numpy()
y=df[['Flu']].to_numpy()
#fit logistic regression model to data
w,b= logistic_regression_with_new_cf(X, y, num_iterations=3000, learning_rate=0.1)
accuracy=predict(X,y,w,b)
print(accuracy)


0.8319088319088319
