In [11]:
import pandas as pd
import numpy as np

In [10]:
def sigmoid(z):
    """Compute the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    The exponential is only ever evaluated at -|z|, which lies in (0, 1], so
    inputs of large magnitude cannot overflow (the direct formula overflows in
    exp(-z) for very negative z and emits a RuntimeWarning).

    Args:
        z: A real scalar or array-like of real numbers.

    Returns:
        A numpy float for scalar input, otherwise a numpy array with the same
        shape as z holding values in [0, 1].
    """
    z = np.asarray(z)

    # exp(-|z|) never exceeds 1, so this cannot overflow for any finite z.
    e = np.exp(-np.abs(z))

    # z >= 0: textbook form 1 / (1 + exp(-z)).
    # z <  0: algebraically equal form exp(z) / (1 + exp(z)).
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    # np.where yields a 0-d array for scalar input; unwrap so scalars stay scalars.
    return out[()] if out.ndim == 0 else out

def logistic_regression(X, y, num_iterations, learning_rate):
    """
    Train a logistic regression model with batch gradient descent.

    Weights and bias start at zero and are updated num_iterations times using
    the gradient of the mean cross-entropy loss. The loss value itself is not
    computed: it does not influence the updates and is not returned.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        y: The m target labels. Any shape holding exactly m values is accepted
            (e.g. (m,) or (m, 1)); it is reshaped to a column internally.
        num_iterations: An integer specifying the number of iterations to run gradient descent.
        learning_rate: A float specifying the learning rate for gradient descent.

    Returns:
        w: A numpy array of shape (n, 1) containing the learned weights.
        b: A float containing the learned bias term.

    Raises:
        ValueError: If y does not contain exactly one label per row of X.
    """
    m, n = X.shape

    # Force y into a column. With a flat (m,) vector, `a - y` below would
    # broadcast (m, 1) against (m,) into an (m, m) matrix and silently train
    # on garbage gradients.
    y = np.asarray(y).reshape(-1, 1)
    if y.shape[0] != m:
        raise ValueError(
            "y must contain one label per row of X: got %d labels for %d rows"
            % (y.shape[0], m)
        )

    # Initialize the parameters
    w = np.zeros((n, 1))
    b = 0.0

    # Run gradient descent
    for _ in range(num_iterations):
        # Forward propagation: predicted probabilities, shape (m, 1)
        a = sigmoid(np.dot(X, w) + b)

        # Backward propagation: gradient of the mean cross-entropy loss
        dz = a - y
        dw = (1 / m) * np.dot(X.T, dz)
        db = (1 / m) * np.sum(dz)

        # Update the parameters
        w = w - learning_rate * dw
        b = b - learning_rate * db

    return w, b


def predict(X, y, w, b):
    """
    Score a trained logistic regression model: classify X and return the accuracy against y.

    Note that, despite the name, this function returns the accuracy and not
    the predicted labels.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        y: The m true labels (0 or 1). Any shape holding m values is accepted,
            e.g. (m,) or (m, 1).
        w: A numpy array of shape (n, 1) containing the learned weights.
        b: A float containing the learned bias term.

    Returns:
        The fraction of samples whose predicted label (probability thresholded
        at 0.5) equals the true label.
    """
    # Compute the linear combination of the input features and the learned weights
    z = np.dot(X, w) + b

    # Turn the scores into probabilities and threshold them at 0.5
    y_pred = (sigmoid(z) >= 0.5).astype(int)

    # Compare flattened vectors: comparing an (m, 1) column against an (m,)
    # vector would broadcast to an (m, m) matrix and report a wrong accuracy.
    matches = np.ravel(y_pred) == np.ravel(y)

    return np.mean(matches)


def confusion_matrix(y_true, y_pred):
    """Count the four outcomes of a binary (0/1) classification.

    Both inputs are converted to flat numpy arrays first, so Python lists work
    and a column vector can be compared with a flat vector without the
    comparison broadcasting into a matrix.

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), one per true label.

    Returns:
        A tuple (tp, tn, fp, fn) of counts: true positives, true negatives,
        false positives and false negatives.

    Raises:
        ValueError: If y_true and y_pred hold a different number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of labels: got %d and %d"
            % (y_true.size, y_pred.size)
        )

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0

    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)
    return tp, tn, fp, fn

def accuracy(tp, tn, fp, fn):
    """Return the fraction of all outcomes that were classified correctly.

    Args:
        tp, tn, fp, fn: Counts of true positives, true negatives, false
            positives and false negatives. They must not all be zero.
    """
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

def precision(tp, tn, fp, fn):
    """Return the fraction of predicted positives that are truly positive.

    Args:
        tp: Count of true positives.
        tn: Count of true negatives (unused; accepted so all metric functions
            share the same call signature).
        fp: Count of false positives.
        fn: Count of false negatives (unused, see tn).

    Returns:
        tp / (tp + fp), or 0.0 when nothing was predicted positive. That case
        would otherwise divide by zero.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def recall(tp, tn, fp, fn):
    """Return the fraction of actual positives that were predicted positive.

    Args:
        tp: Count of true positives.
        tn: Count of true negatives (unused; accepted so all metric functions
            share the same call signature).
        fp: Count of false positives (unused, see tn).
        fn: Count of false negatives.

    Returns:
        tp / (tp + fn), or 0.0 when there are no actual positives. That case
        would otherwise divide by zero.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def f1_score(p, r):
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        p: Precision.
        r: Recall.

    Returns:
        2 * p * r / (p + r), or 0.0 when p + r is zero. That case would
        otherwise divide by zero.
    """
    denominator = p + r
    if denominator == 0:
        return 0.0
    return 2 * (p * r) / denominator




3A. Design a logistic regression model to predict whether a student reported flu-like
symptoms in the past year, i.e., Flu (y), using the following 12 variables in the dataset as
input variables:
Vaccin, HndWshQual, HndWshFreq, SociDist, NoFaceContact, RespEttiqu, PersnDist,
HandSanit, Risk, Inefficacy, KnowlTrans, KnowlMgmt

In [5]:
# Read the Flu dataset and drop every row that has a missing value
df = pd.read_csv('flu_data.csv').dropna()

# The 12 input variables requested in 3A
feature_columns = ['Vaccin','HndWshQual','HndWshFreq','SociDist','NoFaceContact','RespEttiqu','PersnDist','HandSanit','Risk','Inefficacy','KnowlTrans','KnowlMgmt']
X = df[feature_columns].to_numpy()
y = df[['Flu']].to_numpy()

# Fit the logistic regression model to the data
w, b = logistic_regression(X, y, num_iterations=3000, learning_rate=0.1)

# Accuracy on the training data. It is stored under its own name because
# assigning it to `accuracy` would overwrite the accuracy(tp, tn, fp, fn)
# function defined earlier in the notebook.
train_accuracy = predict(X, y, w, b)
print(train_accuracy)

3B. Design a logistic regression model to predict whether a student reported flu-like
symptoms in the past year, i.e., Flu (y), using forward selection to choose the most significant
variables in the dataset as input variables. Which subset of features gave you the best
performance? What are your thoughts on these features being selected?

In [31]:
def forward_selection(X, y, num_iterations, learning_rate, verbose=True):
    """
    Rank the features of X by greedy forward selection for logistic regression.

    In each round, every not-yet-selected feature is tried together with the
    features selected so far; the candidate giving the highest accuracy is
    added. The loop runs until every feature has been added, so the result is
    an ordering of all columns rather than a reduced subset. To choose a
    subset, compare the score reported after each round.

    Args:
        X: A numpy array of shape (m, n) containing the input features.
        y: The m target labels. Any shape holding m values is accepted
            (e.g. (m,) or (m, 1)); it is reshaped to a column internally.
        num_iterations: An integer specifying the number of iterations to run gradient descent.
        learning_rate: A float specifying the learning rate for gradient descent.
        verbose: If True (the default), print the selected column indices and
            the best accuracy after every round.

    Returns:
        selected_features: A list of all n column indices of X, in the order in which they were selected.
    """
    n = X.shape[1]

    # Force y into a column. With a flat (m,) vector, `predictions == y` below
    # would broadcast to an (m, m) matrix and produce a meaningless accuracy.
    y = np.asarray(y).reshape(-1, 1)

    # Initialize the list of selected features and remaining features
    selected_features = []
    remaining_features = list(range(n))

    # Run forward selection: one feature is added per round
    for _ in range(n):
        best_feature = None
        best_score = float('-inf')

        # Evaluate each remaining feature
        for j in remaining_features:
            # Train a logistic regression model with the selected features and the j-th feature
            features = selected_features + [j]
            X_train = X[:, features]
            w, b = logistic_regression(X_train, y, num_iterations, learning_rate)

            # NOTE: the score is the accuracy on the training data itself, not on
            # a holdout set or via cross-validation.
            predictions = sigmoid(np.dot(X_train, w) + b) >= 0.5
            score = np.mean(predictions == y)

            # Strict '>' means that on a tie the candidate tried first is kept
            if score > best_score:
                best_feature = j
                best_score = score

        # Add the best feature to the selected features and remove it from the remaining features
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        if verbose:
            print(selected_features)
            print(best_score)
    return selected_features


In [52]:
# Load the Flu dataset again, keeping only the rows without missing values
df = pd.read_csv('flu_data.csv').dropna()

# Candidate input variables. forward_selection reports features as positions
# in this list (0 = 'Risk', 1 = 'HandSanit', ...).
candidate_columns = ['Risk','HandSanit','HndWshFreq','SociDist','KnowlTrans','Vaccin','RespEttiqu','Inefficacy']
X = df[candidate_columns].to_numpy()
y = df[['Flu']].to_numpy()

# Last expression of the cell, so the returned selection order is displayed
forward_selection(X, y, 3000, 0.1)

[0]
0.8148148148148148
[0, 1]
0.8233618233618234
[0, 1, 2]
0.8262108262108262
[0, 1, 2, 3]
0.8262108262108262
[0, 1, 2, 3, 4]
0.8262108262108262
[0, 1, 2, 3, 4, 5]
0.8262108262108262
[0, 1, 2, 3, 4, 5, 6]
0.8290598290598291
[0, 1, 2, 3, 4, 5, 6, 7]
0.8319088319088319


[0, 1, 2, 3, 4, 5, 6, 7]