In [None]:
import numpy as np
from google.colab import drive
import pandas as pd
import time 
# Mount Google Drive at /content/drive; the dataset CSVs loaded later are read from under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


# Reading features and labels from CSV files

In [None]:
# Single place to change if the datasets live somewhere else; the resulting
# file paths are identical to the previously hard-coded ones.
DATA_DIR = "/content/drive/MyDrive/Datasets"

# Training split: feature matrix and its labels.
train_data = pd.read_csv(DATA_DIR + "/question-3-features-train.csv")
labels = pd.read_csv(DATA_DIR + "/question-3-labels-train.csv")
# Held-out test split: feature matrix and its labels.
test_data = pd.read_csv(DATA_DIR + "/question-3-features-test.csv")
test_labels = pd.read_csv(DATA_DIR + "/question-3-labels-test.csv")

In [None]:
# A cell only displays its LAST bare expression, so the train summary used to be
# built and silently discarded. Print both summaries explicitly instead.
print("train:", "# of data: {0}, # of features {1}".format(*train_data.shape))
print("test:", "# of data: {0}, # of features {1}".format(*test_data.shape))

'# of data: 179, # of features 3'

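Finding the rank of the Gram matrix $X^T X$ of the training features (a rank equal to the number of features means the feature columns are linearly independent)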

In [None]:
# Raw (un-normalized) training features as a plain 2-D array.
arr = train_data.to_numpy()
# Rank of the Gram matrix X^T X; shown as the cell output.
np.linalg.matrix_rank(arr.T @ arr)

3

# **Definition of Logistic Regression Module**

In [None]:
class logistic_regressor(object):
    """Binary logistic regression trained with gradient ascent on the log-likelihood.

    The features are min-max scaled and a leading column of ones (the bias
    term) is prepended, so ``thetas`` has shape ``(feature_num + 1, 1)`` with
    ``thetas[0]`` acting as the intercept.

    Attributes:
        feature_num: number of feature columns (without the bias column).
        features: the min-max scaled feature frame.
        labels: the labels exactly as passed in.
        feature_arr: ``(N, feature_num + 1)`` float design matrix, bias first.
        label_arr: labels as an ``(N, 1)`` array.
        thetas: current parameter column vector.
        alpha: learning rate used for every update step.
        epoch: number of passes over the training data.
    """

    def __init__(self, features, labels, alpha, num_epochs):
        """Scale the features, build the design matrix and zero the parameters.

        Args:
            features: DataFrame of shape ``(N, feature_num)``.
            labels: DataFrame or Series holding one 0/1 label per row.
            alpha: learning rate.
            num_epochs: number of passes over the data made by the trainers.
        """
        self.feature_num = features.shape[1]
        self.features = self.normalize(features)
        self.labels = labels
        # Stack positionally rather than joining frames: an index-aligned join
        # yields NaN rows whenever `features` does not carry a default
        # 0..N-1 index (e.g. after a split or a filter).
        scaled = self.features.to_numpy(dtype=float)
        bias = np.ones((scaled.shape[0], 1))
        self.feature_arr = np.hstack((bias, scaled))
        label_arr = labels.to_numpy()
        if label_arr.ndim == 1:
            # A Series gives shape (N,), which would broadcast against the
            # (N, 1) predictions into an (N, N) matrix during the update.
            label_arr = label_arr.reshape(-1, 1)
        self.label_arr = label_arr
        self.thetas = np.zeros((self.feature_num + 1, 1))
        self.alpha = alpha
        self.epoch = num_epochs

    def normalize(self, frame):
        """Min-max scale ``frame`` using the minimum and maximum of ``frame`` itself.

        Columns whose values are all equal are mapped to 0 instead of NaN.
        No instance state is read or written.
        """
        min_val = frame.min()
        max_val = frame.max()
        divisor = max_val - min_val
        # Guard against 0/0 on constant columns by dividing by 1 instead.
        if hasattr(divisor, "where"):
            divisor = divisor.where(divisor != 0, 1)
        elif divisor == 0:
            divisor = 1
        return (frame - min_val) / divisor

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return 1 / (1 + np.exp(-z))

    def fit(self):
        """Run ``epoch`` full-batch update steps and return ``thetas``.

        Training continues from the current ``thetas`` (zeros right after
        construction).
        """
        for _ in range(self.epoch):
            self.fit_batch(self.feature_arr, self.label_arr)
        return self.thetas

    def predict(self, data):
        """Classify one sample as 0 or 1 (1 when the linear score is > 0).

        Args:
            data: 1-D sample. If its length differs from ``feature_num`` the
                first element is taken to be the bias entry and dropped.

        Returns:
            A plain Python ``int``, 0 or 1.
        """
        data = np.asarray(data, dtype=float).ravel()
        if data.shape[0] != self.feature_num:
            data = data[1:]
        weights = np.ravel(self.thetas)
        # Work with scalars: int() on a 1-element array is deprecated in
        # recent NumPy releases.
        linear_score = weights[0] + np.dot(weights[1:], data[:self.feature_num])
        return int(linear_score > 0)

    def score(self, x, y):
        """Build the 2x2 confusion matrix of ``predict`` over the rows of ``x``.

        Layout: rows are the actual class (1 then 0), columns the predicted
        class (1 then 0), i.e. ``[[TP, FN], [FP, TN]]``. Any pair that is not
        one of the first three cases is counted in the ``[1, 1]`` cell.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        confusion_matrix = np.zeros((2, 2), dtype=int)
        for i in range(x.shape[0]):
            prediction = self.predict(x[i])
            truth = np.ravel(y[i])[0]
            if prediction == 1 and truth == 1:
                confusion_matrix[0, 0] += 1
            elif prediction == 1 and truth == 0:
                confusion_matrix[1, 0] += 1
            elif prediction == 0 and truth == 1:
                confusion_matrix[0, 1] += 1
            else:
                confusion_matrix[1, 1] += 1
        return confusion_matrix

    def training_accuracy(self):
        """Return the fraction of training samples on the confusion-matrix diagonal."""
        matrix = self.score(self.feature_arr, self.label_arr)
        return (matrix[0, 0] + matrix[1, 1]) / np.sum(matrix)

    def fit_batch(self, x, y):
        """Apply one update ``thetas += alpha * x.T @ (y - sigmoid(x @ thetas))``.

        Args:
            x: design-matrix rows of the batch (bias column included).
            y: matching labels as a column vector.
        """
        preds = self.sigmoid(np.matmul(x, self.thetas))
        residual = y - preds
        self.thetas += self.alpha * np.matmul(x.T, residual)

    def mini_batch_train(self, m):
        """Train with consecutive mini-batches of ``m`` rows for ``epoch`` passes.

        ``thetas`` is re-initialised with uniform values in [0, 1) drawn from
        a generator seeded with 31. Rows are visited in their stored order;
        the final batch of a pass is shorter when ``m`` does not divide the
        number of rows.

        Args:
            m: batch size, a positive integer (1 gives per-sample updates).

        Returns:
            The trained ``thetas``.

        Raises:
            ValueError: if ``m`` is smaller than 1 (previously an endless loop).
        """
        if m < 1:
            raise ValueError("batch size m must be a positive integer")
        data_num = self.feature_arr.shape[0]
        # A private legacy generator yields the same numbers as
        # np.random.seed(31) followed by np.random.rand(...), without
        # resetting the global NumPy random state for the rest of the notebook.
        rng = np.random.RandomState(31)
        self.thetas = rng.rand(*self.thetas.shape)
        for _ in range(self.epoch):
            for start in range(0, data_num, m):
                stop = start + m  # slicing clips the last, shorter batch
                self.fit_batch(self.feature_arr[start:stop, :], self.label_arr[start:stop])
        return self.thetas

# 3.1

In [None]:
def f_score(B, P, R):
    """Return the F-beta score ``(1 + B^2) * P * R / (B^2 * P + R)``.

    Args:
        B: beta, the weight of recall relative to precision (1 gives F1, 2 gives F2).
        P: precision.
        R: recall.

    Returns:
        The F-beta score. When the denominator is exactly zero (precision and
        recall both 0) the score is defined as 0.0 instead of raising
        ZeroDivisionError or producing a 0/0 warning. NaN inputs still
        propagate to a NaN result.
    """
    beta_sq = B * B
    denominator = beta_sq * P + R
    # Only guard the scalar case; array inputs keep element-wise behaviour.
    if np.ndim(denominator) == 0 and denominator == 0:
        return 0.0
    return ((1 + beta_sq) * P * R) / denominator

In [None]:
# Min-max scale the test features. normalize() reads no instance state, so it is
# called through the class: on a top-to-bottom run no `model` instance exists yet
# at this point, and `model.normalize(...)` would raise NameError.
# NOTE(review): the scaling uses the test set's own min/max rather than the
# training set's — confirm this is intended.
test_data = logistic_regressor.normalize(None, test_data)

Defining and Fitting The Model

Alpha = 1e-5

In [None]:
# Full-batch training: learning rate 1e-5, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-5,
    num_epochs=1000,
)
costs = model.fit()  # fit() returns the parameter vector it learned
model.thetas

array([[-0.16709404],
       [-0.50547407],
       [-0.10840616],
       [ 0.06074037]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.6166'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[  0  69]
 [  0 110]]
Accuracy: 0.615
Precision: nan
Recall: 0.000
Negative Predictive Value: 0.615
False Positive Rate: 0.385
F1 and F2 scores: nan, nan


  after removing the cwd from sys.path.


Alpha = 1e-4

In [None]:
# Full-batch training: learning rate 1e-4, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-4,
    num_epochs=1000,
)
costs = model.fit()  # fit() returns the parameter vector it learned
model.thetas

array([[ 0.57219593],
       [-1.45638612],
       [-0.48654093],
       [ 0.44683419]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.6840'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[27 42]
 [19 91]]
Accuracy: 0.659
Precision: 0.587
Recall: 0.391
Negative Predictive Value: 0.684
False Positive Rate: 0.316
F1 and F2 scores: 0.470, 0.419


Alpha = 1e-3

In [None]:
# Full-batch training: learning rate 1e-3, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-3,
    num_epochs=1000,
)
costs = model.fit()  # fit() returns the parameter vector it learned
model.thetas

array([[ 1.70102622],
       [-1.97755776],
       [-2.73013922],
       [ 1.28611756]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.7121'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[31 38]
 [16 94]]
Accuracy: 0.698
Precision: 0.660
Recall: 0.449
Negative Predictive Value: 0.712
False Positive Rate: 0.288
F1 and F2 scores: 0.534, 0.480


Alpha = 1e-2

In [None]:
# Full-batch training: learning rate 1e-2, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-2,
    num_epochs=1000,
)
costs = model.fit()  # fit() returns the parameter vector it learned
model.thetas

array([[ 2.61221239],
       [-1.99087256],
       [-3.21911101],
       [ 1.71843768]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.6601'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[50 19]
 [34 76]]
Accuracy: 0.704
Precision: 0.595
Recall: 0.725
Negative Predictive Value: 0.800
False Positive Rate: 0.200
F1 and F2 scores: 0.654, 0.694


Alpha = 1e-1

In [None]:
# Full-batch training: learning rate 1e-1, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-1,
    num_epochs=1000,
)
costs = model.fit()  # fit() returns the parameter vector it learned
model.thetas

array([[  0.05423731],
       [-40.58875457],
       [-38.83425394],
       [ 28.58597096]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.6264'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[  5  64]
 [  2 108]]
Accuracy: 0.631
Precision: 0.714
Recall: 0.072
Negative Predictive Value: 0.628
False Positive Rate: 0.372
F1 and F2 scores: 0.132, 0.088


# 3.2

**Mini-batch with batch size 100**

In [None]:
# Mini-batch training: batches of 100 rows, learning rate 1e-2, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-2,
    num_epochs=1000,
)
model.mini_batch_train(m=100)

array([[ 1.78858485],
       [-1.97215461],
       [-2.94735654],
       [ 1.6090071 ]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.7051'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[33 36]
 [16 94]]
Accuracy: 0.709
Precision: 0.673
Recall: 0.478
Negative Predictive Value: 0.723
False Positive Rate: 0.277
F1 and F2 scores: 0.559, 0.508


**Stochastic (batch size 1)**

In [None]:
# Per-sample updates (batch size 1): learning rate 1e-3, 1000 epochs.
model = logistic_regressor(
    features=train_data,
    labels=labels,
    alpha=1e-3,
    num_epochs=1000,
)
model.mini_batch_train(m=1)

array([[ 1.59864208],
       [-1.90497177],
       [-2.6494684 ],
       [ 1.75589173]])

In [None]:
# Training accuracy of the current model, shown with four decimals.
"{0:.4f}".format(model.training_accuracy())

'0.7093'

In [None]:
matrix = model.score(test_data.to_numpy(), test_labels.to_numpy())
# score() fills the matrix as rows = actual class (1, 0), columns = predicted class (1, 0).
(tp, fn), (fp, tn) = matrix
print("Confusion Matrix:\n", matrix)
print("Accuracy: %.3f"%((tp + tn) / np.sum(matrix)))
precision = tp / (tp + fp)
print("Precision: %.3f"%precision)
recall = tp / (tp + fn)
print("Recall: %.3f"%recall)
print("Negative Predictive Value: %.3f"%(tn / (tn + fn)))
# False positive rate is FP / (FP + TN); the earlier FN / (FN + TN) was the
# false omission rate, i.e. 1 - NPV.
print("False Positive Rate: %.3f"%(fp / (fp + tn)))
print("F1 and F2 scores: %.3f, %.3f" % (f_score(1, precision, recall), f_score(2, precision, recall)))

Confusion Matrix:
 [[32 37]
 [17 93]]
Accuracy: 0.698
Precision: 0.653
Recall: 0.464
Negative Predictive Value: 0.715
False Positive Rate: 0.285
F1 and F2 scores: 0.542, 0.492
