# Logistic Regression Classifier for Breast Tissue Image Analysis


In [3]:
import pandas as pd
# Load the training CSV with pandas purely to preview its column names and first rows.
# NOTE(review): the preview shows a leading 'Unnamed: 0' column whose values look like a
# saved row index — confirm whether it should be treated as a feature downstream.
head_data = pd.read_csv('LRTrain.csv')
head_data.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,12.18,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.0177,0.1739,0.05677,...,32.84,84.58,547.8,0.1123,0.08862,0.1145,0.07431,0.2694,0.06878,0
1,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,...,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772,1
2,14.76,14.74,94.87,668.7,0.08875,0.0778,0.04608,0.03528,0.1521,0.05912,...,17.93,114.2,880.8,0.122,0.2009,0.2151,0.1251,0.3109,0.08187,0
3,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,...,34.85,115.0,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026,1
4,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,0.1945,0.06322,...,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849,0


In [15]:
# (rows, columns) of the training table as loaded by pandas
head_data.shape

(300, 31)

In [24]:
import numpy as np
from numpy import genfromtxt

### Question 1 ###


# Read the training CSV into a plain float array, skipping the header row.
# NOTE(review): the pandas preview of this file shows a leading 'Unnamed: 0' column;
# if that is a saved row index it is being used as a feature below — confirm.
data = genfromtxt('LRTrain.csv', delimiter=',', skip_header=1)

# Problem size: n observations (300 rows), d feature columns (30);
# the last column of the file holds the label.
n, d = data.shape[0], data.shape[1] - 1

# Gradient-descent settings
T = 2000      # maximum number of iterations
eps = 1e-4    # stop once the gradient norm drops below this tolerance
step = 1e-5   # step size (learning rate)

# Feature matrix: every column except the last
x = data[:, :d]

# Label vector: the last column
y = data[:, d]

# Weight vector, starting from all zeros
w = np.zeros(d)

## Define gradient function

def grad(w,x,y):
    """Gradient of the average logistic-regression negative log-likelihood.

    Computes (1/n) * sum_i (sigmoid(x_i . w) - y_i) * x_i, where n is the
    number of rows of ``x``.  The sizes are taken from ``x`` itself rather
    than from module-level variables, so the function gives correct results
    for any dataset (e.g. after a later cell rebinds ``n``/``d`` to the test set).

    Parameters
    ----------
    w : array-like, shape (d,)
        Weight vector.
    x : array-like, shape (n, d)
        Feature matrix, one observation per row.
    y : array-like, shape (n,)
        Labels aligned with the rows of ``x`` (the formula treats them as 0/1).

    Returns
    -------
    numpy.ndarray, shape (d,)
        The averaged gradient.

    Raises
    ------
    ValueError
        If ``x`` contains no observations.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)

    num_obs = x.shape[0]
    if num_obs == 0:
        raise ValueError("grad requires at least one observation")

    # Linear scores for all observations at once
    z = x @ w

    # sigmoid(z) written as 0.5 * (1 + tanh(z / 2)): mathematically identical to
    # 1 / (1 + exp(-z)) but never overflows for large |z|
    p = 0.5 * (1.0 + np.tanh(0.5 * z))

    # Sum of (p_i - y_i) * x_i over observations, averaged
    return (x.T @ (p - y)) / num_obs

## Define negative log likelihood function
def fval(w,x,y):
    """Average negative log-likelihood of logistic regression.

    Computes (1/n) * sum_i [ y_i * log(1 + exp(-x_i . w))
                             + (1 - y_i) * log(1 + exp(x_i . w)) ],
    where n is the number of rows of ``x`` (taken from ``x`` itself, not from
    a module-level variable).

    Parameters
    ----------
    w : array-like, shape (d,)
        Weight vector.
    x : array-like, shape (n, d)
        Feature matrix, one observation per row.
    y : array-like, shape (n,)
        Labels aligned with the rows of ``x`` (the formula treats them as 0/1).

    Returns
    -------
    float
        The averaged negative log-likelihood.

    Raises
    ------
    ValueError
        If ``x`` contains no observations.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)

    num_obs = x.shape[0]
    if num_obs == 0:
        raise ValueError("fval requires at least one observation")

    z = x @ w

    # log(1 + exp(a)) == logaddexp(0, a).  The naive form overflows to inf for
    # large a, and a zero label weight times inf then yields nan; logaddexp
    # stays finite.
    per_obs = y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)

    return per_obs.sum() / num_obs

## Define norm of the gradient 

def gradnorm(w,x,y):
    """Return the Euclidean norm of the gradient ``grad(w, x, y)``.

    Used as the convergence measure for gradient descent: a small norm means
    the weights are close to a stationary point of the objective.
    """
    gradient = grad(w, x, y)
    return np.linalg.norm(gradient)

## Perform gradient descent

# Gradient at the starting weights.  Inside the loop the gradient is evaluated
# exactly once per iteration (at the freshly updated weights) and reused for the
# printed norm, the convergence check, and the next update, instead of being
# recomputed three times.
# NOTE(review): the recorded log shows the negative log likelihood oscillating
# rather than decreasing steadily, which may mean `step` is large relative to the
# unscaled features — consider standardising the features or lowering `step`.
g = grad(w,x,y)

for t in range(T):

    # update weights using the gradient at the current weights
    w = w - step * g

    # gradient and its norm at the new weights
    g = grad(w,x,y)
    g_norm = np.linalg.norm(g)

    # print output for each iteration
    print("Step count: " + str(t) + ", Negative log likelihood: " + str(fval(w,x,y)) + ", gradient norm: " + str(g_norm))

    # stop once the gradient norm falls below the tolerance
    if g_norm < eps:
        break

# save final weight vector
w_hat = w

Step count: 0, Negative log likelihood: 0.7622662841577311, gradient norm: 214.88106355941048
Step count: 1, Negative log likelihood: 1.014339210022373, gradient norm: 442.81294701192996
Step count: 2, Negative log likelihood: 1.5196777816686733, gradient norm: 405.81607456207604
Step count: 3, Negative log likelihood: 0.8457544189386083, gradient norm: 363.5324367898384
Step count: 4, Negative log likelihood: 1.3334391723291432, gradient norm: 389.6151342191855
Step count: 5, Negative log likelihood: 0.931339487920303, gradient norm: 423.3718506787496
Step count: 6, Negative log likelihood: 1.4495023905657403, gradient norm: 400.78007277605604
Step count: 7, Negative log likelihood: 0.8242431480898115, gradient norm: 369.41240888657137
Step count: 8, Negative log likelihood: 1.3091637817114496, gradient norm: 387.74177666470194
Step count: 9, Negative log likelihood: 0.8797604473058311, gradient norm: 412.36203687528047
Step count: 10, Negative log likelihood: 1.389358194092492, gradi

In [None]:
### Question 2 ###

# Draft evaluation cell: classify the test set at several thresholds and mark,
# per threshold and per observation, which confusion-matrix cell it falls in.

# Load test data
data = genfromtxt('LRTest.csv', delimiter=',',skip_header = 1)

n = data.shape[0]
d = data.shape[1]-1

# get feature data
x = data[:,0:d]

# get label data
y = data[:,d]

# set thresholds
t = np.arange(0, 1.1, 0.1) # from 0 to 1

# predicted probability of the positive class for every test row, using the
# trained weights w_hat
prob = 1/(1+np.exp(-np.dot(x, w_hat)))

# pred[k, i] = 1 when row i is classified positive at threshold t[k], else 0.
# `t` is an array, so a plain `if prob_i > t:` raises "truth value of an array
# is ambiguous"; broadcasting compares every probability with every threshold.
pred = (prob[np.newaxis, :] > t[:, np.newaxis]).astype(float)

# Indicator arrays of shape (len(t), n): entry [k, i] is 1 when observation i
# falls in that confusion-matrix cell at threshold t[k]
TP = ((pred == 1) & (y == 1)).astype(float) # true positives
FP = ((pred == 1) & (y == 0)).astype(float) # false positives: true label = 0 but classified as 1
TN = ((pred == 0) & (y == 0)).astype(float) # true negatives
FN = ((pred == 0) & (y == 1)).astype(float) # false negatives: true label = 1 but classified as 0

# Now use TP, FP, TN, and FN to calculate TPR, FPR, TNR, and FNR
# (summing each array along axis 1 gives the per-threshold counts):
        

In [23]:
### Question 2 ###

# Evaluate the trained weights w_hat on the test set: for each threshold,
# classify every row and report the four confusion-matrix rates.

# Load test data
data = genfromtxt('LRTest.csv', delimiter=',', skip_header=1)

# n test observations, d feature columns; the last column is the label
n, d = data.shape[0], data.shape[1] - 1
x = data[:, :d]
y = data[:, d]

# thresholds from 0 to 1 in steps of 0.1
thresholds = np.arange(0, 1.1, 0.1)

# Predicted probability for each test row.  These do not depend on the
# threshold, so they are computed once, row by row, before the sweep.
probs = np.array([1 / (1 + np.exp(-np.dot(w_hat, row))) for row in x])

# Ground-truth masks
actual_pos = (y == 1)
actual_neg = (y == 0)

for t in thresholds:
    # classify as 1 only when the probability is strictly above the threshold
    pred = np.where(probs > t, 1.0, 0.0)
    called_pos = (pred == 1)
    called_neg = (pred == 0)

    TP = int(np.sum(called_pos & actual_pos))  # true label = 1 and classified as 1
    FP = int(np.sum(called_pos & actual_neg))  # true label = 0 but classified as 1
    TN = int(np.sum(called_neg & actual_neg))  # true label = 0 and classified as 0
    FN = int(np.sum(called_neg & actual_pos))  # true label = 1 but classified as 0

    # Rates, each defaulting to 0 when its denominator is empty
    TP_rate = TP / (TP + FN) if TP + FN > 0 else 0
    FP_rate = FP / (FP + TN) if FP + TN > 0 else 0
    TN_rate = TN / (TN + FP) if TN + FP > 0 else 0
    FN_rate = FN / (FN + TP) if FN + TP > 0 else 0

    print(f"Threshold: {t:.1f}, TP_rate: {TP_rate:.3f}, FP_rate: {FP_rate:.3f}, TN_rate: {TN_rate:.3f}, FN_rate: {FN_rate:.3f}")

Threshold: 0.0, TP_rate: 1.000, FP_rate: 1.000, TN_rate: 0.000, FN_rate: 0.000
Threshold: 0.1, TP_rate: 0.980, FP_rate: 0.585, TN_rate: 0.415, FN_rate: 0.020
Threshold: 0.2, TP_rate: 0.959, FP_rate: 0.216, TN_rate: 0.784, FN_rate: 0.041
Threshold: 0.3, TP_rate: 0.908, FP_rate: 0.105, TN_rate: 0.895, FN_rate: 0.092
Threshold: 0.4, TP_rate: 0.888, FP_rate: 0.058, TN_rate: 0.942, FN_rate: 0.112
Threshold: 0.5, TP_rate: 0.857, FP_rate: 0.041, TN_rate: 0.959, FN_rate: 0.143
Threshold: 0.6, TP_rate: 0.847, FP_rate: 0.029, TN_rate: 0.971, FN_rate: 0.153
Threshold: 0.7, TP_rate: 0.786, FP_rate: 0.012, TN_rate: 0.988, FN_rate: 0.214
Threshold: 0.8, TP_rate: 0.735, FP_rate: 0.006, TN_rate: 0.994, FN_rate: 0.265
Threshold: 0.9, TP_rate: 0.714, FP_rate: 0.000, TN_rate: 1.000, FN_rate: 0.286
Threshold: 1.0, TP_rate: 0.000, FP_rate: 0.000, TN_rate: 1.000, FN_rate: 1.000
