In [None]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### COMMON METHODS

In [None]:
def update_parameters(w_old,grad,step_size):
    """Take one gradient-ascent step.

    Returns ``w_old + step_size*grad`` as a new value; neither argument is
    modified by this function.
    """
    ascent_step = step_size*grad
    return w_old + ascent_step

In [None]:
def calculate_gradient(X,y,w,algorithm):
    """Gradient used by the gradient-ascent loop.

    A column of ones is prepended to X so that the first entry of w acts as
    the intercept.

    Parameters
    ----------
    X : 2-D array with one row per sample (no bias column).
    y : targets; must be broadcast-compatible with ``X_augmented.dot(w)``.
    w : parameters, intercept first, with one more row than X has columns.
    algorithm : 'logreg' or 'linreg' (case-insensitive).

    Returns
    -------
    ``X_augmented.T.dot(y - prediction)``, where prediction is the sigmoid of
    the linear score for 'logreg' and the linear score itself for 'linreg'.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the two supported names.
    """
    mode = algorithm.lower()
    if mode not in ('logreg', 'linreg'):
        raise ValueError('Algorithm type unclear!')

    m = X.shape[0]
    X_augmented = np.concatenate((np.ones((m,1)),X),axis=1)
    scores = X_augmented.dot(w)
    if mode == 'logreg':
        # exp(z)/(1+exp(z)) evaluates to inf/inf = nan once exp(z) overflows.
        # 0.5*(1+tanh(z/2)) is the same sigmoid and stays finite for every z.
        prediction = 0.5*(1.0 + np.tanh(0.5*scores))
    else:
        prediction = scores

    error_in_estimate = y - prediction
    return (X_augmented.T).dot(error_in_estimate)
    

In [None]:
def gradient_ascent(X,y,tolerance,w_start,step_size,algorithm,max_iter=None):
    """Run fixed-step gradient ascent until the gradient norm is at most ``tolerance``.

    Prints the gradient norm after every update and, once the loop has ended,
    plots the recorded norms against the iteration number.

    Parameters
    ----------
    X, y : training inputs and targets, forwarded to ``calculate_gradient``.
    tolerance : the loop stops as soon as ``||grad|| <= tolerance``.
    w_start : initial parameters.
    step_size : constant multiplier applied to the gradient at each update.
    algorithm : forwarded to ``calculate_gradient``.
    max_iter : optional cap on the number of updates. The default ``None``
        keeps the original behaviour of iterating until the tolerance is met.

    Returns
    -------
    The parameters after the last update (``w_start`` itself if no update ran).
    """
    iteration = 0
    w_current = w_start
    current_grad = calculate_gradient(X,y,w_start,algorithm)
    current_grad_norm = np.linalg.norm(current_grad)
    grad_norms = [current_grad_norm]

    while current_grad_norm > tolerance:
        if max_iter is not None and iteration >= max_iter:
            print(f'Stopping: reached max_iter={max_iter} with ||grad|| : {current_grad_norm}')
            break
        w_current = update_parameters(w_current,current_grad,step_size)
        current_grad = calculate_gradient(X,y,w_current,algorithm)
        current_grad_norm = np.linalg.norm(current_grad)
        grad_norms.append(current_grad_norm)
        iteration +=1
        print(f'Iteration: {iteration}. ||grad|| : {current_grad_norm}')

    # A NaN norm fails the `>` comparison above, so without this check a
    # diverged run would end the loop exactly like a converged one.
    if not np.isfinite(current_grad_norm):
        print('Warning: gradient norm is not finite; the iteration diverged (try a smaller step_size).')

    plt.plot(list(range(iteration+1)),grad_norms)
    plt.xlabel('iteration')
    plt.ylabel('Grad_Norm')
    plt.grid()
    plt.show()
    return w_current

In [None]:
def makePredictions(parameters_estimated,X,algorithm):
    """Predict from fitted parameters.

    A bias column of ones is prepended to X before taking the dot product
    with ``parameters_estimated``.

    'logreg' returns 1 where the linear score is strictly positive and 0
    elsewhere (as ints); 'linreg' returns the linear score itself. Any other
    algorithm name raises Exception.
    """
    bias_column = np.ones((X.shape[0],1))
    design = np.concatenate((bias_column,X),axis=1)
    mode = algorithm.lower()
    if mode == 'logreg':
        return (design.dot(parameters_estimated) > 0).astype(int)
    if mode == 'linreg':
        return design.dot(parameters_estimated)
    raise Exception('Algorithm type unclear!')

### LINEAR REGRESSION

In [None]:
# Generate Data For Linear Regression
def generateDataForLinReg(m,n,variance,d):
    '''Draw a synthetic linear-regression data set.

    Features are uniform in the box [-d, d)^n, the true parameters are
    standard normal, and y = w0 + w1x1 + ... + wnxn + noise with
    noise ~ Normal(0, variance). When n == 1 the noiseless line and the noisy
    samples are also drawn with matplotlib.

    Input:  m - number of instances
            n - number of features
            variance - variance (not standard deviation) of the additive noise
            d - half-width of the feature box
    Output: Z - mxn, the data points (without the bias column)
            y - mx1, the noisy targets
            w - (n+1)x1, the actual parameters of the line used (w0,...,wn)
    '''
    Z = d*(2*np.random.random((m,n)) - 1)
    X = np.concatenate((np.ones((m,1)),Z),axis=1)
    w = np.random.randn(n+1,1)
    # Draw exactly m noise values in one call. Building the noise from two
    # halves of m//2 rows left it one row short whenever m was odd.
    noise = np.sqrt(variance)*np.random.randn(m,1)
    y = X.dot(w) + noise
    if n==1:
        plt.plot(Z,X.dot(w),'-b')
        plt.scatter(Z,y)
        plt.grid()
        plt.xlabel('x1')
        plt.ylabel('x2')
    return (Z,y,w)

In [None]:
def plotPredicted(X,y,params_estimated):
    """Scatter the samples and overlay the fitted line.

    Only single-feature data can be drawn: an X with more than one column
    raises Exception before anything is plotted.
    """
    n_rows, n_cols = X.shape
    if n_cols > 1:
        raise Exception('Cannot plot multi-dimensional data!')
    design = np.concatenate((np.ones((n_rows,1)),X),axis=1)
    fitted = design.dot(params_estimated)
    plt.plot(X,fitted,'-b')
    plt.scatter(X,y)
    plt.grid()
    plt.title('Line of best fit')
    plt.xlabel('x1')
    plt.ylabel('x2')

In [None]:
def getRMSEscore(y_pred,y_actual):
    """Root-mean-square error between predictions and targets.

    The sample count is taken from ``y_pred.size``.
    """
    residual = y_pred - y_actual
    scaled_squares = residual**2/y_pred.size
    return np.sqrt(np.sum(scaled_squares))

In [None]:
# Set parameters for linear regression and draw the synthetic training set.
# m: number of samples; n: number of features (the plotting helpers only draw n == 1).
m=200
n=1
# NOTE: sigma is passed below as the `variance` argument of generateDataForLinReg,
# so it is used as the noise variance, not the standard deviation.
sigma = 2
# d: features are drawn uniformly from [-d, d).
d=10
XTrain,yTrain,theta_actual = generateDataForLinReg(m,n,sigma,d)
# NOTE(review): X_augmented is not read by any cell visible here - confirm before removing.
X_augmented = np.concatenate((np.ones((m,1)),XTrain),axis=1)

In [None]:
# Run linear regression via gradient ascent.
# tol: the loop stops once the gradient norm is at most this value.
tol = 1e-3
n = XTrain.shape[1]
# Random initial parameters: one intercept plus one weight per feature.
w_0 = np.random.randn(n+1,1)
# Constant step size applied at every update.
step = 1e-5
algorithm = 'linreg'
w_estimate = gradient_ascent(XTrain,yTrain,tol,w_0,step,algorithm)
print(f'Estimated optimal solution: {w_estimate}')

In [None]:
# Make predictions on the training inputs with the fitted parameters.
y_pred_train = makePredictions(w_estimate,XTrain,algorithm)

In [None]:
# Check accuracy: report the training RMSE, then plot the fitted line over the samples.
rmse = getRMSEscore(y_pred_train,yTrain)
print(f'RMSE: {rmse}')
plotPredicted(XTrain,yTrain,w_estimate)

### LOGISTIC REGRESSION

In [None]:
# Generate Data For Logistic Regression
def generateDataForLogReg():
    """Load the logistic-regression train/test split from 'logRegData.mat'.

    Prints the training-set dimensions (n rows, p columns) and the number of
    test rows m, then returns (XTrain, yTrain, XTest, yTest).
    """
    mat_contents = loadmat('logRegData.mat')
    XTrain, XTest = mat_contents['XTrain'], mat_contents['XTest']
    yTrain, yTest = mat_contents['yTrain'], mat_contents['yTest']
    n,p = XTrain.shape
    m,_ = XTest.shape
    print(f'n = {n}, p = {p}, m = {m}')
    return(XTrain,yTrain,XTest,yTest)

In [None]:
def getErrorRate(y_pred,y_actual):
    """Mean absolute difference between predicted and actual labels.

    For 0/1 labels this equals the fraction of misclassified samples. The
    sample count is taken from ``y_pred.size``.
    """
    abs_diff = np.abs(y_pred - y_actual)
    total_abs_diff = np.sum(abs_diff)
    return total_abs_diff/y_pred.size

In [None]:
def getConfusionMatrix(yPred,yActual):
    '''
    Returns the one-vs-rest confusion counts for every class found in yActual.

    INPUT:  yPred, yActual - label arrays holding the same number of elements.
            Any shape is accepted (1xm, mx1 or flat); both are flattened first.
    OUTPUT: ConfMat is a 2x2xK numpy array, where K is the number of distinct
            labels in yActual and class k is the k-th label in np.unique order:
              ConfMat[0,0,k] are TP for class k
              ConfMat[0,1,k] are FP for class k
              ConfMat[1,0,k] are TN for class k
              ConfMat[1,1,k] are FN for class k
    RAISES: ValueError if the two inputs hold a different number of labels.
    '''
    actual = np.ravel(yActual)
    predicted = np.ravel(yPred)
    if actual.size != predicted.size:
        raise ValueError('yPred and yActual must contain the same number of labels')

    classes = np.unique(actual)
    K = classes.size
    ConfMat = np.zeros((2,2,K))
    for k, label in enumerate(classes):
        # Compare against the label value itself: comparing with the loop
        # index k is only correct when the labels happen to be 0..K-1.
        is_actual = actual == label
        is_pred = predicted == label
        ConfMat[0,0,k] = np.sum(is_actual & is_pred)      # True Positive
        ConfMat[0,1,k] = np.sum(~is_actual & is_pred)     # False Positive
        ConfMat[1,0,k] = np.sum(~is_actual & ~is_pred)    # True Negative
        ConfMat[1,1,k] = np.sum(is_actual & ~is_pred)     # False Negative
    return ConfMat
            

In [None]:
def getPrecisionRecallF1Scores(confusionMat):
    '''
    Finds the precision, recall and f1-score for all classes based on the confusion matrix.

    INPUT:  confusionMat, a 2x2xK numpy array read as
            [0,0,k]=TP, [0,1,k]=FP, [1,0,k]=TN, [1,1,k]=FN for class k.
    OUTPUT: scores is a 1x3xK numpy array where scores[0,0,k] is precision for
            class k, scores[0,1,k] is recall for class k and scores[0,2,k] is
            f1-score for class k. A score whose denominator is zero (no
            predicted positives, no actual positives, or precision+recall == 0)
            is reported as 0 instead of nan.
    '''
    K = confusionMat.shape[2]
    scores = np.zeros((1,3,K))
    for k in range(K):
        TP = confusionMat[0,0,k]
        FP = confusionMat[0,1,k]
        FN = confusionMat[1,1,k]
        # TN ([1,0,k]) does not enter any of the three scores.
        predicted_positive = TP+FP
        actual_positive = TP+FN
        precision = TP/predicted_positive if predicted_positive > 0 else 0.0
        recall = TP/actual_positive if actual_positive > 0 else 0.0
        # Harmonic mean written as 2pr/(p+r), so a zero precision or recall
        # never needs the 1/0 that the reciprocal form would evaluate.
        pr_sum = precision+recall
        f1 = 2*precision*recall/pr_sum if pr_sum > 0 else 0.0
        scores[0,0,k] = precision
        scores[0,1,k] = recall
        scores[0,2,k] = f1

    return scores

In [None]:
# Load the training and test data for logistic regression.
(XTrain,yTrain,XTest,yTest) = generateDataForLogReg()
# Disabled alternative kept for reference: build the arrays from a Titanic CSV instead.
# NOTE(review): the 'Sex' lambda below returns 0 on both branches, and csv_path is a
# machine-specific absolute path - fix both before re-enabling.
# csv_path = "C://Users/saurmisr/Downloads/titanic.csv"
# data = pd.read_csv(csv_path)
# X = data.drop(labels=['Survived','Name'],axis=1)
# X['Sex']=X['Sex'].apply(lambda X:0 if X=='male' else 0)
# y = data['Survived']
# XTrain = X.to_numpy()
# yTrain = y.to_numpy()
# XTest = XTrain
# yTest = yTrain

In [None]:
# Run logistic regression via gradient ascent.

# tol: the loop stops once the gradient norm is at most this value.
tol = 1e-3
p = XTrain.shape[1]
# Random initial parameters: one intercept plus one weight per feature.
w_0 = np.random.randn(p+1,1)
# Constant step size applied at every update.
step = 1e-4
# Disabled alternative start: all-zero parameters.
# w_0 = np.zeros((p+1,1))
algorithm = 'logreg'
w_estimate = gradient_ascent(XTrain,yTrain,tol,w_0,step,algorithm)
print(f'Estimated optimal solution: {w_estimate}')

In [None]:
# Make 0/1 predictions for the training and test inputs with the fitted parameters.
algorithm = 'logreg'
y_pred_train = makePredictions(w_estimate,XTrain,algorithm)
y_pred_test = makePredictions(w_estimate,XTest,algorithm)

In [None]:
# Check accuracy: error rate on the training and test sets, printed as
# (train, test) tuples, followed by the complementary success rates.
error_rate_train = getErrorRate(y_pred_train,yTrain)
error_rate_test = getErrorRate(y_pred_test,yTest)
print(f'Error rates : {(error_rate_train,error_rate_test)}')
print(f'Success rates : {(1-error_rate_train,1-error_rate_test)}')

In [None]:
# Per-class precision / recall / f1 on the training set.
# scores[0,j,k]: j = 0 precision, 1 recall, 2 f1; k is the class index.
# NOTE(review): printing index k as "Label k" assumes the labels are exactly 0 and 1 - confirm.
CMat = getConfusionMatrix(y_pred_train,yTrain)
scores = getPrecisionRecallF1Scores(CMat)
print(f'PRECISION FOR Label 1: {scores[0,0,1]}')
print(f'Recall FOR Label 1: {scores[0,1,1]}')
print(f'f1-score FOR Label 1: {scores[0,2,1]}')
print(f'PRECISION FOR Label 0: {scores[0,0,0]}')
print(f'Recall FOR Label 0: {scores[0,1,0]}')
print(f'f1-score FOR Label 0: {scores[0,2,0]}')