In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from numpy.linalg import inv
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score, roc_auc_score

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load D9.csv into dataframe object
q3 = load_data("dataset/D9.csv")

#have a look at the info of the data
q3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Data columns (total 8 columns):
Unnamed: 0    2254 non-null int64
1             2254 non-null float64
2             2254 non-null float64
3             2254 non-null float64
4             2254 non-null float64
5             2254 non-null float64
6             2254 non-null float64
7             2254 non-null int64
dtypes: float64(6), int64(2)
memory usage: 141.0 KB


In [4]:
#rename the unnamed column as 'id'
q3.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [5]:
#no. of features
n = 6 
m = q3.shape[0]

X = np.ones((m,n + 1))
y = np.array((m,1))

X[:,1] = q3['1'].values
X[:,2] = q3['2'].values
X[:,3] = q3['3'].values
X[:,4] = q3['4'].values
X[:,5] = q3['5'].values
X[:,6] = q3['6'].values

#Store Labels
y = q3['7'].values

#Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 11)

#Store species labels
class_labels = [0,1]

In [6]:
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

#Calculate Hessian matrix and it's inverse
theta_mat = np.random.randn(7,1)
p_x = sigmoid(X_train.dot(theta_mat))
p_x_transpose = (1-p_x).T

W = p_x.dot(p_x_transpose)
np.shape(W)

for i in range(len(W)):
    for j in range(len(W)):
        if(i!=j):
            W[i][j]=0
X_transpose = X_train.T
Hessian = X_transpose.dot(W.dot(X_train))
Hessian_inv = inv(Hessian)

In [7]:
#Methods for Logistic Regression Newton's Method
def Gradient_newton(theta, X, y):
    m, n = X.shape
    theta = theta.reshape((n, 1))
    y = y.reshape((m, 1))
    h = sigmoid(X.dot(theta))
    return ((1 / m) * X.T.dot(h - y)) 

def logisticRegression_hessian(X, y,B):
    B = B - Hessian_inv.dot(Gradient_newton(B,X,y))
    return B 

In [8]:
#to store theta for all three class labels
all_theta = np.zeros((3, n + 1))

i = 0
for label in class_labels:
    #set the labels in 0 and 1
    tmp_y = np.array(y_train == label, dtype = int)
    optTheta = logisticRegression_hessian(X_train, tmp_y, np.zeros((n + 1,1)))
    all_theta[i] = list(optTheta)
    i += 1    

In [11]:
#Calculate probability for each label
P = sigmoid(X_test.dot(all_theta.T)) 
p = [class_labels[np.argmax(P[i, :])] for i in range(X_test.shape[0])]

In [12]:
#Calculate accuracy of the model
accuracy = accuracy_score(y_test,p,normalize=True, sample_weight=None) * 100
print("Accuracy: ", accuracy, "%")

Accuracy:  81.98924731182797 %


In [13]:
#Calculate Precision of the model
precision = precision_score(y_test,p,sample_weight=None)
print("Precision: ", precision)

Precision:  0.8357988165680473


In [14]:
#Calculate recall of the model
recall = recall_score(y_test,p,sample_weight=None)
print("Recall: ", recall)

Recall:  0.9608843537414966


In [15]:
#Calculate F1 score of the model
f1 = f1_score(y_test,p,sample_weight=None)
print("F1 score: ", f1)

F1 score:  0.8939873417721519
