In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import timeit
from numpy.linalg import inv

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load housing.csv into dataframe object
iris_data = load_data("dataset/iris.csv")

#have a look at the data
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
#no. of features
n = 4 
m = iris_data.shape[0]

X = np.ones((m,n + 1))
y = np.array((m,1))

X[:,1] = iris_data['sepal_length'].values
X[:,2] = iris_data['sepal_width'].values
X[:,3] = iris_data['petal_length'].values
X[:,4] = iris_data['petal_width'].values

#Store labels
y = iris_data['species'].values

#Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 11)

#Store species labels
Species = ['setosa','virginica','versicolor']

In [5]:
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

#Calculate Hessian matrix and it's inverse
theta_mat = np.random.randn(5,1)
p_x = sigmoid(X_train.dot(theta_mat))
p_x_transpose = (1-p_x).T

W = p_x.dot(p_x_transpose)
np.shape(W)

for i in range(len(W)):
    for j in range(len(W)):
        if(i!=j):
            W[i][j]=0
X_transpose = X_train.T
Hessian = X_transpose.dot(W.dot(X_train))
Hessian_inv = inv(Hessian)

In [6]:
#Methods for Logistic Regression Newton's Method
def Gradient_newton(theta, X, y):
    m, n = X.shape
    theta = theta.reshape((n, 1))
    y = y.reshape((m, 1))
    h = sigmoid(X.dot(theta))
    

    return ((1 / m) * X.T.dot(h - y)) 

def logisticRegression_hessian(X, y, B):
    B = B - Hessian_inv.dot(Gradient_newton(B,X,y))
    return B     

In [7]:
#to store theta for all three class labels
all_theta = np.zeros((3, n + 1))

#Find current time
start_time = timeit.default_timer()

i = 0
for flower in Species:
    #set the labels in 0 and 1
    tmp_y = np.array(y_train == flower, dtype = int)
    optTheta = logisticRegression_hessian(X_train, tmp_y, np.zeros((n + 1,1)))
    all_theta[i] = list(optTheta)
    i += 1

#Calculate training time
end_time = timeit.default_timer()
training_time = end_time - start_time
print("Training time: ", training_time, "seconds")

Training time:  0.000720042000466492 seconds


In [8]:
#Calculate probability for each flower
P = sigmoid(X_test.dot(all_theta.T)) 
p = [Species[np.argmax(P[i, :])] for i in range(X_test.shape[0])]

print("Accuracy: ", accuracy_score(y_test, p) * 100 , '%')

Accuracy:  62.0 %
