In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load D9.csv into dataframe object
q3 = load_data("dataset/D9.csv")

#have a look at the info of the data
q3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Data columns (total 8 columns):
Unnamed: 0    2254 non-null int64
1             2254 non-null float64
2             2254 non-null float64
3             2254 non-null float64
4             2254 non-null float64
5             2254 non-null float64
6             2254 non-null float64
7             2254 non-null int64
dtypes: float64(6), int64(2)
memory usage: 141.0 KB


In [4]:
#rename the unnamed column as 'id'
q3.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [5]:
#no. of features
n = 6 
m = q3.shape[0]

X = np.ones((m,n + 1))
y = np.array((m,1))

X[:,1] = q3['1'].values
X[:,2] = q3['2'].values
X[:,3] = q3['3'].values
X[:,4] = q3['4'].values
X[:,5] = q3['5'].values
X[:,6] = q3['6'].values

#Store Labels
y = q3['7'].values

#Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

#Store species labels
class_labels = [0,1]

In [6]:
#Methods for logistic regression gradient descent
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

def Gradient(theta, X, y):
    m, n = X.shape
    theta = theta.reshape((n, 1))
    y = y.reshape((m, 1))
    h = sigmoid(X.dot(theta))
    return ((1 / m) * X.T.dot(h - y)) 

def logisticRegression(X, y,B,itr):
    for i in range(itr):
        B = B - 0.01 * Gradient(B,X,y)
    return B    

In [7]:
#to store theta for all three class labels
all_theta = np.zeros((3, n + 1))

#One vs all
i = 0
for label in class_labels:
    #set the labels in 0 and 1
    tmp_y = np.array(y_train == label, dtype = int)
    optTheta = logisticRegression(X_train, tmp_y, np.zeros((n + 1,1)),10000)
    all_theta[i] = list(optTheta)
    i += 1

In [8]:
#Calculate probability for each label
prob = sigmoid(X_test.dot(all_theta.T))
#Calculate predicted label
y_pred = [class_labels[np.argmax(prob[i, :])] for i in range(X_test.shape[0])]
prob = y_pred

In [9]:
#Calculate accuracy of the model
accuracy = accuracy_score(y_test,y_pred,normalize=True, sample_weight=None) * 100
print("Accuracy: ", accuracy, "%")

Accuracy:  79.61595273264402 %


In [10]:
#Calculate Precision of the model
precision = precision_score(y_test,y_pred,sample_weight=None)
print("Precision: ", precision)

Precision:  0.7961595273264401


In [11]:
#Calculate recall of the model
recall = recall_score(y_test,y_pred,sample_weight=None)
print("Recall: ", recall)

Recall:  1.0


In [12]:
#Calculate F1 score of the model
f1 = f1_score(y_test,y_pred,sample_weight=None)
print("F1 score: ", f1)

F1 score:  0.8865131578947368


In [13]:
#Calculate AUC
auc = roc_auc_score(y_test, prob)
print("AUC: ",auc)

AUC:  0.5
