In [25]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import math
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load D9.csv into dataframe object
q3 = load_data("dataset/D9.csv")

#have a look at the info of the data
q3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Data columns (total 8 columns):
Unnamed: 0    2254 non-null int64
1             2254 non-null float64
2             2254 non-null float64
3             2254 non-null float64
4             2254 non-null float64
5             2254 non-null float64
6             2254 non-null float64
7             2254 non-null int64
dtypes: float64(6), int64(2)
memory usage: 141.0 KB


In [5]:
#rename the unnamed column as 'id'
q3.rename(columns={'Unnamed: 0':'id'}, inplace=True)

In [10]:
#no. of features
n = 6 
m = q3.shape[0]

X = np.ones((m,n + 1))
y = np.array((m,1))

X[:,1] = q3['1'].values
X[:,2] = q3['2'].values
X[:,3] = q3['3'].values
X[:,4] = q3['4'].values
X[:,5] = q3['5'].values
X[:,6] = q3['6'].values

#Store Labels
y = q3['7'].values

#Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 11)

#Store species labels
class_labels = [0,1]

In [13]:
#Logistic Regression using Library
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

print('Accuracy: ',accuracy_score(y_test,y_pred)*100, "%")

Accuracy:  79.56989247311827 %


In [15]:
"""
Calculate F1 score of the model
Parameters:
tags - list of actual label
predicted - list of label predicted by the model
Returns: F1 score
"""
def F1_score(tags,predicted):
    tags = set(tags)
    predicted = set(predicted)

    tp = len(tags & predicted)
    fp = len(predicted) - tp 
    fn = len(tags) - tp
    if tp>0:
        precision = float(tp) / (tp + fp)
        recall = float(tp) / (tp + fn)
        print("Precision: ", precision, "Recall: ", recall)

        return 2 * ((precision * recall) / (precision + recall))
    else:
        return 0

In [17]:
#Calculate F1 score
f1_score = F1_score(y_test,y_pred)
print("F1 Score: ", f1_score)

Precision:  1.0 Recall:  1.0
F1 Score:  1.0


In [26]:
def precision(y_true, y_pred):
    i = set(y_true).intersection(y_pred)
    len1 = len(y_pred)
    if len1 == 0:
        return 0
    else:
        return len(i) / len1
    
print(precision(y_test, y_pred))

def recall(y_true, y_pred):
    i = set(y_true).intersection(y_pred)
    return len(i) / len(y_true)

print(recall(y_test, y_pred))

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    if p + r == 0:
        return 0
    else:
        return 2 * (p * r) / (p + r)
    
print(f1(y_test, y_pred))

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred)
    roc_auc[i] = auc(fpr[i], tpr[i])

print(roc_auc_score(y_test, y_pred))

0.002688172043010753
0.002688172043010753
0.002688172043010753
0.5128205128205128
