In [99]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from statistics import mean

In [14]:
# Reading the CSV file by specifying the column names 
colnames = ['AGE','YEAR', 'NODES_DETECTED', 'SURVIVAL_STATUS']   
data = pd.read_csv('a1_data.csv',names=colnames, header=None)

In [16]:
# checking for missing values 
data.isna().sum()

AGE                2
YEAR               1
NODES_DETECTED     2
SURVIVAL_STATUS    0
dtype: int64

In [17]:
data['AGE'].fillna(data['AGE'].quantile(0.5), inplace=True)

In [18]:
data['YEAR'].fillna(data['YEAR'].loc[0], inplace=True)
data['NODES_DETECTED'].fillna(data['NODES_DETECTED'].loc[0], inplace=True)

In [19]:
data.describe()

Unnamed: 0,AGE,YEAR,NODES_DETECTED,SURVIVAL_STATUS
count,306.0,306.0,306.0,306.0
mean,52.555556,62.846405,4.026144,1.264706
std,10.729086,3.245062,7.189197,0.441899
min,30.0,58.0,0.0,1.0
25%,44.0,60.0,0.0,1.0
50%,52.0,63.0,1.0,1.0
75%,60.75,65.0,4.0,2.0
max,83.0,69.0,52.0,2.0


In [20]:
X = data[['AGE', 'YEAR', 'NODES_DETECTED']]
Y = data['SURVIVAL_STATUS']

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

**LOGISTICS REGRESSION**

In [31]:
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression()

In [32]:
y_pred = clf.predict(x_test)

In [33]:
# CLASSIFICATION MATRICS 
print(accuracy_score(y_test, y_pred))  
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.6753246753246753
0.6712328767123288
0.98
0.7967479674796748


In [34]:
# Confusion matrix for test set 
confusion_matrix(y_test, y_pred)

array([[49,  1],
       [24,  3]], dtype=int64)

In [35]:
# Cross Validation for 12 different Train-Test set 
print(cross_val_score(clf,X, Y, cv=12, scoring='accuracy'))

[0.73076923 0.76923077 0.73076923 0.80769231 0.73076923 0.73076923
 0.76       0.68       0.8        0.72       0.8        0.68      ]


Maximum able to get 80% accuracy with Logistics Regression model 

**K NEAREST NEIGHBOUR**

In [139]:
knn = KNeighborsClassifier(n_neighbors=4)

In [140]:
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [141]:
y_pred = knn.predict(x_test)

In [142]:
confusion_matrix(y_test, y_pred)

array([[49,  1],
       [21,  6]], dtype=int64)

In [144]:
print(cross_val_score(knn,X, Y, cv=12, scoring='accuracy'))

[0.76923077 0.73076923 0.53846154 0.65384615 0.73076923 0.76923077
 0.76       0.76       0.84       0.68       0.72       0.68      ]


In [145]:
mean(cross_val_score(knn,X, Y, cv=12, scoring='accuracy'))

0.7193589743589743

Maximum able to get 84% accuracy with KNN model and with average accuracy of 71.93% 

**SUPPORT VECTOR MACHINE**

In [92]:
clf_svm = svm.SVC(kernel='linear')

In [94]:
clf_svm.fit(x_train, y_train)

SVC(kernel='linear')

In [96]:
y_pred = clf.predict(x_test)

In [97]:
confusion_matrix(y_test, y_pred)

array([[49,  1],
       [24,  3]], dtype=int64)

In [98]:
print(cross_val_score(clf_svm,X, Y, cv=12, scoring='accuracy'))

[0.69230769 0.76923077 0.73076923 0.73076923 0.73076923 0.73076923
 0.72       0.76       0.76       0.68       0.72       0.72      ]


In [146]:
mean(cross_val_score(clf_svm,X, Y, cv=12, scoring='accuracy'))

0.7287179487179487

Maximum able to get 73.0% accuracy with KNN model and with average accuracy of 72.87% 