In [1]:
# Running K-nearest neighbour, Naive bayes and Support Vector Machine algorithms on 'lung_cancer_examples.csv'
# file from kaggle.com data sets.

# importing libraries and file to be processed
import numpy as np # numerical python
import pandas as pd # data manipulation,cleaning and analysis
lung=pd.read_csv("/home/samuel/Downloads/analysis_files/lung_cancer_examples.csv") # reading file
print(lung.head())

     Name      Surname  Age  Smokes  AreaQ  Alkhol  Result
0    John         Wick   35       3      5       4       1
1    John  Constantine   27      20      2       5       1
2  Camela     Anderson   30       0      5       2       0
3    Alex       Telles   28       0      8       1       0
4   Diego     Maradona   68       4      5       6       1


In [2]:
# Selecting relevant columns and their rows
x=lung.iloc[:,2:6].values #input label
y=lung.iloc[:,-1].values #output label
#count number of distinct outputs
from collections import Counter
print("The number of classes",Counter(y)) # from output we see the data set is fully balanced

The number of classes Counter({0: 31, 1: 28})


In [3]:
# Splitting data set for training and testing
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
print(lung.describe())

             Age     Smokes      AreaQ     Alkhol     Result
count  59.000000  59.000000  59.000000  59.000000  59.000000
mean   42.627119  15.067797   5.203390   3.237288   0.474576
std    16.235230   7.984607   2.461984   2.380517   0.503640
min    18.000000   0.000000   1.000000   0.000000   0.000000
25%    29.000000  10.000000   3.000000   1.000000   0.000000
50%    39.000000  15.000000   5.000000   3.000000   0.000000
75%    55.000000  20.000000   7.500000   5.000000   1.000000
max    77.000000  34.000000  10.000000   8.000000   1.000000


In [4]:
#KNN algorithm
# we use min max scaler to bring data into a common range

from sklearn import preprocessing
import numpy as np
min_max_scaler=preprocessing.MinMaxScaler()
min_max_scaler.fit(x_train)
x_train=min_max_scaler.transform(x_train)
x_test=min_max_scaler.transform(x_test)

print(x_test)

[[0.62711864 0.58823529 0.         0.5       ]
 [0.28813559 0.58823529 0.44444444 0.125     ]
 [0.74576271 0.44117647 0.44444444 0.625     ]
 [0.6440678  0.58823529 0.22222222 0.375     ]
 [0.96610169 0.44117647 0.22222222 0.625     ]
 [0.13559322 0.29411765 0.66666667 0.25      ]
 [0.62711864 0.44117647 0.         0.375     ]
 [0.20338983 0.         0.44444444 0.25      ]
 [0.49152542 0.44117647 0.         1.        ]
 [0.25423729 0.11764706 0.77777778 0.        ]
 [0.54237288 0.58823529 0.11111111 0.5       ]
 [0.25423729 0.73529412 0.77777778 0.25      ]
 [0.01694915 0.35294118 0.77777778 0.        ]
 [0.28813559 0.08823529 0.44444444 0.5       ]
 [0.27118644 0.         1.         0.        ]
 [0.57627119 0.52941176 0.33333333 0.625     ]
 [0.40677966 0.64705882 0.22222222 0.625     ]
 [0.22033898 0.58823529 0.88888889 0.5       ]]


In [5]:
# setting value of 'k', training and running prediction

from sklearn.neighbors import KNeighborsClassifier
model1=KNeighborsClassifier(n_neighbors=5)
model1.fit(x_train,y_train) # training
y_pred1=model1.predict(x_test) # testing
print(y_pred1)

[1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0]


In [6]:
# Performance evaluation
# Even if the dataset is fully balanced, we'll also print the classification report along with accuracy of algorithm
# and confusion matrix for the data set

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
acc=accuracy_score(y_test,y_pred1)
matrix=confusion_matrix(y_test,y_pred1)
report=classification_report(y_test,y_pred1)
print(acc)
print(matrix)
print(report)

0.9444444444444444
[[8 0]
 [1 9]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.90      0.95        10

    accuracy                           0.94        18
   macro avg       0.94      0.95      0.94        18
weighted avg       0.95      0.94      0.94        18



In [7]:
# Naive Bayes algorithm

from sklearn.naive_bayes import GaussianNB # we use gaussian as the input labels are continious not categorical
model2=GaussianNB()
model2.fit(x_train,y_train) # training
y_pred2=model2.predict(x_test) # testing
print(y_pred2)

[1 0 1 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0]


In [8]:
# Performance evaluation

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
acc=accuracy_score(y_test,y_pred2)
matrix=confusion_matrix(y_test,y_pred2)
report=classification_report(y_test,y_pred2)
print(acc)
print(matrix)
print(report)

1.0
[[ 8  0]
 [ 0 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        10

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [9]:
# SVM algorithm
# Using Standard scaler to bring values to a common range.

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [10]:
# Importing SVM alg.
from sklearn.svm import SVC
classifier=SVC()
classifier.fit(x_train,y_train) #training
y_pred3=classifier.predict(x_test) #testing

In [11]:
# Performance evaluation
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
acc=accuracy_score(y_test,y_pred3)
matrix=confusion_matrix(y_test,y_pred3)
report=classification_report(y_test,y_pred3)
print(acc)
print(matrix)
print(report)

0.9444444444444444
[[8 0]
 [1 9]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.90      0.95        10

    accuracy                           0.94        18
   macro avg       0.94      0.95      0.94        18
weighted avg       0.95      0.94      0.94        18

