In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC 
 
import warnings
warnings.filterwarnings("ignore")

## Data preprocessing

In [98]:
#Loading the dataset using pandas into a dataframe
df = pd.read_csv("/Users/nithya/Downloads/breast-cancer-wisconsin.csv", header = None,na_values='?')
df.head(25)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [99]:
#Filling the null values with the mean
df = df.fillna(df.mean())

In [100]:
df.head(25)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [103]:
feature_columns = [1,2,3,4,5,6,7,8,9] #Column 0 is ommited as it is an id column and does not contribute in determining the results(no predictive power) 
X = df[feature_columns] 
y = df[10]

## Prepocessed data

In [104]:
X

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1
695,2,1,1,1,2,1.0,1,1,1
696,5,10,10,3,7,3.0,8,10,2
697,4,8,6,4,3,4.0,10,6,1


In [105]:
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: 10, Length: 699, dtype: int64

In [110]:
#Splitting to training and testing with a train size of 0.75 and test size of 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100, train_size = .75)

#Converting to Numpy as Numpy arrays are more compact than python lists, which uses less memory
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

## Decision tree

In [117]:
#Using Decision Tree Classification 
clf = tree.DecisionTreeClassifier(random_state = 1)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#Accuracy calculation
score_dtree = metrics.accuracy_score(y_test, y_pred)
print('Accuracy using Decision Tree Classifier: ', score_dtree)

#Performing k fold cross validation with k as 10
s1 = cross_val_score(clf, X_train, y_train, cv = 10)
s1.mean()
print('Accuracy using 10-fold cross validation: ', s1.mean())

Accuracy using Decision Tree Classifier:  0.9257142857142857
Accuracy using 10-fold cross validation:  0.9522496371552975


## Random forest



In [122]:
#Performing Random Forest Classification with n_estimator value as 10
rf = RandomForestClassifier(n_estimators = 10,random_state = 1)
rf.fit(X_train, y_train)
rf.predict(X_test)

#Performing k fold cross validation with k as 10
s2 = cross_val_score(rf, X_train, y_train, cv = 10)
s2.mean()
print('Accuracy using 10-fold cross validation: ', s2.mean())

Accuracy using 10-fold cross validation:  0.9637518142235123


### Random forest accuracy is slightly higher than decision tree.

## KNN


In [225]:
#Performing K Neighbors Classification with n_neighbors as 10
knn = KNeighborsClassifier(n_neighbors = 10)
knn = knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

#Accuracy calculation
score_knn = metrics.accuracy_score(y_test, y_pred)
print('Accuracy using KNeighborsClassifier: ', score_knn)

#Performing k fold cross validation with k as 10
s3 = cross_val_score(knn, X_train, y_train, cv = 10)
s3.mean()
print('Accuracy using 10-fold cross validation: ', s3.mean())

Accuracy using KNeighborsClassifier:  0.9371428571428572
Accuracy using 10-fold cross validation:  0.9733309143686503


In [235]:
#Running for loop to run KNN with K values ranging from 1 to 50
s3List = []
for i in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn = knn.fit(X_train, y_train)
    s_3 = cross_val_score(knn, X_train, y_train, cv=10)
    print(s_3.mean())
    s3List.append(s_3.mean())

#Prints Max Value in the s3 mean list
print('\nMax Accuracy: ', max(s3List)) 
#Finds the index of that max value in that list which is the best suited k nearest neighbor value
print('Best K value: ', s3List.index(max(s3List))) 
  

0.9656386066763425
0.9542089985486213
0.9694484760522496
0.9656386066763425
0.9713715529753264
0.9694847605224964
0.973367198838897
0.9733309143686503
0.9752539912917271
0.9733309143686503
0.9752539912917271
0.9752177068214805
0.9771407837445573
0.9771407837445573
0.9771407837445573
0.9752177068214802
0.9752177068214802
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9732946298984034
0.9694847605224964
0.9713715529753266
0.9675979680696661
0.9675979680696661
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9675616835994194
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9694847605224964
0.9675616835994194
0.9675616835994194
0.9675616835994194
0.9675616835994194
0.9675616835994194
0.9675616835994194

Max Accuracy:  0.9771407837445573
Best K value:  12


### The value of K doesn't make significant difference as there is only slight increase/decrease in accuracy as K value changes.
### The best performance from KNN using 10-fold cross validation is 0.9771407837445573.

## Naive Bayes

In [243]:
#Using GaussianNB 
clf = GaussianNB()
clf.fit(X_train, y_train)
sk_pred = clf.predict(X_test)

#Accuracy calculation
score_gnb = metrics.accuracy_score(y_test, sk_pred)
print('Accuracy: ', score_gnb)

#Performing k fold cross validation with k as 10
s4 = cross_val_score(clf, X_train, y_train, cv = 10)
s4.mean()
print('Accuracy using 10-fold cross validation: ', s4.mean())

Accuracy:  0.9371428571428572
Accuracy using 10-fold cross validation:  0.9675979680696661


In [251]:
#Using MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
sk_pred = clf.predict(X_test)

#Accuracy calculation
score_mnb = metrics.accuracy_score(y_test, sk_pred)
print('Accuracy: ', score_mnb)

#Performing k fold cross validation with k as 10
s5 = cross_val_score(clf, X_train, y_train, cv = 10)
s5.mean()
print('Accuracy using 10-fold cross validation: ', s5.mean())

Accuracy:  0.8685714285714285
Accuracy using 10-fold cross validation:  0.8988388969521044


### GaussianNB performs better than MultinomialNB.

In [256]:
#Printing accuracy of all the Models in Step 2 after perfoming 10 fold cross validation
print('Accuracy - Decision Tree: ', s1.mean())
print('Accuracy - Random Forest: ', s2.mean())
print('Accuracy - K Nearest Neighbors: ', s3.mean())
print('Accuracy - Gaussian Naive Bayes: ', s4.mean())
print('Accuracy - Multinomial Naive Bayes: ', s5.mean())

Accuracy - Decision Tree:  0.9522496371552975
Accuracy - Random Forest:  0.9637518142235123
Accuracy - K Nearest Neighbors:  0.9733309143686503
Accuracy - Gaussian Naive Bayes:  0.9675979680696661
Accuracy - Multinomial Naive Bayes:  0.8988388969521044


### Conclusion: From all the models , KNN has highest accuracy and MultinomialNB has low accuracy.

## SVM

In [264]:
#svm
svm_model = SVC(random_state = 10)
svm_model.fit(X_train, y_train)
y_predict =svm_model.predict(X_test)

#Accuracy calculation
score_svm = metrics.accuracy_score(y_test, y_predict)
print('Accuracy: ', score_svm)

#Performing k fold cross validation with k as 10
s6 = cross_val_score(svm_model , X_train, y_train, cv = 10)
s6.mean()
print('Accuracy using 10-fold cross validation: ', s6.mean())

Accuracy:  0.9485714285714286
Accuracy using 10-fold cross validation:  0.9752539912917271


In [273]:
#Using For Loop to run SVM with Linear, Sigmoid and Poly Kernals

accList = []
for i in ('linear','sigmoid','poly'):
    svm_model2 = SVC(kernel=i,random_state = 42)
    svm_model2.fit(X_train, y_train)
    y_predict = svm_model2.predict(X_test)
    score = metrics.accuracy_score(y_test, y_predict)
    s8 = cross_val_score(svm_model2 , X_train, y_train, cv = 10)
    print(i, ': ', s8.mean())
    accList.append(s8.mean())

print('\nSVM best value found for linear SVM: ', max(accList))

linear :  0.9714078374455732
sigmoid :  0.49408563134978223
poly :  0.961865021770682

SVM best value found for linear SVM:  0.9714078374455732


### Linear kernel performs best using 10-fold cross validation with accuracy of 0.9714078374455732.