In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
# Print the full path of every file found anywhere under the Kaggle input
# directory, so the available datasets are visible in the cell output.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
# Read the dataset CSV into the DataFrame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv',
)

In [None]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Print the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Summary statistics for the numeric columns. Left as the cell's last
# expression (instead of print()) so the notebook renders it as a formatted
# table rather than truncated plain text.
df.describe()

In [None]:
# Split the frame by column position: columns 2..31 become the feature matrix
# and column 1 the target.
# NOTE(review): presumably column 0 is an id, column 1 the diagnosis label and
# any column from 32 onward is unused -- confirm against data.csv.
feature_labels = df.columns[2:32]
target_label = df.columns[1]
X = df.loc[:, feature_labels]
y = df.loc[:, target_label]


In [None]:
# Hold out a test split (default size), stratified on the target so both
# splits keep the same class proportions; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)


In [None]:
# Dimensions of the training feature matrix produced by the split above.
X_train.shape

In [None]:
# Dimensions of the held-out test feature matrix produced by the split above.
X_test.shape

In [None]:
# Fit every classifier on the TRAINING split only; the test split is reserved
# for the evaluation cells that follow.
# NOTE(review): the features are passed unscaled even though MinMaxScaler is
# imported; k-NN and LinearSVC are sensitive to feature scale -- consider
# scaling before fitting.
knr = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Two linear SVMs that differ only in C. In scikit-learn C is the inverse of
# the regularization strength, so svc2 (C=3) is regularized LESS than svc1
# (default C).
svc1 = LinearSVC(random_state=0).fit(X_train, y_train)
svc2 = LinearSVC(C=3, random_state=0).fit(X_train, y_train)

# Fixed: this model used to be fitted on (X_test, y_test), which leaked the
# evaluation data into training and made its test score meaningless.
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

# Seeded so the fitted tree (and therefore its scores) is reproducible across
# Restart & Run All.
dec_tree = DecisionTreeClassifier(random_state=0)
dec_tree.fit(X_train, y_train)

rand_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rand_forest.fit(X_train, y_train)

# Logistic-regression baselines (default C and C=6), currently disabled:
#lr1 = LogisticRegression(random_state=0).fit(X_train, y_train)
#lr2 = LogisticRegression(C=6, random_state=0).fit(X_train, y_train)

In [None]:
# Report the score on the train and test splits for every fitted classifier.
# Each label is the exact text printed between "score " and "- "; the trailing
# space in "naive_bayes " is deliberate so the printed lines stay unchanged.
_fitted_models = (
    ("knr", knr),
    ("svc1", svc1),
    ("svc2", svc2),
    ("naive_bayes ", naive_bayes),
    ("dec_tree", dec_tree),
    ("rand_forest", rand_forest),
)
for _label, _clf in _fitted_models:
    for _split, _features, _target in (
        ("train", X_train, y_train),
        ("test", X_test, y_test),
    ):
        print(_split + " score " + _label + "- " + str(_clf.score(_features, _target)))

In [None]:
# Confusion matrix of the k-NN classifier on the held-out test split.
# (confusion_matrix and seaborn are imported in the notebook's top import cell.)
y_pred = knr.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=knr.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=knr.classes_, yticklabels=knr.classes_, ax=ax)
ax.set_title("Confusion Matrix - KNN")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Confusion matrix of the first linear SVM (svc1) on the held-out test split.
y_pred = svc1.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=svc1.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=svc1.classes_, yticklabels=svc1.classes_, ax=ax)
ax.set_title("Confusion Matrix - SVC1")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Confusion matrix of the second linear SVM (svc2, C=3) on the held-out test
# split.
y_pred = svc2.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=svc2.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=svc2.classes_, yticklabels=svc2.classes_, ax=ax)
ax.set_title("Confusion Matrix - SVC2")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Confusion matrix of the Gaussian naive Bayes classifier on the held-out test
# split.
y_pred = naive_bayes.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=naive_bayes.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=naive_bayes.classes_, yticklabels=naive_bayes.classes_,
            ax=ax)
ax.set_title("Confusion Matrix - Naive Bayes")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Confusion matrix of the decision tree classifier on the held-out test split.
y_pred = dec_tree.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=dec_tree.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=dec_tree.classes_, yticklabels=dec_tree.classes_, ax=ax)
ax.set_title("Confusion Matrix - Decision Tree")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Confusion matrix of the random forest classifier on the held-out test split.
y_pred = rand_forest.predict(X_test)
# Pin the row/column order to the classifier's own class ordering so the tick
# labels below are guaranteed to line up with the matrix cells.
conf_mat = confusion_matrix(y_test, y_pred, labels=rand_forest.classes_)

# Heatmap of the counts: rows are true classes, columns are predicted classes.
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f",
            xticklabels=rand_forest.classes_, yticklabels=rand_forest.classes_,
            ax=ax)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Parallel lists for the comparison chart: display names and the matching
# test-split scores, kept in the same order.
method_names = [
    "Decision Tree",
    "KNN",
    "SVC1",
    "SVC2",
    "Naive Bayes",
    "Random Forest",
]
method_scores = [
    clf.score(X_test, y_test)
    for clf in (dec_tree, knr, svc1, svc2, naive_bayes, rand_forest)
]

In [None]:
# Comparison between different classifiers: one bar per model, bar height is
# the model's score on the held-out test split.
# (matplotlib.pyplot is already imported as plt in the notebook's import cell.)
plt.figure(figsize=(15,10))
plt.bar(method_names, method_scores, width=0.5)
# Optional: the bars may be hard to tell apart on a 0-1 axis; zoom in with
# e.g. plt.ylim([0.85, 1]) if needed.
plt.title('Test Score by Classifier')
plt.xlabel('Method Name')
plt.ylabel('Method Score')
# Explicit show() so the cell renders only the figure, not the Text(...) repr
# returned by the last pyplot call.
plt.show()