In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
dat = pd.read_csv('data.csv')

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
dat.head()

In [None]:
# Number of missing values in each column.
dat.isna().sum()

In [None]:
# Remove the two columns that are not used for modelling in a single call.
dat = dat.drop(columns=['id', 'Unnamed: 32'])

In [None]:
# Class balance of the label column.
dat['diagnosis'].value_counts()

In [None]:
# Encode the label as an integer: 'M' -> 1, any other value -> 0.
dat['diagnosis'] = dat['diagnosis'].eq('M').astype('int64')

In [None]:
# Target vector: the encoded diagnosis label.
Y = dat['diagnosis'].values

# Feature matrix: every column except the label, selected by name.
# The previous positional slice `iloc[:, 2:31]` started at column 2; assuming
# the CSV is laid out as id, diagnosis, <features...>, the label is column 0
# once 'id' has been dropped, so that slice silently left out the first
# feature column.
X = dat.drop(columns=['diagnosis']).values

In [None]:
# Seed both Python's and NumPy's global RNGs.  scikit-learn draws from NumPy's
# global RNG whenever an estimator is left with random_state=None, and
# `random.seed` alone does not affect that generator.
random.seed(2022)
np.random.seed(2022)

# Hold out 20% of the rows for testing; the split itself is pinned by
# random_state and does not depend on the global seeds above.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Dimensions of the training feature matrix after the split.
X_train.shape

##### standardize variables

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test data never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

##### KNN

In [None]:
def _knn_test_error(k):
    """Fit a KNN classifier with `k` neighbours and return its test-split error rate."""
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    return np.mean(model.predict(X_test) != Y_test)


# Misclassification rate for every K from 1 through 39 (range(1, 40) stops
# before 40).
# NOTE(review): K is being compared on the test split, so test metrics reported
# later for the chosen K are optimistic -- consider cross-validating on the
# training split instead.
error = [_knn_test_error(k) for k in range(1, 40)]

In [None]:
# Plot the test error against K.  The x-values are derived from len(error)
# (K starts at 1) instead of repeating the loop's hard-coded range, so the plot
# cannot fall out of step with the sweep and raise a shape-mismatch error.
k_values = range(1, len(error) + 1)

plt.figure(figsize=(12, 6))
plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate vs K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

Fit the final KNN model with k = 6, the value read off the error-rate plot above.

In [None]:
# Train the final KNN classifier with K = 6 and predict the test-set labels.
knn_model = KNeighborsClassifier(n_neighbors=6).fit(X_train, Y_train)
pred = knn_model.predict(X_test)

In [None]:
# Confusion matrix of the KNN test predictions (rows: true label, columns:
# predicted label).  fmt='d' prints the integer counts as-is; seaborn's default
# '.2g' annotation format would show any count of 100 or more in scientific
# notation.
sns.heatmap(confusion_matrix(Y_test, pred), annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('KNN confusion matrix')
plt.show()

In [None]:
# Overall accuracy, followed by the per-class precision / recall / F1 report
# for the KNN predictions.
knn_accuracy = accuracy_score(Y_test, pred)
knn_report = classification_report(Y_test, pred)
print(knn_accuracy)
print(knn_report)

In [None]:
# ROC curve for KNN, scored with the second column of predict_proba
# (the probability assigned to label 1).
Y_pred_proba = knn_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, Y_pred_proba)
auc = metrics.roc_auc_score(Y_test, Y_pred_proba)

# Draw the curve on an explicit Axes and report the AUC in the legend.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="AUC=" + str(auc))
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend()
ax.set_title('KNN ROC curve')
plt.show()

##### SVM

In [None]:
# Candidate hyper-parameters: every combination of C, gamma and kernel is
# evaluated by the grid search.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'linear', 'poly'],
)

# refit=True retrains the best combination on the full training split once the
# search finishes; verbose=3 logs each individual fit.
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search on the standardized training data.
grid.fit(X_train, Y_train)

In [None]:
# Show the hyper-parameter combination that scored best in the grid search.
print(grid.best_params_)

In [None]:
# Build the SVM from the grid search's winning hyper-parameters instead of
# hand-copied constants, so the model cannot drift out of sync with
# `grid.best_params_` when the data, features or grid change.
# probability=True enables predict_proba, which the ROC cell relies on;
# random_state pins the internal randomness used to produce those probability
# estimates so reruns give the same scores.
svm_model = svm.SVC(**grid.best_params_, probability=True, random_state=42)
svm_model.fit(X_train, Y_train)

In [None]:
# Predict test-set labels with the SVM.  This rebinds `pred`, which earlier
# held the KNN predictions.
pred = svm_model.predict(X_test)

In [None]:
# Confusion matrix of the SVM test predictions (rows: true label, columns:
# predicted label).  fmt='d' prints the integer counts as-is; seaborn's default
# '.2g' annotation format would show any count of 100 or more in scientific
# notation.
sns.heatmap(confusion_matrix(Y_test, pred), annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('SVM confusion matrix')
plt.show()

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order.  Accuracy is
# symmetric, but classification_report is not: passing the predictions first
# swaps precision and recall and computes support from the predicted labels.
print(accuracy_score(Y_test, pred))
print(classification_report(Y_test, pred))

In [None]:
# ROC curve for the SVM, scored with the second column of predict_proba
# (the probability assigned to label 1).
Y_pred_proba = svm_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(Y_test, Y_pred_proba)
auc = metrics.roc_auc_score(Y_test, Y_pred_proba)

# Draw the curve on an explicit Axes and report the AUC in the legend.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="AUC=" + str(auc))
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend()
ax.set_title('SVM ROC curve')
plt.show()