In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

In [None]:
# Read the breast-cancer CSV from the Kaggle input mount and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
)
df.head()

In [None]:
# (rows, columns) of the raw frame, before any column is removed.
df.shape

In [None]:
# List the column names so the columns to drop in the next step can be identified.
df.columns

In [None]:
# Remove columns that are not used as features: the record identifier 'id' and
# 'Unnamed: 32' (presumably an empty artefact column of the CSV — confirm).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# Shape after removing the unwanted columns.
print('Data Shape',df.shape)

In [None]:
# Column dtypes and non-null counts; used to check for missing values.
df.info()

In [None]:
# Class balance of the target, as fractions of all rows (normalize=True) rather than raw counts.
df['diagnosis'].value_counts(normalize=True)

In [None]:
# Encode the target as integers: 'B' -> 0, 'M' -> 1 (presumably benign / malignant).
# The result is assigned back to the column explicitly. Calling
# `df['diagnosis'].replace(..., inplace=True)` acts on a selected column and does
# not update `df` under pandas Copy-on-Write (it emits a warning on pandas 2.x).
# The numeric-dtype guard makes the cell idempotent: re-running it after the
# column is already encoded is a no-op instead of producing NaNs.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    # astype(int) fails loudly if an unexpected label produced a NaN in map().
    df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1}).astype(int)
df.head()


The correlation matrix shows the relationship between *diagnosis* and the other features. We will choose a correlation threshold, and every feature whose correlation with *diagnosis* falls below it will be dropped.

In [None]:
# Pairwise correlation between all columns of df (pandas' default method is Pearson);
# the 'diagnosis' column of this matrix is used below to rank the features.
cor=df.corr()

In [None]:
# Display the transposed correlation matrix. A correlation matrix is symmetric,
# so this should show the same values as `cor` itself.
cor.T

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on an explicit wide axes
# so that all the annotations stay legible.
fig, ax = plt.subplots(figsize=(25, 12))
sns.heatmap(cor, annot=True, cmap='OrRd', ax=ax)
plt.show()

**A two-sample t-test could be used to check whether weakly correlated features such as fractal_dimension_mean (which has a negative correlation with the target) are actually informative. This is not mandatory, because non-linear models can handle non-linear relationships very well, so we do not perform this statistical check here.**
**All features are continuous.**
**There is multicollinearity among the features, which will be treated using PCA.**



# Data Analysis

In [None]:
# Keep only the columns whose absolute correlation with the target exceeds the
# cut-off. 'diagnosis' itself always qualifies, since its self-correlation is 1.
threshold = 0.75
f = np.abs(cor["diagnosis"]) > threshold
corr_features = cor.columns[f].tolist()

# Heatmap restricted to the selected, strongly correlated columns.
sns.heatmap(df[corr_features].corr(), annot = True, fmt = ".2f")
# The title is built from `threshold`, so it cannot go stale when the cut-off is
# tuned (it used to hard-code 0.75 and misspelled "Threshold").
plt.title("Correlation Between Features with Corr Threshold {}".format(threshold))
plt.show()

# Decision tree and Logistic Regression

In [None]:

from sklearn.model_selection import KFold,cross_val_score

from sklearn import model_selection

In [None]:
# Split the frame into the target vector (encoded diagnosis) and the feature
# matrix (every remaining column).
y = df['diagnosis']
x = df.drop(columns=['diagnosis'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



In [None]:
# Candidate models for the cross-validated comparison below.
# max_iter is raised above scikit-learn's default of 100: on these unscaled
# features the default solver typically stops before converging
# (ConvergenceWarning), which makes the reported AUC depend on an unfinished fit.
# NOTE(review): scaling the features inside a Pipeline would be a more thorough fix.
LR = LogisticRegression(max_iter=10000)
# Decision tree split on information gain (entropy); seeded for repeatable trees.
DT = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [None]:
# (display name, estimator) pairs consumed by the evaluation loop.
models = [
    ('Logistic', LR),
    ('DecesionTree', DT),
]

In [None]:
# Score each candidate with the same shuffled 5-fold split, using ROC AUC.
# The splitter is seeded with an int, so every model sees identical folds.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

results = []
names = []
for name, model in models:
    cv_result = model_selection.cross_val_score(model, x, y, cv=kfold, scoring='roc_auc')
    results.append(cv_result)
    names.append(name)
    # Printed as: name, mean fold AUC, (sample variance of the fold AUCs).
    print('%s: %f (%f)' % (name, cv_result.mean(), cv_result.var(ddof=1)))

# Box plot comparing the per-fold AUC distribution of each algorithm.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Repeat the comparison with the decision tree's split criterion set to 'gini'
# instead of 'entropy'; the logistic regression is unchanged.
# max_iter is raised above scikit-learn's default of 100: on these unscaled
# features the default solver typically stops before converging
# (ConvergenceWarning), which makes the reported AUC depend on an unfinished fit.
LR = LogisticRegression(max_iter=10000)
DT = DecisionTreeClassifier(criterion='gini', random_state=0)

In [None]:
# Rebuild the (display name, estimator) pairs so the loop picks up the new estimators.
models = [
    ('Logistic', LR),
    ('DecesionTree', DT),
]

In [None]:
# Cross-validate every (name, estimator) pair in `models` on identical folds:
# shuffled 5-fold split with a fixed seed, scored by ROC AUC.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

results, names = [], []
for name, model in models:
    cv_result = model_selection.cross_val_score(
        model, x, y, cv=kfold, scoring='roc_auc'
    )
    results.append(cv_result)
    names.append(name)
    # Mean fold AUC, followed by the sample variance of the fold AUCs in parentheses.
    print('%s: %f (%f)' % (name, cv_result.mean(), cv_result.var(ddof=1)))

# Box plot of the per-fold AUC scores, one box per algorithm.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

Using the Gini criterion increases the variance of the decision tree's cross-validated AUC scores, so we will go with entropy.

In [None]:
# Per-fold ROC AUC scores from the most recent evaluation loop
# (one array per model, in the same order as `names`).
results

# Train Test Split

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)


In [None]:
# Report the number of rows in each part of the split.
split_parts = (
    ("X_train", x_train),
    ("X_test", x_test),
    ("Y_train", y_train),
    ("Y_test", y_test),
)
for part_label, part in split_parts:
    print(part_label, len(part))

## Standardization

In [None]:
# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so the test set does not influence the scaler.
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

# K-Nearest Neighbors Classifier

In [None]:
# Baseline KNN with a fixed k=2, fitted on the standardized training split and
# evaluated on the held-out test split.
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# A classifier's .score() is its mean accuracy on the data passed in, so this
# value should equal `accuracy` above.
score = knn.score(x_test, y_test)

# `accuracy` is a fraction in [0, 1]. Format it as a real percentage instead of
# printing the raw fraction after a literal '%' sign (the old output read "% 0.95").
print("Basic KNN Accuracy: {:.2%}".format(accuracy))
print("Score : ", score)
print("CM : ", cm)

In [None]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score,roc_curve,accuracy_score

In [None]:
# Hard class predictions for both splits.
ypred_train, ypred_test = knn.predict(x_train), knn.predict(x_test)

# Predicted probability taken from column 1 of predict_proba (the second class
# in knn.classes_), used as the score for ROC AUC.
yprob_train, yprob_test = (
    knn.predict_proba(split)[:, 1] for split in (x_train, x_test)
)

In [None]:
# Print the confusion matrix, accuracy and ROC AUC for the train split and then
# the test split, separated by blank lines.
knn_reports = (
    ('for train KNN', y_train, ypred_train, yprob_train),
    ('for test KNN', y_test, ypred_test, yprob_test),
)
for position, (heading, truth, predicted, positive_prob) in enumerate(knn_reports):
    if position > 0:
        print('\n')
    print(heading)
    print('confusion matrix', confusion_matrix(truth, predicted))
    print('accuracy score', accuracy_score(truth, predicted))
    print('roc_auc score', roc_auc_score(truth, positive_prob))

In [None]:
# Hyper-parameter tuning: randomized search over KNN settings.
# scipy's randint excludes the upper bound, so n_neighbors is drawn from 1..30
# and the Minkowski power p from 1..9.
param_dist={'n_neighbors':sp_randint(1,31),
           'p':sp_randint(1,10),
           'weights':['uniform','distance']}
# The search gets its own fresh estimator instead of rebinding `knn`. Rebinding
# replaced the fitted baseline model with an untuned default KNN, so any later
# cell that evaluates `knn` before the tuned refit silently scored the wrong model.
# 25 sampled candidates, 3-fold CV, ranked by ROC AUC; seeded for repeatability.
rsearch=RandomizedSearchCV(KNeighborsClassifier(),param_distributions=param_dist,n_iter=25,cv=3,random_state=42,scoring='roc_auc')
rsearch.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score in the search.
rsearch.best_params_

In [None]:
# Mean cross-validated ROC AUC (the search's scoring metric) of that best combination.
rsearch.best_score_

In [None]:
# 5-fold cross-validated ROC AUC, on the training split, of whichever estimator
# is currently bound to `knn`.
print('for knn')
kf = KFold(n_splits=5, shuffle=True, random_state=42)
auc = cross_val_score(knn, x_train, y_train, scoring='roc_auc', cv=kf)
print('auc:', auc)
# "bias error" is reported as the shortfall of the mean AUC from a perfect 1.0.
print('bias error', 1 - auc.mean())
# "variance error" is reported as the sample standard deviation of the fold AUCs.
print('variance error:', auc.std(ddof=1))

In [None]:
# Refit KNN on the training split with the hyper-parameters chosen by the search.
knn = KNeighborsClassifier(**rsearch.best_params_).fit(x_train, y_train)

# Hard predictions, plus column 1 of predict_proba as the ROC AUC score, per split.
ypred_train, ypred_test = knn.predict(x_train), knn.predict(x_test)
yprob_train, yprob_test = (
    knn.predict_proba(split)[:, 1] for split in (x_train, x_test)
)

# Same three metrics for the train split and then the test split, separated by
# blank lines.
tuned_reports = (
    ('for hypertuned  knn train scores', y_train, ypred_train, yprob_train),
    ('for hyper tunes test scores', y_test, ypred_test, yprob_test),
)
for position, (heading, truth, predicted, positive_prob) in enumerate(tuned_reports):
    if position > 0:
        print('\n')
    print(heading)
    print('confusion matrix', confusion_matrix(truth, predicted))
    print('accuracy score', accuracy_score(truth, predicted))
    print('roc_auc score', roc_auc_score(truth, positive_prob))

In [None]:
# Cross-validate the estimator bound to `knn` on the training split:
# shuffled 5-fold split with a fixed seed, scored by ROC AUC.
print('for knn')
kf = KFold(n_splits=5, shuffle=True, random_state=42)
auc = cross_val_score(knn, x_train, y_train, scoring='roc_auc', cv=kf)

print('auc:', auc)
# Shortfall of the mean AUC from 1.0, labelled "bias error".
print('bias error', 1 - auc.mean())
# Sample standard deviation (ddof=1) of the fold AUCs, labelled "variance error".
print('variance error:', auc.std(ddof=1))