# IN THIS NOTEBOOK WE ARE GOING TO PREDICT THE TYPE OF CANCER: BENIGN (B) OR MALIGNANT (M)
1. [Getting and preparing the data](#1)
    * [Basic Visualization](#2)
    * [Outlier Detection](#3)
    * [Train Test Split](#4)
1. [BASIC KNN](#5) 
1. [PCA](#6)
    * [Visualization](#7)
    * [Find the Wrong Decision](#8)
1. [Logistic Regression](#9)  
1. [Hyperparameter Tuning](#10)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input folder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LETS FIRST IMPORT LIBRARIES

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis,LocalOutlierFactor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

<a id = "1"></a>
# Getting and preparing the Data

In [None]:
# Load the breast-cancer-wisconsin CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Show how many rows carry each raw "diagnosis" label.
data["diagnosis"].value_counts()

Let's rename the "diagnosis" column to "type" so that it is easier to understand.

In [None]:
# Remove the two columns that are not used as features, rename the label column,
# and encode it as 1 for "M" and 0 for any other value.
data.drop(columns = ["id", "Unnamed: 32"], inplace = True)
data.rename(columns = {"diagnosis": "type"}, inplace = True)
data["type"] = [int(label.strip() == "M") for label in data["type"]]
y = data["type"]
data.head()

<a id = "2"></a>
> ### Lets visualize and see the numbers

In [None]:
# Plot the class balance and print the exact counts.
# The column is named through the x keyword: recent seaborn releases only accept
# plot variables as keywords and would not treat a positional Series as x.
sns.countplot(x = "type", data = data)
print(data["type"].value_counts())

> ### Lets see the correlation

In [None]:
# Pairwise correlation of every column (label included), drawn as an annotated clustermap.
corr_matrix = data.corr()
sns.clustermap(
    corr_matrix,
    figsize = (15, 15),
    annot = True,
    fmt = ".2f",
)
plt.title("Correlation Between Features")
plt.show()

> ### Above are the pairwise correlations between the features

## Lets look at another map with a threshold(restriction)

In [None]:
threshold = 0.5
# Keep the columns whose absolute correlation with 'type' exceeds the threshold.
# abs() must wrap the correlation values, not the boolean comparison result;
# otherwise strongly negative correlations are silently left out.
filter1 = np.abs(corr_matrix["type"]) > threshold
corr_features = corr_matrix.columns[filter1].tolist()
sns.clustermap(data[corr_features].corr(),annot = True,fmt = ".2f",figsize = (10,10))
# Build the title from the threshold so the two can never disagree.
plt.title("Correlation Between Features With Threshold {}".format(threshold))
plt.show()

> ### Box Plot

In [None]:
# Reshape to long format (one row per feature value) so that every feature
# can be drawn in a single box plot, split by class.
data_melted = data.melt(id_vars = "type", var_name = "features", value_name = "value")
plt.figure()
sns.boxplot(data = data_melted, x = "features", y = "value", hue = "type")
plt.xticks(rotation = 90)
plt.show()

> ### Pair Plot

In [None]:
# Pairwise scatter plots of the selected columns, coloured by class,
# with kernel density estimates on the diagonal.
sns.pairplot(
    data[corr_features],
    hue = "type",
    diag_kind = "kde",
    markers = "+",
)
plt.show()

<a id = "3"></a>
# OUTLIER DETECTION

In [None]:
# Separate the label from the features, then score every sample with Local Outlier Factor.
y = data["type"]
x = data.drop(columns = ["type"])
columns = list(x.columns)

clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x)
x_score = clf.negative_outlier_factor_

# One row per sample, holding its negative outlier factor
outlier_score = pd.DataFrame({"score": x_score})

In [None]:
# Display the per-sample labels that LocalOutlierFactor.fit_predict returned.
y_pred

### -1's are the outliers

In [None]:
# Show the five samples with the lowest (most negative) outlier scores.
outlier_score.sort_values(by = ["score"],ascending = True).head()

In [None]:
# Samples whose score falls below this cut-off are treated as outliers.
threshold2 = -2.5
filter2 = outlier_score["score"].lt(threshold2)
outlier_index = outlier_score.index[filter2].tolist()

### Visualize

In [None]:
# Scatter of the first two feature columns: flagged outliers in blue, all samples
# in black, and a red ring per sample whose area grows with its normalised score.
first_feature = x.iloc[:, 0]
second_feature = x.iloc[:, 1]
radius = (x_score.max() - x_score) / (x_score.max() - x_score.min())

plt.figure()
plt.scatter(x.iloc[outlier_index, 0], x.iloc[outlier_index, 1],
            color = "blue", s = 50, label = "Outliers")
plt.scatter(first_feature, second_feature, color = "k", s = 3, label = "Data Points")
plt.scatter(first_feature, second_feature, s = 1000 * radius,
            edgecolors = "r", facecolors = "none", label = "Outlier Scores")
plt.legend()
plt.show()


### Here we observe a single outlier, which is not surprising: we set the threshold to -2.5 and we already know that only one point lies below it.

In [None]:
# Remove the flagged rows from both the features and the labels;
# from here on the labels are held as a plain NumPy array.
x = x.drop(index = outlier_index)
y = y.drop(index = outlier_index).to_numpy()

In [None]:
# Boolean mask of the columns whose absolute correlation with 'type' exceeds the threshold.
# abs() is applied to the correlation values themselves, not to the comparison result.
np.abs(corr_matrix["type"]) > threshold


<a id = "4"></a>
## We split the data so that we can test it.

y_TEST = y[450:]
y = y[:450]
data_test = data.loc[450:,:]
x = data.loc[:449,:]

### We have created the test data, which we will use at the end.

## Now lets split the data with train_test_split method

In [None]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible.
test_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = test_size,
    random_state = 42,
)

In [None]:
# Standardise the features with statistics learned from the training split only,
# then rebuild a labelled DataFrame of the scaled training data for plotting.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_train_df = pd.DataFrame(x_train, columns = columns).assign(type = y_train)

In [None]:
# Box plot of the standardised training features, split by class.
data_melted = x_train_df.melt(id_vars = "type", var_name = "features", value_name = "value")
plt.figure()
sns.boxplot(
    data = data_melted,
    x = "features",
    y = "value",
    hue = "type",
)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Pair plot of the selected columns, restricted to the first 100 scaled training rows.
pairplot_sample = x_train_df[corr_features].head(100)
sns.pairplot(pairplot_sample, hue = "type", diag_kind = "kde", markers = "+")
plt.show()

<a id = "5"></a>
## Basic KNN

In [None]:
# Baseline: a 2-nearest-neighbour classifier evaluated on the held-out split.
knn = KNeighborsClassifier(n_neighbors = 2).fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Accuracy is computed twice on purpose: once through the estimator's own score()
# and once from the predictions, next to the confusion matrix.
score = knn.score(x_test, y_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Score", score)
print("CM", cm)
print("Basic KNN acc", acc)


In [None]:
"""
[[108   1]
 [  7  55]]
"""

In [None]:
def knn_best_parameters(x_train,x_test,y_train,y_test):
    """Grid-search KNN hyperparameters and report train/test performance.

    Searches n_neighbors in 1..30 and both weighting schemes with 10-fold
    cross-validated accuracy on the training data, prints the best
    cross-validation score and parameters, then fits a fresh classifier with
    those parameters and prints its accuracy and confusion matrix on both
    the test and the training data.

    Parameters:
        x_train, y_train: training features and labels.
        x_test, y_test: held-out features and labels.

    Returns:
        The fitted GridSearchCV object.
    """
    search_space = {
        "n_neighbors": list(range(1, 31)),
        "weights": ["uniform", "distance"],
    }

    grid = GridSearchCV(KNeighborsClassifier(), search_space, cv = 10, scoring = "accuracy")
    grid.fit(x_train, y_train)
    print("Best training score: {} with parameters: {}".format(grid.best_score_,grid.best_params_))

    # Fit a separate classifier configured with the winning parameters.
    best_knn = KNeighborsClassifier(**grid.best_params_).fit(x_train, y_train)

    test_predictions = best_knn.predict(x_test)
    train_predictions = best_knn.predict(x_train)

    test_accuracy = accuracy_score(y_test, test_predictions)
    train_accuracy = accuracy_score(y_train, train_predictions)

    print("Test Score: {}, Train Score: {}".format(test_accuracy, train_accuracy))
    print("CM Test : {}".format(confusion_matrix(y_test, test_predictions)))
    print("CM Train : {}".format(confusion_matrix(y_train, train_predictions)))

    return grid

grid = knn_best_parameters(x_train,x_test,y_train,y_test)

### > We found that the best parameters for training would be n_neighbors = 4, and the weights  = "uniform"

<a id = "6"></a>
## PCA

In [None]:
# Standardise the full feature matrix and project it onto two principal components.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

pca = PCA(n_components = 2).fit(x_scaled)
x_reduced_pca = pca.transform(x_scaled)

# Two-column frame of the projection plus the label, used only for plotting
pca_data = pd.DataFrame(x_reduced_pca, columns = ["p1", "p2"])
pca_data["target"] = y

sns.scatterplot(data = pca_data, x = "p1", y = "p2", hue = "target")
plt.title("PCA: p1 vs p2")

# Repeat the KNN grid search on the two-dimensional projection.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(
    x_reduced_pca,
    y,
    test_size = test_size,
    random_state = 42,
)
grid_pca = knn_best_parameters(x_train_pca, x_test_pca, y_train_pca, y_test_pca)



<a id = "7"></a>
## Visualization

In [None]:
# Decision regions of the tuned KNN model in the 2-D PCA space,
# with the projected samples drawn on top.
cmap_light = ListedColormap(["orange","cornflowerblue"])
cmap_bold = ListedColormap(["darkorange","darkblue"])

h = .05  # step size in the mesh
X = x_reduced_pca
x_min,x_max = X[:,0].min() - 1,X[:,0].max() + 1
y_min,y_max = X[:,1].min() - 1,X[:,1].max() + 1
# The horizontal axis has to run from x_min to x_max. Ending it at y_max would
# truncate or over-extend the mesh, and therefore the plot limits set below.
xx,yy = np.meshgrid(np.arange(x_min, x_max, h),
                  np.arange(y_min, y_max, h))
Z = grid_pca.predict(np.c_[xx.ravel(),yy.ravel()])

# Put the results into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx,yy,Z,cmap = cmap_light)

# Plot the projected samples as well
plt.scatter(X[:, 0 ],X[:, 1], c= y,cmap = cmap_bold,
           edgecolor = "k",s = 20)
plt.xlim(xx.min(),xx.max())
plt.ylim(yy.min(),yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
         % (len(np.unique(y)),grid_pca.best_estimator_.n_neighbors,grid_pca.best_estimator_.weights))





In [None]:
# Neighborhood Components Analysis is supervised, so the labels are passed to fit().
nca = NeighborhoodComponentsAnalysis(n_components = 2, random_state = 42).fit(x_scaled, y)
x_reduced_nca = nca.transform(x_scaled)

# Two-column frame of the projection plus the label, used only for plotting
nca_data = pd.DataFrame(x_reduced_nca, columns = ["p1", "p2"])
nca_data["target"] = y

sns.scatterplot(data = nca_data, x = "p1", y = "p2", hue = "target")
plt.title("NCA: p1 vs p2")

In [None]:
# Split the NCA projection with the same test size and seed as before,
# then repeat the KNN grid search on it.
x_train_nca, x_test_nca, y_train_nca, y_test_nca = train_test_split(
    x_reduced_nca,
    y,
    test_size = test_size,
    random_state = 42,
)

grid_nca = knn_best_parameters(x_train_nca, x_test_nca, y_train_nca, y_test_nca)

In [None]:
# Decision regions of the tuned KNN model in the 2-D NCA space,
# with the projected samples drawn on top.
cmap_light = ListedColormap(['orange', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .4  # spacing between neighbouring mesh points
X = x_reduced_nca

# Pad the data range by one unit on every side before building the mesh.
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every mesh point and fold the predictions back into the mesh shape.
Z = grid_nca.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the projected samples, coloured by their label.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),
             grid_nca.best_estimator_.n_neighbors,
             grid_nca.best_estimator_.weights))


<a id = "8"></a>
## Find the wrong decision

In [None]:
# Fit KNN with the parameters selected on the NCA projection and predict the held-out points.
knn = KNeighborsClassifier(**grid_nca.best_params_)
knn.fit(x_train_nca, y_train_nca)
y_pred_nca = knn.predict(x_test_nca)
acc_test_nca = accuracy_score(y_pred_nca, y_test_nca)
knn.score(x_test_nca, y_test_nca)

# Collect coordinates, predictions and true labels of the held-out points in one frame.
test_data = pd.DataFrame({
    "x_test_nca_p1": x_test_nca[:, 0],
    "x_test_nca_p2": x_test_nca[:, 1],
    "y_pred_nca": y_pred_nca,
    "y_test_nca": y_test_nca,
})

plt.figure()
sns.scatterplot(data = test_data, x = "x_test_nca_p1", y = "x_test_nca_p2", hue = "y_test_nca")

# Positions of the held-out points whose prediction differs from the true label,
# highlighted with a large translucent red marker.
diff = np.flatnonzero(y_pred_nca != y_test_nca)
plt.scatter(test_data.iloc[diff, 0], test_data.iloc[diff, 1],
            label = "Wrong Classified", alpha = 0.2, color = "red", s = 1000)



<a id = "9"></a>
# Logistic Regression 

In [None]:
# Baseline logistic regression on the standardised split; accuracies are printed in percent.
lr = LogisticRegression().fit(x_train, y_train)
train_accuracy_pct = round(lr.score(x_train, y_train) * 100, 2)
test_accuracy_pct = round(lr.score(x_test, y_test) * 100, 2)
print("Training Acc", train_accuracy_pct)
print("Test acc", test_accuracy_pct)

<a id = "10"></a>
# HYPERPARAMETER TUNING 
* Decisiontree
* SVM
* Random Forest
* KNN
* Logistic Regression


In [None]:
# Estimators to tune and, in the same order, the parameter grid searched for each one.
random_state = 42
classifier = [DecisionTreeClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             LogisticRegression(random_state = random_state),
             KNeighborsClassifier()]

dt_grid = {"min_samples_split":range(10,500,20),
          "max_depth":range(1,20,2)}

svc_grid = {"kernel":["rbf"],
           "gamma":[0.001,0.01,0.1,1],
           "C": [1,10,50,100,200,300,1000]}

rf_grid = {"max_features":[1,3,10],
          "min_samples_split":[2,3,10],
          "min_samples_leaf":[1,3,10],
          "bootstrap":[False],
          "n_estimators":[100,300],
          "criterion":["gini"]}

# The default logistic-regression solver does not support the "l1" penalty, so
# every "l1" candidate would fail to fit. liblinear handles both penalties,
# which makes the whole grid actually evaluable.
lr_grid = {"C":np.logspace(-3,3,7),
          "penalty":["l1","l2"],
          "solver":["liblinear"]}

knn_grid = {"n_neighbors":np.linspace(1,19,10,dtype = int),
           "weights":["uniform","distance"],
           "metric":["euclidean","manhattan"]}

# Must stay aligned, position by position, with the `classifier` list.
classifier_param = [dt_grid,
                   svc_grid,
                   rf_grid,
                   lr_grid,
                   knn_grid]


In [None]:
# Tune every estimator against its own grid with stratified 10-fold cross-validation,
# keeping the best mean accuracy and the refitted best estimator of each search.
cv_results = []
best_estimators = []
for estimator, param_grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid = param_grid,
        cv = StratifiedKFold(n_splits = 10),
        scoring = "accuracy",
        n_jobs = -1,
        verbose = 1,
    )
    clf.fit(x_train, y_train)
    cv_results.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)
    

In [None]:

# Pair every best cross-validation score with the name of the model that produced it.
# Note: this rebinds cv_results from a list of scores to a DataFrame.
model_names = [
    "DecisionTreeClassifier",
    "SVM",
    "RandomForestClassifier",
    "LogisticRegression",
    "KNeighborsClassifier",
]
cv_results = pd.DataFrame({"Cross Validation Means": cv_results, "ML Models": model_names})



In [None]:
# Horizontal bar chart of the best cross-validation accuracy of each model.
# x and y are passed by keyword: recent seaborn releases reject them as
# positional arguments.
g = sns.barplot(x = "Cross Validation Means", y = "ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")