# <center> Breast Cancer Evaluation </center>

## Aim

The aim of this research is to predict the intensity of the cancer based on the given medical parameters. We will be using 6 ML models, and the best of these will be chosen.

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
from sklearn.metrics import accuracy_score,classification_report

%matplotlib inline
warnings.filterwarnings('ignore')

## Importing the data

In [None]:
# Path of the CSV file to load, relative to the notebook's working directory.
data_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

## Checking the data

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
# Draw the boolean missing-value mask as a heatmap (True marks a null cell).
null_mask = data.isnull()
sns.heatmap(null_mask)

Now we know that we don't have any null values.

## Data Cleaning

### Dropping id and unnamed columns

In [None]:
# Drop the `Unnamed: 32` and `id` columns in a single call.
data = data.drop(columns=["Unnamed: 32", "id"])

### Converting cancer type to boolean

In [None]:
# One indicator column per diagnosis label ('B' and 'M').
diagnosis_dummies = pd.get_dummies(data['diagnosis'])
# Put the indicators in front of the features, then keep only the 'M' indicator:
# the original text column and the redundant 'B' indicator are removed.
data = pd.concat([diagnosis_dummies, data], axis=1).drop(['diagnosis', 'B'], axis=1)
data.head()

#### Note: A value of 1 in the M column represents a malignant cancer

### Correlation Coeffs

In [None]:
# Pairwise Spearman rank correlations of all columns, drawn as an annotated heatmap.
spearman_corr = data.corr(method='spearman')
plt.figure(figsize=(30, 20))
sns.heatmap(spearman_corr, annot=True)

In [None]:
# Five largest Spearman correlations with the target M, in ascending order.
# M itself is not removed, so its self-correlation appears in the list.
corr_with_target = data.corr(method='spearman')['M']
corr_with_target.sort_values().tail()

In [None]:
# Five largest Spearman correlations with radius_mean (its own entry removed).
radius_corr = data.corr(method='spearman')['radius_mean']
radius_corr.drop(index='radius_mean').sort_values().tail()

#### Note: here we see that the area, perimeter and radius are highly correlated. In essence, this could be because they are mathematical functions of each other. We will thus drop the area and perimeter columns (keeping the radius) to make the model more efficient.

In [None]:
# Perimeter and area columns (mean / worst / se variants) to remove.
redundant_columns = [
    'perimeter_mean', 'area_mean',
    'perimeter_worst', 'area_worst',
    'perimeter_se', 'area_se',
]
data = data.drop(columns=redundant_columns)

In [None]:
# Column with the largest Spearman correlation to radius_mean,
# ignoring radius_mean itself and the target M.
radius_corr = data.corr(method='spearman')['radius_mean']
radius_corr.drop(index=['radius_mean', 'M']).sort_values().tail(1)

In [None]:
# Column with the largest Spearman correlation to smoothness_mean,
# ignoring smoothness_mean itself and the target M.
smoothness_corr = data.corr(method='spearman')['smoothness_mean']
smoothness_corr.drop(index=['smoothness_mean', 'M']).sort_values().tail(1)

In [None]:
# Column with the largest Spearman correlation to texture_mean,
# ignoring texture_mean itself and the target M.
texture_corr = data.corr(method='spearman')['texture_mean']
texture_corr.drop(index=['texture_mean', 'M']).sort_values().tail(1)

In [None]:
# NOTE: disabled step. Because the drop below is commented out, the *_worst
# columns it lists remain in `data`.
#data = data.drop(['radius_worst', 'texture_worst','smoothness_worst', 'compactness_worst', 'concavity_worst','concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'],axis=1)

Now what we have is a much simpler and more effective dataset.

## EDA

In [None]:
# Pairwise scatter matrix of every column, coloured by the malignancy flag M.
sns.pairplot(data=data, hue='M', palette='tab10')

In [None]:
# Interactive 3-D scatter of three mean features, coloured by the malignancy flag M.
# Plotly builds its own figure, so no matplotlib figure is opened here: a
# plt.figure() call would only render an extra, empty canvas.
fig = px.scatter_3d(data, x='radius_mean', y='texture_mean', z='concavity_mean', color='M')
fig.show()

In [None]:
# Feature pairs (x, y) shown on a 2x2 grid, each coloured by the malignancy flag M.
scatter_pairs = [
    ('radius_mean', 'texture_mean'),
    ('radius_mean', 'concavity_mean'),
    ('fractal_dimension_mean', 'texture_mean'),
    ('fractal_dimension_mean', 'concavity_mean'),
]

# A fresh figure with explicit axes (rather than re-activating figure number 1)
# keeps this grid independent of any other figure still open in the session.
scatter_fig, scatter_axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, (x_col, y_col) in zip(scatter_axes.flat, scatter_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue='M', palette='CMRmap', data=data, ax=ax)
# Avoid overlapping axis labels between neighbouring panels.
scatter_fig.tight_layout()

In [None]:
# Features whose distribution is compared between the two values of M, one box plot each.
boxplot_features = [
    'concave points_mean',
    'symmetry_mean',
    'smoothness_mean',
    'compactness_mean',
]

# A fresh figure with explicit axes (rather than re-activating figure number 1)
# keeps this grid independent of any other figure still open in the session.
box_fig, box_axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, feature in zip(box_axes.flat, boxplot_features):
    sns.boxplot(x='M', y=feature, palette='CMRmap', data=data, ax=ax)
# Avoid overlapping axis labels between neighbouring panels.
box_fig.tight_layout()

## Data Split

In [None]:
# Target vector: the malignancy flag. Feature matrix: every other column.
target_column = 'M'
Y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=54,
)

## Model Creation

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are passed in unscaled, so the solver may need more than its default
# cap of 100 iterations. Warnings are silenced in the import cell, which would hide
# a fit that stopped early, so the cap is raised explicitly.
lr = LogisticRegression(max_iter=10000)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so the forest (and the model comparison that includes it) gives the same
# result on every run; 54 matches the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=54)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-NN configured to use a single neighbour (n_neighbors=1).
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
from sklearn.model_selection import cross_val_score

# Choose k by cross-validation on the TRAINING data only. Scoring each k on the
# test set would leak test information into the model that is later evaluated on
# that same test set, making its reported accuracy optimistic.
CV_FOLDS = 5

# k cannot exceed the number of samples a model is fitted on. Each CV training split
# holds roughly (CV_FOLDS - 1) / CV_FOLDS of the rows; the "- 2" is a small safety
# margin because the folds are not guaranteed to be perfectly equal in size.
max_k = min(199, len(X_train) * (CV_FOLDS - 1) // CV_FOLDS - 2)

cv_accuracy = []
for k in range(1, max_k + 1):
    fold_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        Y_train,
        cv=CV_FOLDS,
        scoring='accuracy',
    )
    cv_accuracy.append((fold_scores.mean(), k))

# max() returns the first maximal entry, so ties go to the smallest k.
best_cv_score, best_k = max(cv_accuracy, key=lambda pair: pair[0])
kno = KNeighborsClassifier(n_neighbors=best_k)

In [None]:
from sklearn.svm import SVC
# Support-vector classifier created with no arguments, i.e. library defaults.
svc = SVC()

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for an RBF-kernel SVC: every C is combined with every gamma.
c_values = [0.1, 1, 10, 100, 1000, 2000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
param_grid = {'C': c_values, 'gamma': gamma_values, 'kernel': ['rbf']}

# refit=True refits the best combination on all the data passed to fit();
# verbose=3 prints progress for each candidate while the search runs.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Fit every candidate on the training split and record [model, test accuracy].
models = [lr, rfc, knn, kno, svc, grid]
accuracy = []
for model in models:
    model.fit(X_train, Y_train)
    test_accuracy = accuracy_score(Y_test, model.predict(X_test))
    accuracy.append([model, test_accuracy])

In [None]:
# Pick the [model, accuracy] entry with the highest accuracy; max() returns the
# first maximal entry, so earlier models win ties.
temp = max(accuracy, key=lambda entry: entry[1])
print(temp)

### The best method is thus logistic regression, as it gives us an accuracy of 97%

In [None]:
# Predict once with the selected model and reuse the result for both reports.
best_model = temp[0]
best_predictions = best_model.predict(X_test)
print(classification_report(Y_test, best_predictions))
best_accuracy_pct = round(accuracy_score(Y_test, best_predictions) * 100, 2)
print("Accuracy of this model: ", best_accuracy_pct, '%')