**IMPORTING LIBRARIES**

In [None]:
# Core numeric / tabular / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the raw CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (rows, columns) of the raw data.
data.shape

**Thus we have 569 rows and 33 columns.**

**Let us check for null values:**

In [None]:
# Number of missing values in each column.
data.isna().sum()

**Looking at the null counts, we see a column named 'Unnamed: 32'. We do not need this
unnamed column, so we can drop it.**

In [None]:
# Remove the 'Unnamed: 32' column flagged by the null check.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution because
# the column was already gone.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Re-check (rows, columns) after dropping the unnamed column.
data.shape

In [None]:
# Count how many rows fall into each diagnosis class.
data['diagnosis'].value_counts()

**Here 357 are Benign and they do not have cancer.**

**212 are Malignant and they have cancer.**

In [None]:
# Bar chart of the number of samples per diagnosis class.
# The column is passed by keyword: in newer seaborn releases the first
# positional argument is `data`, so a Series passed positionally is no longer
# interpreted as the x variable.
sns.countplot(x='diagnosis', data=data)

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal.
# NOTE(review): the `id` column has not been dropped yet at this point, so it
# is still part of `data` here - confirm that is intended.
sns.pairplot(data=data, diag_kind='kde')

**Let us look at the data types to see which columns need to be encoded.**

In [None]:
# Data type of every column; non-numeric columns need encoding before modelling.
data.dtypes

**We see one object-typed column here: diagnosis. So we need to encode it.**

**We really don't need the id column, since it is not relevant to know the id of the patient in our case.**

**Encoding the categorical values:**

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps the diagnosis labels to integers.
le = LabelEncoder()

In [None]:
# Preview the encoded labels; the result is only displayed, not stored.
le.fit_transform(data['diagnosis'])

In [None]:
# Overwrite the diagnosis column with its integer-encoded version.
encoded_diagnosis = le.fit_transform(data['diagnosis'])
data['diagnosis'] = encoded_diagnosis

In [None]:
# Confirm the diagnosis column now holds the encoded values.
data.head()

In [None]:
# Pairwise plots again, this time coloured by the encoded diagnosis.
sns.pairplot(data=data, hue='diagnosis')

**We really don't need the id column. It is not significant here, so we can drop it from the dataset.**

In [None]:
# Remove the patient identifier; it is not used as a feature.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError once `id` was already removed.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
# Confirm the id column is gone.
data.head()

In [None]:
# One box plot per column, laid out on a 6x6 grid of subplots.
data.plot.box(
    subplots=True,
    layout=(6, 6),
    figsize=(20, 20),
)
plt.show()

In [None]:
# Side-by-side distributions of the encoded diagnosis and of radius_mean.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement and draws the same density-scaled histogram plus KDE curve.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
sns.histplot(data['diagnosis'], kde=True, stat='density', ax=ax[0], color='lime')
sns.histplot(data['radius_mean'], kde=True, stat='density', ax=ax[1], color='orange')

**Let us look at correlation between the features.**

In [None]:
# Pairwise correlation matrix of all columns (pandas' default method, Pearson).
corr = data.corr(method='pearson')
corr

In [None]:
# Large annotated heatmap of the correlation matrix, values shown as whole percentages.
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(
    data=corr,
    annot=True,
    fmt='.0%',
    cmap='Greens',
    ax=ax,
)

In [None]:
# Correlation of every column with the target, sorted from lowest to highest.
data.corr().loc[:, 'diagnosis'].sort_values(ascending=True)

**Model Building**

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns=['diagnosis'])

In [None]:
# Target vector: the encoded diagnosis column.
y = data.loc[:, 'diagnosis']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

**Feature Scaling:**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the features.
ss = StandardScaler()

In [None]:
# Fit the scaler on the training features and replace them with their scaled values.
X_train = ss.fit_transform(X_train)

In [None]:
# Scale the test features with the statistics learned from the training set.
# Only `transform` is used here: re-fitting on the test set (as the original
# `fit_transform` did) leaks test information and puts train and test on
# different scales, which distorts the evaluation.
X_test = ss.transform(X_test)

**Model 1:**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Model 1: logistic regression with default hyper-parameters.
lr = LogisticRegression()

In [None]:
# Train the logistic regression on the scaled training data.
model1 = lr.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out test rows.
prediction1 = model1.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the logistic regression on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=prediction1)
cm

In [None]:
# Unpack the 2x2 confusion matrix and compute accuracy by hand.
# scikit-learn puts true labels on rows and predicted labels on columns, with
# labels in sorted order, so for 0/1 classes the layout is
#     [[TN, FP],
#      [FN, TP]]
# with class 1 as the positive class. The original cell had TP and TN swapped;
# the accuracy value is unaffected, but the names are now correct.
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TP = cm[1][1]
print('Testing Accuracy:', (TP + TN) / (TP + TN + FN + FP))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Same accuracy, computed with scikit-learn's helper.
accuracy_score(y_true=y_test, y_pred=prediction1)

**Model 2:**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Model 2: decision tree.
# A fixed random_state (the same seed value the train/test split uses) makes
# the fitted tree, and therefore the reported scores, reproducible across runs.
dtc = DecisionTreeClassifier(random_state=123)

In [None]:
# Train the decision tree on the scaled training data.
model2 = dtc.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes from the decision tree for the test rows.
prediction2 = model2.predict(X_test)

In [None]:
# Confusion matrix of the decision tree on the test set.
confusion_matrix(y_true=y_test, y_pred=prediction2)

In [None]:
# Test accuracy of the decision tree.
accuracy_score(y_true=y_test, y_pred=prediction2)

**Model 3:**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Model 3: random forest.
# A fixed random_state (the same seed value the train/test split uses) makes
# the bootstrap samples and feature choices, and therefore the reported
# scores, reproducible across runs.
rfc = RandomForestClassifier(random_state=123)

In [None]:
# Train the random forest on the scaled training data.
model3 = rfc.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes from the random forest for the test rows.
prediction3 = model3.predict(X_test)

In [None]:
# Confusion matrix of the random forest on the test set.
confusion_matrix(y_true=y_test, y_pred=prediction3)

In [None]:
# Test accuracy of the random forest.
accuracy_score(y_true=y_test, y_pred=prediction3)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Precision / recall / F1 per class for the logistic regression.
# classification_report returns a multi-line string, so it is printed; as a
# bare cell expression it is shown as a repr with literal "\n" sequences.
print(classification_report(y_test, prediction1))

In [None]:
# Precision / recall / F1 per class for the decision tree.
# classification_report returns a multi-line string, so it is printed; as a
# bare cell expression it is shown as a repr with literal "\n" sequences.
print(classification_report(y_test, prediction2))

In [None]:
# Precision / recall / F1 per class for the random forest.
# classification_report returns a multi-line string, so it is printed; as a
# bare cell expression it is shown as a repr with literal "\n" sequences.
print(classification_report(y_test, prediction3))