**Importing Libraries**

---



---



In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
%matplotlib inline

**Loading Dataset**

---



---



In [None]:
# Relative location of the CSV file that holds the dataset.
data_path = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first ten rows of the loaded frame.
df.head(n=10)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

**Data Preprocessing**

---



---



The "Unnamed: 32" column has no effect on our dataset, so let's drop it.

In [None]:
# Remove the 'Unnamed: 32' column.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

Check whether there are any null values.

In [None]:
# Number of missing entries in each column.
df.isna().sum()

Additionally, we can check for null values visually, using two methods.

*Method 1*

In [None]:
# Draw the boolean missing-value mask as a heatmap, without a colour bar.
sb.heatmap(data=df.isna(), cbar=False)

*Method 2*

In [None]:
# missingno's matrix view of the frame, used here as a second visual null check.
msno.matrix(df)

`Feature Selection`

*Feature selection is the process of automatically or manually selecting the features that contribute most to the prediction variable or output you are interested in. Irrelevant features in your data can decrease the accuracy of a model, because the model may end up learning from them.*

In [None]:
# Feature matrix: every column from position 2 onward.
# The iloc column slice is already a DataFrame, so no extra constructor call is needed.
x = df.iloc[:, 2:]
x

In [None]:
# Target: the column at position 1, kept as a 1-D Series.
# Wrapping it in a DataFrame produced an (n, 1) column vector; scikit-learn
# estimators flatten that shape only after emitting a DataConversionWarning,
# which the notebook-wide warning filter was hiding.
y = df.iloc[:, 1]

Now that the feature variables are separated from the target, let's apply feature selection.

In [None]:
# Univariate selector scored with the ANOVA F-test (f_classif).
# k is left at its default; only the per-feature scores are read from it later.
fit_feat = SelectKBest(f_classif)

In [None]:
# Score every feature column in x against the target y.
# NOTE(review): x and y still contain all rows here; the train/test split happens
# later in the notebook, so test rows influence these scores. Consider scoring
# on the training portion only.
fit_feat.fit(X=x, y=y)

In [None]:
# One-column frame holding the F-score of each feature, in feature order.
score_col = pd.DataFrame({'Score Values': fit_feat.scores_})

In [None]:
# Display the per-feature scores.
score_col

In [None]:
# One-column frame holding the feature names, in the same order as the scores.
name_col = pd.DataFrame(x.columns.tolist())

In [None]:
# Pair each feature name with its F-score.
feat = pd.concat([name_col, score_col], axis=1)
# Give the name column a readable header (it was the bare integer 0) ...
feat.columns = ['Feature', 'Score Values']
# ... and list the strongest features first, so the weakest ones can be read
# straight off the bottom of the table instead of being hunted for by eye.
feat = feat.sort_values('Score Values', ascending=False)

In [None]:
# Display the feature-name / score table.
feat

1.   fractal_dimension_mean
2.   texture_mean
3.   smoothness_se 
4.   symmetry_se
5.   fractal_dimension_se

Here we found some features that have no significant effect on our data, so let's drop them all.

In [None]:
# Features judged to carry little signal according to the score table.
# NOTE(review): confirm each name against the table before relying on this list;
# in particular verify whether 'texture_mean' or 'texture_se' was the low scorer.
low_score_features = [
    'fractal_dimension_mean',
    'texture_mean',
    'smoothness_se',
    'symmetry_se',
    'fractal_dimension_se',
]

# Rebuild x from df instead of from itself: `x = x.drop(...)` raised KeyError when
# the cell was executed a second time, because the columns were already gone.
# A misspelled column name still fails loudly.
x = df.iloc[:, 2:].drop(columns=low_score_features)

In [None]:
# Display the feature matrix after the low-scoring columns were removed.
x

`Label Encoding`

In [None]:
# Encoder that maps the target classes to integer codes.
label = LabelEncoder()

In [None]:
# Learn the distinct target classes, then replace each value with its integer code.
# y becomes a 1-D NumPy array of codes.
y = label.fit(y).transform(y)

**Exploratory Data Analysis**

---



---



In [None]:
# Class balance: one bar per diagnosis value.
sb.countplot(
    x="diagnosis",
    data=df,
    palette=["#A439A1", "#43A5D1"],
    edgecolor="black",
    lw=3,
)

In [None]:
# Distribution of radius_mean on a log-scaled axis, with the diagnosis groups stacked.
sb.histplot(
    data=df,
    x="radius_mean",
    hue="diagnosis",
    multiple="stack",
    bins=40,
    log_scale=True,
    palette="crest_r",
    edgecolor="black",
    lw=1.5,
)

In [None]:
# 2x2 grid of scatter plots, each coloured by diagnosis.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8, 14))

# (row, column, x column, y column) for every panel, in drawing order.
panel_specs = [
    (0, 0, 'perimeter_mean', 'radius_worst'),
    (1, 0, 'area_mean', 'radius_worst'),
    (0, 1, 'texture_mean', 'texture_worst'),
    (1, 1, 'area_worst', 'radius_worst'),
]
for grid_row, grid_col, x_col, y_col in panel_specs:
    sb.scatterplot(data=df, x=x_col, y=y_col, hue='diagnosis', ax=ax[grid_row][grid_col])

plt.show()

In [None]:
# Apply the whitegrid style BEFORE drawing. seaborn styles only affect figures
# created afterwards, so setting it after the plot left this figure in whatever
# style happened to be active, and its look depended on execution order.
sb.set(style='whitegrid')

# texture_mean against area_mean; diagnosis drives both marker colour and size.
sb.scatterplot(
    data=df,
    x="texture_mean",
    y="area_mean",
    hue='diagnosis',
    size='diagnosis',
    palette='hot',
)
plt.show()

In [None]:
# Apply the whitegrid style BEFORE drawing. seaborn styles only affect figures
# created afterwards, so calling this after the plot had no effect on the
# figure in this cell.
sb.set(style='whitegrid')

# compactness_mean against area_mean; diagnosis drives both marker colour and size.
sb.scatterplot(
    data=df,
    x="compactness_mean",
    y="area_mean",
    hue='diagnosis',
    size='diagnosis',
    palette='dark',
)
plt.show()

In [None]:
# Line plot of area_worst against area_mean; diagnosis sets both line colour and width.
sb.relplot(
    x="area_mean",
    y="area_worst",
    data=df,
    kind="line",
    size="diagnosis",
    hue="diagnosis",
    facet_kws={"sharex": False},
)

In [None]:
# Split violin plot of concavity_worst grouped by concavity_mean and diagnosis.
# NOTE(review): concavity_mean looks like a continuous measurement, so every
# distinct value may become its own category on the x axis; confirm this is the
# intended grouping variable.
sb.catplot(
    x="concavity_mean",
    y="concavity_worst",
    hue="diagnosis",
    data=df,
    kind="violin",
    split=True,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 15))

# Correlate numeric columns only. df still contains the string-valued
# 'diagnosis' column, and DataFrame.corr() on pandas >= 2.0 raises ValueError
# for non-numeric data instead of silently skipping it as older versions did.
numeric_corr = df.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations.
sb.heatmap(numeric_corr, ax=ax, annot=True, linewidth=.5, cmap='icefire')

**Train Test Split**

---



---



In [None]:
# Hold out 14% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=.86,
    random_state=252,
)

Standard Scaling

In [None]:
# Scaler that standardises each feature to zero mean and unit variance.
sc = StandardScaler()

In [None]:
# Learn per-feature mean and standard deviation from the training rows,
# then standardise those rows with the learned statistics.
xtrain = sc.fit(xtrain).transform(xtrain)

In [None]:
# Standardise the test rows with the statistics learned from the TRAINING rows.
# Calling fit_transform here refitted the scaler on the test set, so train and
# test were scaled with different means and variances, and test-set statistics
# leaked into preprocessing.
xtest = sc.transform(xtest)

**Random Forest Model**

---



---



In [None]:
# Random forest with 200 trees.
# random_state is fixed (same seed as the train/test split) so that Restart &
# Run All reproduces the same model and metrics; without it every run differs.
rfc = RandomForestClassifier(n_estimators=200, random_state=252)

In [None]:
# Train the forest on the scaled training features and their encoded labels.
rfc.fit(X=xtrain, y=ytrain)

In [None]:
# Predicted class codes for the held-out rows.
ypred = rfc.predict(X=xtest)

In [None]:
# Fraction of held-out rows classified correctly.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to be
# symmetric, but the documented order keeps this call consistent with the
# order-sensitive metrics.
accuracy_score(ytest, ypred)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are true classes and columns are
# predicted classes. The arguments were swapped, which transposed the matrix and
# exchanged false positives with false negatives.
cm = confusion_matrix(ytest, ypred)

# fmt='d' prints the counts as plain integers; axis labels make the layout explicit.
cm_ax = sb.heatmap(cm, annot=True, fmt='d', cmap='binary')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
plt.show()

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall were reported in each other's place and the support
# column counted predictions instead of true labels.
print(classification_report(ytest, ypred))

In [None]:
# Plot the true labels against the predicted labels on the current axes.
# NOTE(review): both inputs are arrays of class codes, so the line only joins
# label combinations and is not an accuracy curve; consider whether this figure
# is still needed.
accuracy_ax = plt.gca()
accuracy_ax.plot(ypred, ytest, color='r')
accuracy_ax.set_xlabel("Predicted Value")
accuracy_ax.set_ylabel("Tested Value")
accuracy_ax.set_title("Accuracy Line")