In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling as pp
import seaborn as sns

#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/breast-cancer-wisconsin-data/data.csv")
df.head(n=5)

In [None]:
# Print the (rows, columns) shape, then display the distinct column labels.
print((len(df.index), len(df.columns)))
df.columns.unique()

In [None]:
# 1. The "id" and "Unnamed: 32" columns do not help in classification.
# 2. The "diagnosis" column is our class label.

y = df['diagnosis']            # class labels

# Named `drop_cols` rather than `list`: assigning to `list` would shadow the
# built-in type for every later cell run in this kernel.
drop_cols = ["id", "diagnosis", "Unnamed: 32"]
x = df.drop(columns=drop_cols)       # feature columns only
x.head()

In [None]:
# Bar chart of how many samples fall in each diagnosis class.
# The series is passed as `x=` explicitly: newer seaborn releases interpret the
# first positional argument as `data`, not as the x variable.
ax = sns.countplot(x=y)
ax.set(xlabel="diagnosis", ylabel="Count")

# Look the counts up by label instead of unpacking value_counts() positionally,
# which silently depends on which class happens to be more frequent.
# Assumes the labels are coded "B" (benign) and "M" (malignant) - a KeyError
# here means the coding is different.
class_counts = y.value_counts()
B = class_counts["B"]
M = class_counts["M"]
print("Benign=",B)
print("Malignant=",M)

In [None]:
x.info()
# The non-null counts reported by info() let us confirm that there are no null values in the features.

In [None]:
# Summary statistics of every feature, rounded to 2 decimals and transposed so each feature is one row.
x.describe().round(2).T

In [None]:
# The feature means differ by a very large range, so the data is standardised
# before model selection and before drawing plots such as the violin and swarm plots.

# Standardisation: subtract each column's mean, then divide by its standard deviation.
data_dia = y
data = x
centered = data - data.mean()
data_norm = centered / data.std()
data_norm.describe().round(4).T

In [None]:
#Plotting violinplot
def violin_plot(data):
    """Draw a split violin plot of feature values grouped by diagnosis.

    `data` is handed to seaborn together with the column names "features",
    "value" and "diagnosis". Features are placed on the y axis and values on
    the x axis; each violin is split by diagnosis and shows quartile lines.
    The figure is 10x10 inches and is shown immediately.
    """
    plt.figure(figsize=(10,10))
    violin_options = dict(x="value", y="features", hue="diagnosis", split=True, inner="quart")
    sns.violinplot(data=data, **violin_options)
    plt.xticks(rotation=90)
    plt.show()
    
#Plotting swarmplot
def swarm_plot(data):
    """Draw a swarm plot of feature values coloured by diagnosis.

    `data` is handed to seaborn together with the column names "features"
    (x axis), "value" (y axis) and "diagnosis" (hue). The x tick labels are
    rotated by 90 degrees. The figure is 10x10 inches and is shown immediately.
    """
    plt.figure(figsize=(10,10))
    swarm_options = dict(x="features", y="value", hue="diagnosis")
    sns.swarmplot(data=data, **swarm_options)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Plot the first 10 standardised features (column positions 0-9) for analysis.
# The labels are attached to the feature columns and the result is reshaped to
# long form (diagnosis / features / value), as the plotting helpers expect.
data = pd.melt(
    pd.concat([y, data_norm.iloc[:, :10]], axis=1),
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)

violin_plot(data)

swarm_plot(data)
# From the swarm plot we learn that most of these features are not good for classification.
# For example texture_mean, smoothness_mean, compactness_mean and fractal_dimension_mean
# do not separate the classes well.

In [None]:
# Plot the next 10 standardised features (column positions 10-19) for analysis.
# The labels are attached to the feature columns and the result is reshaped to
# long form (diagnosis / features / value), as the plotting helpers expect.
data = pd.melt(
    pd.concat([y, data_norm.iloc[:, 10:20]], axis=1),
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)

violin_plot(data)

swarm_plot(data)
# From the swarm plot we learn that features such as texture_se, fractal_dimension_se,
# symmetry_se and smoothness_se are not good for classification.

In [None]:
# Plot the last 10 standardised features (column positions 20-29) for analysis.
# The slice is 20:30 so it is exactly 10 columns wide, matching the two
# previous cells (the earlier 20:31 asked for 11 columns).
# Assumes data_norm has 30 feature columns - TODO confirm against x.shape.
data = pd.concat([y, data_norm.iloc[:, 20:30]], axis=1)
# Reshape to long form (diagnosis / features / value) for the plotting helpers.
data = pd.melt(data, id_vars="diagnosis", var_name="features", value_name="value")

violin_plot(data)

swarm_plot(data)

# From the swarm plot we learn that features such as texture_worst, fractal_dimension_worst,
# symmetry_worst and smoothness_worst are not good for classification.

In [None]:
# Heatmap of the pairwise correlations between all standardised feature columns.
# It is used to identify which features are worth keeping for prediction.

corr_all = data_norm.corr()
plt.figure(figsize=(20,12))
sns.heatmap(corr_all, annot=True)
plt.show()

**Feature Selection:**

The feature selection below is based on the knowledge gained from the correlation heatmap.
There are two lists: a keeping list and a dropping list.

**keepinglist** :- List of features that are being kept and used for model prediction.

**droplist** :- List of features that do not seem helpful for better prediction.

In [None]:
# Features the author intends to keep (for reference only, not used by the code):
#keepinglist=[fractal_dimension_mean,texture_mean,radius_mean,symmetry_mean,concave points_mean,
#            texture_se,symmetry_se,smoothness_se,fractal_dimension_se,area_se,concave points_se,
#            area_worst,smoothness_worst,concavity_worst,symmetry_worst]
# NOTE(review): smoothness_mean is not in droplist either, so it is kept as well
# even though the list above does not mention it - confirm the intent.

# Features removed before modelling.
droplist = [
    # *_mean
    "perimeter_mean", "area_mean", "compactness_mean", "concavity_mean",
    # *_se
    "perimeter_se", "radius_se", "compactness_se", "concavity_se",
    # *_worst
    "radius_worst", "perimeter_worst", "texture_worst",
    "concave points_worst", "compactness_worst", "fractal_dimension_worst",
]

x_1 = x.drop(columns=droplist)
x_1.head()

In [None]:
# Correlation heatmap restricted to the features that remain in x_1.
retained_corr = x_1.corr()
plt.figure(figsize=(20,12))
sns.heatmap(retained_corr, annot=True, linewidths=.5)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

# Split the data: 70 % train, 30 % test. The fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)

# Random forest with the library's default hyper-parameters; only the seed is fixed.
clf_rf = RandomForestClassifier(random_state=43)
# fit() returns the fitted estimator itself, so clr_rf is only an alias of clf_rf
# (the name is kept in case other cells refer to it).
clr_rf = clf_rf.fit(x_train,y_train)

y_pred = clf_rf.predict(x_test)

# Pin the class order so the matrix layout is explicit: rows are true labels,
# columns are predicted labels, index 0 = "B" and index 1 = "M".
# Assumes the labels are coded "B" (benign) / "M" (malignant); malignant is
# treated as the positive class.
class_labels = ["B", "M"]
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
ax = sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel="Predicted label", ylabel="True label")

# For a binary matrix laid out as [negative, positive], ravel() yields
# TN, FP, FN, TP. Previously cm[0][0] (benign predicted as benign) was printed
# as "True Positives" and cm[1][1] as "True Negatives".
tn, fp, fn, tp = cm.ravel()
print('True Positives :', tp)
print('False Positives :', fp)
print('False Negatives :', fn)
print('True Negatives :', tn, '\n')

In [None]:
# Fraction of test rows whose predicted label equals the true label.
test_predictions = clf_rf.predict(x_test)
ac = accuracy_score(y_test, test_predictions)
print('Accuracy is: ',ac)