In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
#import necessary libraries.
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly

In [None]:
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv', index_col='id')

Let us know some information about this data in order to understand it.

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Drop the 'Unnamed: 32' column, which this analysis treats as unimportant.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError for the missing column.
df = df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# knowing more information about numerical data.
df.describe().T

The dataset gives information about tumor features, that are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. For each observation there are 10 features, which describe tumor size, density, texture, symmetry, and other characteristics of the cell nuclei 

The mean, standard error and "worst" mean (mean of the three largest values) of these features were computed for each image, resulting in 30 features. The categorical target feature indicates the type of the tumor.
For more information, see https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

In [None]:
# Look at the class balance of the diagnosis column; normalize=True returns
# fractions of the total instead of raw counts.
# (The split of the features into three groups happens in a later cell.)
df['diagnosis'].value_counts(normalize=True)
# Author's reading of the output: about 62% of the cases are benign and about 37% are malignant.

In [None]:
# Convert the diagnosis column to the categorical dtype.
# Series.astype has no `inplace` option (passing it raises TypeError on current
# pandas); it returns a new Series, which is assigned back to the column.
df['diagnosis'] = df['diagnosis'].astype('category')

In [None]:
df.info()

In [None]:
# Split the measurement columns into three frames -- mean, standard error and
# "worst" -- each one keeping the diagnosis label as its first column.
_BASE_FEATURES = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]


def _feature_group(suffix):
    """Return 'diagnosis' plus every '<feature>_<suffix>' column of df, in feature order."""
    wanted = ['diagnosis'] + [feature + '_' + suffix for feature in _BASE_FEATURES]
    return df[wanted]


tumor_mean = _feature_group('mean')
tumor_ste = _feature_group('se')  # standard error
tumor_worst = _feature_group('worst')

The data after splitting:

In [None]:
tumor_worst.head()

In [None]:
tumor_ste.head()

In [None]:
tumor_mean.head()

# Preprocessing 

### working in tumor_mean first 


In [None]:
# Shorten the column names: keep only the text before the first underscore,
# e.g. 'radius_mean' -> 'radius' and 'fractal_dimension_mean' -> 'fractal'.
tumor_mean.columns = [label.split('_')[0] for label in tumor_mean.columns]

In [None]:
tumor_mean.head()

In [None]:
# Per-diagnosis sum of every feature, shown with one colour gradient per column.
all_t = tumor_mean.groupby('diagnosis').sum()

# Column -> colormap. Driving the styling from a mapping replaces ten chained,
# backslash-continued calls; the original chain ended in a dangling "\" at the
# end of the cell, which newer Python versions reject as a syntax error.
gradient_cmaps = {
    'radius': 'Blues',
    'texture': 'Reds',
    'perimeter': 'Greens',
    'area': 'Purples',
    'smoothness': 'Pastel1_r',
    'compactness': 'YlOrBr',
    'concavity': 'Pastel1_r',
    'concave points': 'Blues',
    'symmetry': 'Reds',
    'fractal': 'Greens',
}

styled_sums = all_t.style
for column, cmap in gradient_cmaps.items():
    styled_sums = styled_sums.background_gradient(cmap=cmap, subset=[column])

# Last expression of the cell, so the styled table is rendered as its output.
styled_sums

Looking at the table, the sums of radius, texture, perimeter, smoothness, symmetry and fractal
are greater for the benign class, but this is not an accurate comparison because the number of benign cases is greater.
For area, compactness and concave points, however, the malignant sums are higher
despite the smaller number of malignant cases, which indicates that these features have a significant impact on the type of disease.

In [None]:
tumor_mean['diagnosis'].value_counts()

I will separate the data that contain the type of malignant disease and the data that contain the type of benign disease

In [None]:
B_index=tumor_mean[tumor_mean['diagnosis']=='B'].index

In [None]:
B_tu=tumor_mean[tumor_mean['diagnosis']=='B']

In [None]:
B_tu=pd.DataFrame(B_tu)

In [None]:
B_tu.head()

In [None]:
M_ = tumor_mean.drop(index=B_index)

In [None]:
M_.head()

Now I will take an equal number of malignant and benign cases so that the per-class sums can be compared fairly.

In [None]:
# Balance the classes: take as many benign rows (from the top of B_tu) as there
# are malignant rows, instead of hard-coding the malignant count.
M_B = pd.concat([B_tu.iloc[:len(M_)], M_])

In [None]:
M_B.shape

In [None]:
# Per-diagnosis sum of every feature on the class-balanced frame, shown with one
# colour gradient per column.
all_t = M_B.groupby('diagnosis').sum()

# Column -> colormap. Driving the styling from a mapping replaces ten chained,
# backslash-continued calls; the original chain ended in a dangling "\" at the
# end of the cell, which newer Python versions reject as a syntax error.
gradient_cmaps = {
    'radius': 'Blues',
    'texture': 'Reds',
    'perimeter': 'Greens',
    'area': 'Purples',
    'smoothness': 'Pastel1_r',
    'compactness': 'YlOrBr',
    'concavity': 'Pastel1_r',
    'concave points': 'Blues',
    'symmetry': 'Reds',
    'fractal': 'Greens',
}

styled_sums = all_t.style
for column, cmap in gradient_cmaps.items():
    styled_sums = styled_sums.background_gradient(cmap=cmap, subset=[column])

# Last expression of the cell, so the styled table is rendered as its output.
styled_sums

Now that the classes are balanced, we can see that higher values of these features are associated with malignant tumors. There is a big difference between the two classes in
[radius, area, compactness, concavity, concave points].
The fractal values, on the other hand, are close to each other in both classes.

# Visualization 

In [None]:
tumor_mean.head()

In [None]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

In [None]:
# For Notebooks
init_notebook_mode(connected=True)

In [None]:
# For offline use
cf.go_offline()

In [None]:
sns.countplot(x='diagnosis',data=tumor_mean)

In [None]:
tumor_mean['radius'].iplot(kind='hist')
#most radius btween 10 to 15

In [None]:
# Interactive scatter of texture against radius, one marker colour per diagnosis.
tumor_mean.iplot(
    kind='scatter',
    x='radius',
    y='texture',
    mode='markers',
    categories='diagnosis',
    size=10,
    xTitle='radius',
    yTitle='texture',
    title='The relationship of the texture to the type of tumor and radius',
)


We can see that the texture does not separate the two tumor types, but when the radius of the tumor is greater than 20, the tumor is malignant.

In [None]:
# Interactive scatter of concavity against radius, one marker colour per diagnosis.
tumor_mean.iplot(
    kind='scatter',
    x='radius',
    y='concavity',
    mode='markers',
    categories='diagnosis',
    size=10,
    xTitle='radius',
    yTitle='concavity',
    title='The relationship of the concavity to the type of tumor and radius',
)


We can see that the concavity of benign tumors stays within a narrow range, from 0 to 0.15.
For malignant tumors, in contrast, it is scattered and irregular.

In [None]:
# Interactive scatter of compactness against radius, one marker colour per diagnosis.
tumor_mean.iplot(
    kind='scatter',
    x='radius',
    y='compactness',
    mode='markers',
    categories='diagnosis',
    size=10,
    xTitle='radius',
    yTitle='compactness',
    title='The relationship of the compactness to the type of tumor and radius',
)


Most benign tumors have a compactness between 0 and 1.7, in contrast to the malignant tumors, whose values are spread out irregularly.

In [None]:
tumor_mean.head()

In [None]:
tumor_mean[['radius','texture']].iplot(kind='box')

In [None]:
tumor_mean[['area']].iplot(kind='box')

In [None]:
# Box plots of six features on a single 2x3 grid.
# The original called plt.figure() before every plt.subplot(), so each box plot
# ended up alone on its own new figure and the intended grid was never formed.
# Creating the figure and its axes once and passing ax= explicitly fixes that.
box_features = [
    'smoothness', 'concavity', 'compactness',
    'concave points', 'symmetry', 'fractal',
]
fig, axes = plt.subplots(2, 3, figsize=(30, 10))
for ax, feature in zip(axes.flat, box_features):
    sns.boxplot(x=feature, data=tumor_mean, ax=ax)
fig.tight_layout()


From these plots we can see where the data is centred, and we discover that there are outliers.

In [None]:
# One wide box plot per size-related feature, each drawn on its own figure.
for size_feature in ('radius', 'texture', 'perimeter', 'area'):
    plt.figure(figsize=(30, 10))
    sns.boxplot(x=size_feature, data=tumor_mean)


# working with outliers

In [None]:
from pandas.api.types import is_numeric_dtype
def remove_outlier(df, low=.05, high=.95):
    """Keep only the rows that lie strictly inside the quantile band of every numeric column.

    The ``low`` and ``high`` quantiles of each numeric column are computed once
    on the incoming frame; a row is kept only if, for every numeric column, its
    value is strictly greater than the low quantile and strictly smaller than
    the high quantile. Non-numeric columns (for example a label column) are
    carried through untouched and never take part in the filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    low, high : float, optional
        Lower and upper quantiles, with ``0 <= low < high <= 1``.
        Defaults (0.05 and 0.95) match the original hard-coded values.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with all original columns. Rows holding NaN in a
        numeric column are dropped because the comparisons evaluate to False.

    Raises
    ------
    ValueError
        If ``low`` and ``high`` do not satisfy ``0 <= low < high <= 1``.
    """
    if not 0 <= low < high <= 1:
        raise ValueError("low and high must satisfy 0 <= low < high <= 1")

    numeric_cols = [name for name in df.columns if is_numeric_dtype(df[name])]
    if not numeric_cols:
        # Nothing to measure, so nothing can be an outlier.
        return df

    # Quantiles are taken on the numeric columns only: calling quantile() on a
    # frame that still contains text/categorical columns raises on pandas 2.x.
    numeric = df[numeric_cols]
    bounds = numeric.quantile([low, high])
    lower, upper = bounds.iloc[0], bounds.iloc[1]

    # Same result as filtering column after column with the precomputed
    # thresholds: a row survives only if it is inside the band everywhere.
    inside = (
        numeric.gt(lower, axis='columns') & numeric.lt(upper, axis='columns')
    ).all(axis='columns')
    return df[inside.to_numpy()]

# Filter tumor_mean with remove_outlier (defined above in this cell) and keep
# the result under the name `data`; tumor_mean itself is not reassigned.
data=remove_outlier(tumor_mean)

In [None]:
data.shape

# Now let us see the distribution after deleting the outliers

In [None]:
# Violin plots of each feature split by diagnosis, laid out on a 2x5 grid
# (nine of the ten slots are used).
violin_features = (
    'area', 'radius', 'perimeter', 'smoothness', 'compactness',
    'concavity', 'concave points', 'symmetry', 'fractal',
)
plt.figure(figsize=(15, 15))
for slot, feature in enumerate(violin_features, start=1):
    plt.subplot(2, 5, slot)
    sns.violinplot(x=feature, y='diagnosis', data=data)


Now we can answer some questions, such as how to determine the type of disease:
1. The area of the benign tumor ranges between 0 and 700 in most cases, while the malignant tumor exceeds that.
2. The radius of the benign tumor is between 10 and 15, unlike the malignant tumor, which exceeds that.
3. The perimeter (circumference) of the benign tumor is between 50 and 100; larger values indicate a malignant tumor.
4. The concavity of the benign tumor ranges from 0 to 0.1.
5. The concave points of the benign tumor lie around 0.05.

In [None]:
# Clustered heat map of the pairwise correlations between the numeric features.
# - The label column is excluded explicitly: corr() on a frame that still holds
#   the non-numeric diagnosis column raises on pandas 2.x.
# - clustermap builds its own figure, so the size is passed to it directly
#   instead of opening a separate (and then unused) plt.figure().
numeric_corr = data.select_dtypes(include='number').corr()
sns.clustermap(numeric_corr, linewidths=2, annot=True, figsize=(10, 10))

Now we can observe that:
- radius, perimeter and area are highly correlated, as expected from their geometric relation, so we will use only one of them;
- compactness, concavity and concave points are highly correlated, so we will use compactness from this group.

So the selected parameters are perimeter, texture, compactness and symmetry.

In [None]:
data=data[["diagnosis",'perimeter','texture','compactness', 'symmetry']]

In [None]:
data.iloc[:,1:].melt().groupby('variable').sum()

In [None]:
data.head()

In [None]:
# Encode the label as integers: 'M' -> 1 and 'B' -> 0.
# - assign() returns a new frame, so nothing is written into a frame that was
#   itself derived by slicing (avoids SettingWithCopyWarning).
# - An explicit map on plain objects replaces Series.replace on a categorical
#   column, whose behaviour is deprecated in recent pandas.
# - Already-encoded values map to themselves, so re-running the cell is safe;
#   any unexpected label becomes NaN and makes astype(int) fail loudly.
label_codes = {'M': 1, 'B': 0, 1: 1, 0: 0}
data = data.assign(
    diagnosis=data['diagnosis'].astype(object).map(label_codes).astype(int)
)

In [None]:
data.head()

Now I will apply feature scaling so that the spread of the features can be compared on a common scale.

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Standardise the feature columns only. Fitting the scaler on the whole frame
# also rescaled the 0/1 diagnosis label into meaningless z-scores.
scaled_features = sc_X.fit_transform(data.drop(columns='diagnosis'))
# Put the untouched label back as the first column so data_f keeps the same
# layout as `data` (diagnosis first, then the features in their original order).
# Requires the label to be numeric already (it is encoded as 0/1 in the cell above).
data_f = np.column_stack(
    [data['diagnosis'].to_numpy(dtype=float), scaled_features]
)

In [None]:
data_f= pd.DataFrame(data_f)

In [None]:
data_f.head()

In [None]:
# Restore the column names lost when the scaler returned a bare array.
# A flat list is required here: the original doubly nested list made pandas
# build a one-level MultiIndex instead of plain string labels.
data_f.columns = ["diagnosis", 'perimeter', 'texture', 'compactness', 'symmetry']

In [None]:
data_f.head()

In [None]:
data_f[['texture','perimeter']].iplot(kind='spread',title='spread of texture and perimeter')

In [None]:
data_f[['compactness','symmetry']].iplot(kind='spread',title='spread of compactness and symmetry')

In [None]:
data_f.head()