In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns',None)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences ALL warnings, including deprecation warnings from
# pandas/seaborn that point at code needing an update — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load the BRCA dataset, using the Patient_ID column as the row index.
train=pd.read_csv('/kaggle/input/breastcancerdataset/BRCA.csv',index_col='Patient_ID')

In [None]:
train.head(5)

In [None]:
# One box per numeric column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12,5))
train.boxplot(ax=ax)

In [None]:
train.describe().T

In [None]:
train.info()

In [None]:
train.shape

In [None]:
train.isnull().sum()

In [None]:
train[train['Gender'].isna()]

Since the last 7 rows contain only null values, we will remove them.

In [None]:
# Drop rows in which every column is null (the empty trailing rows shown above).
# Unlike the positional slice train[:-7], this is idempotent: re-running the
# cell does not remove a further 7 valid rows each time.
train=train.dropna(how='all')

In [None]:
train.isnull().sum()

Patient_Status should not depend on the date of the patient's last visit, so we will delete this column.

In [None]:
train[train['Date_of_Last_Visit'].isna()]

In [None]:
# Remove the last-visit date column. Non-inplace with errors='ignore' so that
# re-running the cell is a no-op instead of raising KeyError.
train=train.drop(columns='Date_of_Last_Visit',errors='ignore')

In [None]:
# Parse the surgery date strings into datetime values.
surgery_dates=pd.to_datetime(train['Date_of_Surgery'])
train['Date_of_Surgery']=surgery_dates

In [None]:
# Column names split by dtype: numeric features vs. object (string) features.
num_feat=list(train.select_dtypes(include='number').columns)
cat_feat=list(train.select_dtypes(include='object').columns)

In [None]:
print(num_feat)
print(cat_feat)

In [None]:
from scipy.stats import norm

In [None]:
# Histogram + KDE of every numeric feature with a fitted normal curve overlaid.
# - histplot replaces the deprecated distplot;
# - the normal pdf is drawn explicitly so the legend entry is attached to the
#   fitted curve (a bare label list would be attached to the first line drawn);
# - the label is a raw string so '\m' / '\s' are not treated as escape sequences;
# - tight_layout runs once, after all panels exist.
fig,axes=plt.subplots(3,2,figsize=(16,12))
panels=axes.ravel()
for ax,feat in zip(panels,num_feat):
    values=train[feat].dropna()  # norm.fit returns NaN if any value is missing
    (mu,sigma)=norm.fit(values)
    sns.histplot(x=values,stat='density',kde=True,ax=ax)
    grid=np.linspace(values.min(),values.max(),200)
    ax.plot(grid,norm.pdf(grid,mu,sigma),color='black',
            label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu,sigma))
    ax.legend(loc='best')
    ax.set_title(feat)
# Hide grid cells that have no feature to show.
for ax in panels[len(num_feat):]:
    ax.set_visible(False)
fig.tight_layout()

#### Observations

1. All numerical variables have bell-shaped distribution.
2. Age has mean around 60.
3. Protein1,Protein3,Protein4 have mean around 0.

In [None]:
train['Age'].mean()

In [None]:
# Count of each category, split by patient status, one panel per categorical feature.
plt.figure(figsize=(16,17))
for panel_no,column in enumerate(cat_feat,start=1):
    plt.subplot(4,3,panel_no)
    sns.countplot(data=train,x=column,hue='Patient_Status')
    plt.tight_layout()
    plt.title(column)

In [None]:
train.isnull().sum()

In [None]:
train[train['Patient_Status'].isna()]

In [None]:
# Age by patient status, boxes split by tumour stage.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Age',hue='Tumour_Stage',ax=ax)

In [None]:
# Age by patient status, boxes split by surgery type.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Age',hue='Surgery_type',ax=ax)

In [None]:
# Age by patient status, boxes split by histology.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Age',hue='Histology',ax=ax)

In [None]:
# Age by patient status, boxes split by gender.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Age',hue='Gender',ax=ax)

In [None]:
# Protein1 level by patient status, boxes split by surgery type.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Protein1',hue='Surgery_type',ax=ax)

In [None]:
# Age by patient status, boxes split by HER2 status.
fig,ax=plt.subplots(figsize=(8,8))
sns.boxplot(data=train,x='Patient_Status',y='Age',hue='HER2 status',ax=ax)

Since we can clearly see that for the 'Infiltrating Ductal Carcinoma' histology type the Patient_Status is mostly 'Alive', we can fill the NaN values with 'Alive' (the most frequent Patient_Status value, i.e. the mode used in the next cell).

In [None]:
# Fill missing Patient_Status values with the most frequent status.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form may act on a temporary copy and leave
# `train` unchanged (pandas copy-on-write behaviour).
status_mode=train['Patient_Status'].mode()[0]
train['Patient_Status']=train['Patient_Status'].fillna(status_mode)

# Feature Engineering

In [None]:
# Pairwise scatter/density plots of the numeric columns, coloured by patient status.
set1_colors=sns.color_palette('Set1',n_colors=8)
sns.set_palette(set1_colors)
sns.pairplot(data=train,hue='Patient_Status',corner=True)

There is no clear direct relationship between numerical columns.

### Detailed Exploration of Data.

#### Univariate Analysis

In [None]:
def dist_box(data):
    """Plot a boxplot above a histogram for one continuous variable.

    The two panels share the x axis so spread, central tendency, dispersion
    and outliers can be read together. Vertical lines on the histogram mark
    the mean (red, dashed), median (green) and mode (yellow).

    Parameters
    ----------
    data : pandas.Series
        Numeric series to plot; its ``name`` is used in the figure title.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    # Apply the theme and palette BEFORE creating the figure so the first
    # figure produced is styled the same as every later one.
    sns.set_theme(style='white')
    sns.set_palette(sns.color_palette('Set1',8))

    Name=str(data.name).upper()
    mean=data.mean()
    median=data.median()
    modes=data.mode()
    # mode() is empty for an empty / all-NaN series; skip the line in that case
    mode=modes.iloc[0] if not modes.empty else None

    fig,(ax_box,ax_dis)= plt.subplots(nrows=2,sharex=True,
                                     gridspec_kw={"height_ratios":(.25,.75)},figsize=(8,5))
    fig.suptitle('SPREAD OF DATA FOR '+ Name ,fontsize=18,fontweight='bold')

    sns.boxplot(x=data,showmeans=True,orient='h',ax=ax_box)
    ax_box.set(xlabel='')

    # histplot is the supported replacement for the deprecated distplot(kde=False)
    sns.histplot(x=data,kde=False,ax=ax_dis)

    # Label each line directly so the legend does not depend on artist order.
    ax_dis.axvline(mean,color='r',linestyle='--',linewidth=2,label='Mean')
    ax_dis.axvline(median,color='g',linestyle='-',linewidth=2,label='Median')
    if mode is not None:
        ax_dis.axvline(mode,color='y',linestyle='-',linewidth=2,label='Mode')
    ax_dis.legend()

In [None]:
# Draw the box + histogram figure for each numeric feature.
for column in num_feat:
    dist_box(train[column])

##### Observations on Class

In [None]:
# Count plots of every categorical feature split by patient status, with each
# bar annotated by its share of all rows.
title=cat_feat
sns.set_theme(style='white')
sns.set_palette('Set1')
plt.figure(figsize=(13,16))
n_rows=len(train)  # denominator for the percentages, constant across panels

for i,feat in enumerate(cat_feat):
    ax=plt.subplot(4,2,i+1)
    sns.countplot(x=feat,data=train,hue='Patient_Status',ax=ax)
    sns.despine(top=True,right=True,left=True)
    for p in ax.patches:
        height=p.get_height()
        # Skip empty / placeholder patches (NaN or zero height); they would
        # otherwise be annotated as 'nan%' or '0.0%'.
        if not height>0:
            continue
        percentage='{:.1f}%'.format(100*height/n_rows)
        x=p.get_x()+p.get_width()/2-0.05
        y=p.get_y()+height
        ax.annotate(percentage,(x,y),ha='center')
    ax.set_title(title[i].upper())
# Lay out once, after all panels and titles exist.
plt.tight_layout()

#### Bivariate Analysis

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds object and datetime columns; pandas >= 2.0 no longer
# drops them silently in DataFrame.corr() and raises instead, so restrict the
# frame to numeric columns explicitly.
sns.set_palette(sns.color_palette('Set1',8))
plt.figure(figsize=(15,7))
numeric_corr=train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,annot=True,vmin=-1,vmax=1,cmap=plt.cm.Blues)
plt.show()

In [None]:
# Distribution of each numeric feature for every patient status, one panel per feature.
plt.figure(figsize=(15,10))
for panel_no,column in enumerate(num_feat,start=1):
    plt.subplot(2,3,panel_no)
    sns.boxplot(data=train,x='Patient_Status',y=column)
    sns.despine(top=True,right=True,left=True)
    plt.tight_layout()
    plt.title(column.upper())

In [None]:
df=train.copy()

In [None]:
df['Age']

In [None]:
# Bin age into 3 equal-width intervals. Read from the untouched numeric
# train['Age'] rather than df['Age'] so the cell can be re-run safely
# (cutting an already-binned column fails).
df['Age']=pd.cut(train['Age'],bins=3)

In [None]:
# Age bins of the patients whose status is 'Dead'.
is_dead=df['Patient_Status']=='Dead'
ls=df.loc[is_dead,'Age']
type(ls)

In [None]:
# NOTE(review): this cell duplicates the earlier binning step. By this point
# df['Age'] already holds interval bins, and pd.cut cannot bin intervals, so
# cut the original numeric train['Age'] instead (same result, safe to re-run).
df['Age']=pd.cut(train['Age'],bins=3)

In [None]:
df['Age'].value_counts()

In [None]:
# Number of patients per age bin, in ascending bin order (matches the label
# order below). Computed from the data instead of hard-coded counts, so the
# chart stays correct if the rows change.
# NOTE(review): the previous literal was [86,177,71] — confirm it matches.
data1=pd.cut(train['Age'],bins=3).value_counts().sort_index().tolist()
Age=['Young','Old','Very Old']
colors=('orange','grey','brown')
explode=(0.1,0,0.3)
wp={'linewidth':1, 'edgecolor':'green'}
def func(pct,allvalues):
    """Format a pie-wedge label as 'percent%' over '(absolute count)'.

    Parameters
    ----------
    pct : float
        Wedge size as a percentage of the whole (0-100), as passed by
        matplotlib's ``autopct`` callback.
    allvalues : sequence of numbers
        All wedge values; their sum is the total the percentage refers to.

    Returns
    -------
    str
        e.g. ``"25.7%\\n(86)"``.
    """
    # Round rather than truncate: pct is a float, so pct/100*total can land at
    # 85.99999 for a true count of 86 and int() alone would report 85.
    absolute=int(round(pct/100.*np.sum(allvalues)))
    return "{:.1f}%\n({:d})".format(pct, absolute)

# Pie chart of the whole population split by age group.
fig,ax= plt.subplots(figsize=(10,7))
wedges, texts, autotexts = ax.pie(data1, 
                                  autopct = lambda pct: func(pct, data1),
                                  explode = explode, 
                                  labels = Age,
                                  shadow = True,
                                  colors = colors,
                                  startangle = 90,
                                  wedgeprops = wp,
                                  textprops = dict(color ="white"))
# This chart shows all patients, not deaths, so the legend is titled
# accordingly (the previous "Deaths as per Age" title belonged to the
# deaths chart).
ax.legend(wedges, Age,
          title ="Population as per Age",
          loc ="center left",
          bbox_to_anchor =(1, 0, 0.5,1))
  
plt.setp(autotexts, size = 8, weight ="bold")
ax.set_title("Distribution of Population as per their Age.")
  
# show plot
plt.show()

In [None]:
ls.value_counts()

In [None]:
# Number of patients with status 'Dead' per age bin, in ascending bin order
# (matches the label order below). Computed from the data instead of
# hard-coded counts.
# NOTE(review): the previous literal was [17,35,14] — confirm it matches.
age_bins=pd.cut(train['Age'],bins=3)
data=age_bins[train['Patient_Status']=='Dead'].value_counts().sort_index().tolist()
Age=['Young','Old','Very Old']
colors=('orange','grey','brown')
explode=(0.1,0,0.3)
wp={'linewidth':1, 'edgecolor':'green'}

In [None]:
def func(pct,allvalues):
    """Format a pie-wedge label as 'percent%' over '(absolute count)'.

    NOTE(review): this definition duplicates an earlier ``func`` in this
    notebook; the later definition is the one in effect from here on.

    Parameters
    ----------
    pct : float
        Wedge size as a percentage of the whole (0-100).
    allvalues : sequence of numbers
        All wedge values; their sum is the total the percentage refers to.

    Returns
    -------
    str
        e.g. ``"25.7%\\n(86)"``.
    """
    # Round rather than truncate so float error (e.g. 85.99999) does not
    # under-report the count by one.
    absolute=int(round(pct/100.*np.sum(allvalues)))
    return "{:.1f}%\n({:d})".format(pct, absolute)

In [None]:
# Pie chart of the patients who died, split by age group.
fig,ax= plt.subplots(figsize=(10,7))
pie_kwargs = dict(
    autopct = lambda pct: func(pct, data),
    explode = explode,
    labels = Age,
    shadow = True,
    colors = colors,
    startangle = 90,
    wedgeprops = wp,
    textprops = {"color": "white"},
)
wedges, texts, autotexts = ax.pie(data, **pie_kwargs)

# Legend sits to the right of the pie.
ax.legend(wedges, Age,
          title ="Deaths as per Age",
          loc ="center left",
          bbox_to_anchor =(1, 0, 0.5,1))

# Make the in-wedge percentage labels small and bold.
plt.setp(autotexts, size = 8, weight ="bold")
ax.set_title("Distribution of people who died as per their Age.")

plt.show()

In [None]:
# df['Age'] holds interval bins, not numbers, so it cannot be passed to
# plt.pie directly; plot the number of rows in each bin instead.
age_counts=df['Age'].value_counts().sort_index()
plt.pie(age_counts,labels=age_counts.index.astype(str),autopct='%1.1f%%')
plt.show()

In [None]:
#train['Age']=pd.cut(train['Age'],bins=3,labels=['Young','Old','Very Old'])

In [None]:
# Disabled: calendar features derived from the surgery date.
# NOTE(review): Series.dt.week was removed in pandas 2.0; if this is re-enabled,
# use train['Date_of_Surgery'].dt.isocalendar().week for the week number.
#train['surgery_year']=train['Date_of_Surgery'].dt.year
#train['surgery_month']=train['Date_of_Surgery'].dt.month
#train['surgery_week']=train['Date_of_Surgery'].dt.week
#train['surgery_day_of_week']=train['Date_of_Surgery'].dt.dayofweek
