In [None]:
# Loading library.
import numpy as np
import pandas as pd

# for visualization.
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply matplotlib's built-in 'tableau-colorblind10' style sheet to every
# figure created from here on.
plt.style.use('tableau-colorblind10')

In [None]:
# Read the BRCA dataset from CSV and preview the first rows.
csv_path = "../input/breastcancerdataset/BRCA.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# Print a frequency table for every object-dtype column, separated by a rule.
separator = "---" * 10
for column_name in data.select_dtypes(include=object).columns:
    print(separator)
    print(data[column_name].value_counts())

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Show the rows whose Patient_ID is missing.
data[data['Patient_ID'].isna()]

In [None]:
# Drop rows in which every column is NaN (how='all' on the default axis=0
# removes rows, not columns); rows with only some values missing are kept.
data.dropna(how='all', inplace=True)

In [None]:
# Check the (rows, columns) shape again now that the all-NaN rows are gone.
data.shape

In [None]:
# Convert both date columns from text to datetime.
# "%d-%b-%y" matches values such as 15-Jan-17 (day, abbreviated month, 2-digit year).
for date_col in ('Date_of_Surgery', 'Date_of_Last_Visit'):
    data[date_col] = pd.to_datetime(data[date_col], format="%d-%b-%y")

In [None]:
# Preview the first rows after the date conversion.
data.head()

In [None]:
# Preview the last rows after the date conversion.
data.tail()

In [None]:
# Bucket patient ages into decade-style groups.
data['Age'] = data['Age'].astype(int)

# Edges are 0, 20, 30, ..., 100; note that the first bucket [0, 20) carries the
# label '10s'.
bins = [0] + list(range(20, 101, 10))
labels = [f"{decade}s" for decade in range(10, 100, 10)]

# right=False makes each interval closed on the left, e.g. [20, 30) -> '20s'.
data['AgeGrp'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

In [None]:
# Count of patients per age group, split by gender.
plt.figure(figsize=(14, 8))
# `saturation` is the proportion of the original colour saturation to draw
# with; 1 keeps the palette colours unchanged. (1.2 is outside the 0-1 range.)
sns.countplot(x='AgeGrp', hue='Gender', data=data, saturation=1)

plt.title("Age Count by Gender.")
plt.show()

Breast cancer is more common in females than in males.

In [None]:
# Examine Proteins values: mean of each protein column per tumour stage.
# Select the protein columns *before* aggregating. Calling .mean() on the whole
# grouped frame also tries to average the string, datetime and categorical
# columns, which raises TypeError on pandas >= 2.0 (numeric_only defaults to
# False there).
protein_cols = ['Protein1', 'Protein2', 'Protein3', 'Protein4']
a = data.groupby(by=['Tumour_Stage'])[protein_cols].mean()
a

In [None]:
# Bar chart of the per-stage protein means computed above.
a.plot.bar(
    figsize=(14, 8),
    title="Average of different Proteins by Tumour stage",
)
plt.show()

Proteins are detected in tissue samples during testing. Protein 2 is common to all stages of breast cancer.

In [None]:
# Count of non-null Patient_ID values per (Tumour_Stage, Histology) pair,
# kept as a one-column DataFrame.
a = data.groupby(['Tumour_Stage', 'Histology'])['Patient_ID'].count().to_frame()
a

In [None]:
# Grouped bars: unstack moves the inner index level into columns so each
# outer-level value gets one cluster of bars.
a.unstack().plot.bar(
    figsize=(14, 8),
    title="Most Common types of Breast Cancer",
)
plt.show()

The most common type of breast cancer at every stage is **Infiltrating Ductal Carcinoma**. There are no observations of Mucinous Carcinoma in the third stage of cancer.

In [None]:
# Count of non-null Patient_ID values per (Surgery_type, Histology) pair.
a = data.groupby(['Surgery_type', 'Histology'])['Patient_ID'].count().to_frame()

# Grouped bar chart: one cluster per surgery type, one bar per histology.
a.unstack().plot.bar(
    figsize=(14, 8),
    title='Most common Surgery by Histology(Cancer Type)',
)
plt.show()

The surgery type recorded as **Other** is not specified, yet it occurs more often than any other category. After **Other**, the most common type of surgery is **Modified Radical Mastectomy** for all types of tumour.

In [None]:
# Patient_Status observations: count of non-null Patient_ID values per
# (Surgery_type, Patient_Status) pair, kept as a one-column DataFrame.
a = data.groupby(['Surgery_type', 'Patient_Status'])['Patient_ID'].count().to_frame()
a

In [None]:
# Grouped bars: unstack moves the inner index level into columns, giving one
# cluster per outer-level value.
a.unstack().plot(kind='bar', figsize=(14, 8))

# Title spelling corrected ("Observation").
plt.title("Observation of Patient Status")
plt.show()

In [None]:
# Patient status against age. The y coordinate is simply the DataFrame's row
# index, so only the horizontal position and the colour carry data.
plt.figure(figsize=(14, 8))
sns.scatterplot(
    data=data,
    x='Age',
    y=data.index,
    hue='Patient_Status',
    ax=plt.gca(),
)
plt.show()

In [None]:
# Percentage of each surgery type's patients falling into each patient status.
status_counts = data.groupby(by=['Surgery_type', 'Patient_Status'])['Patient_ID'].count()
surgery_totals = data.groupby(by=['Surgery_type'])['Patient_ID'].count()

# The division aligns on the shared 'Surgery_type' index level.
a = ((status_counts / surgery_totals) * 100).round(2).to_frame()
a

In [None]:
# One pie per surgery type on a 2x2 grid; zip stops at whichever runs out
# first, the four axes or the surgery types.
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

surgery_types = a.index.levels[0]
for pie_ax, surgery in zip(axs.ravel(), surgery_types):
    # xs() selects the rows of `a` for this first-level index value.
    a.xs(surgery).plot(kind='pie', subplots=True, ax=pie_ax, xlabel=surgery, sharex=True)

plt.show()