# Haberman's Cancer Survival: Exploratory Data Analysis



In [None]:
# importing neccessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Load the cancer survival dataset into the notebook.
# The file does not provide column names, so they are supplied here in the
# same order as the columns appear in the file.

column_names = ['Age', 'Operation_year', 'Axil_Nodes', 'Survival_Status']
cancer = pd.read_csv('../input/habermans-survival-data-set/haberman.csv', names=column_names)

In [None]:
# General information about the dataset: columns, non-null counts and dtypes

cancer.info()

In [None]:
# First five rows of the dataset (head() returns five rows by default)

cancer.head()

## **Observation**

The dataset contains 306 rows and 4 columns describing people who have cancer.
It contains following columns:-
1. Age of the affected people
2. Patient's year of operation (between 1958 - 1970)
3. Number of axillary nodes detected
4. Patient survived after 5 years or not.

In [None]:
# Recode the target variable into a more meaningful categorical column:
# 2 -> 'no' (patient died within 5 years), anything else -> 'yes'.
#
# Values are compared with equality instead of `is`: identity comparison
# against an int literal only works through CPython's small-integer cache and
# triggers a SyntaxWarning on Python 3.8+.
# An already recoded 'no' is also accepted, so re-running this cell leaves the
# labels unchanged instead of turning every row into 'yes'.

cancer['Survival_Status'] = ['no' if status in (2, 'no') else 'yes' for status in cancer['Survival_Status']]

In [None]:
# Inspect the target column after recoding it to 'yes' / 'no'
cancer['Survival_Status']

In [None]:
# Number of patients in each Survival_Status class
cancer['Survival_Status'].value_counts()


# The Survival_Status column is imbalanced, 225 : 81
# 225 patients survived longer than 5 years
# 81 patients died within 5 years

In [None]:
# Summary statistics of the numeric columns of the dataset

cancer.describe()


## **Observation**
    
1. Patients range in age from a minimum of 30 to a maximum of 83, averaging about 52 years.
2. Axillary node counts range from 0 to 52, averaging about 4 nodes.
3. Axillary node counts are spread far more widely between the 75th percentile and the maximum than between the minimum and the 75th percentile.


<h2>Objective</h2>
we are trying to predict the survival status of the patients diagnosed with cancer.
<br>
we have 3 feature variables:
<li> Age of patient at the time of operation
<li> Patient's year of operation
<li> Number of positive axillary nodes detected

and a target variable:
<li>Survival Status of the patient after 5 years
<br>yes = Patient survived longer than 5 years
<br>no = Patient died within 5 years


In [None]:
# Univariate analysis of Age, Operation year and positive axillary nodes:
# one figure per feature, with a separate distribution for each survival class.

sns.set_style('whitegrid')  # the style is global, so it is set once rather than per iteration
for feature in list(cancer.columns)[:-1]:  # every column except the last one (the target)
    # `height` is the current name of FacetGrid's former `size` argument
    # (renamed in seaborn 0.9); `size` is rejected by recent seaborn releases.
    grid = sns.FacetGrid(data=cancer, hue='Survival_Status', height=5)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version in use allows it.
    grid.map(sns.distplot, feature)
    grid.add_legend()


In [None]:
# Distribution plots are effective for visually assessing the data points.
# PDF (probability density function): approximated here by the share of
#   observations that fall into each of 10 histogram bins.
# CDF (cumulative distribution function): the probability of measuring any
#   value up to and including x, computed as the running sum of the PDF.

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, feature in zip(axes, list(cancer.columns)[:-1]):
    print(f'\n########## {feature} ##########')
    count, bin_edges = np.histogram(cancer[feature], bins=10, density=True)
    print(f'Bin Edges: {bin_edges}')
    pdf = count/sum(count)  # normalise so that the bin values add up to 1
    print(f'PDF: {pdf}')
    cdf = np.cumsum(pdf)
    print(f'CDF: {cdf}')
    # Both curves are drawn against the right edge of each bin and labelled,
    # so the legend tells the PDF and the CDF apart.
    ax.plot(bin_edges[1:], pdf, label='PDF')
    ax.plot(bin_edges[1:], cdf, label='CDF')
    ax.set_xlabel(feature)
    ax.legend()

In [None]:
# Box plots: one panel per feature, each split by survival status

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.set_style('whitegrid')
feature_names = list(cancer.columns)[:-1]  # every column except the target
for panel, column in zip(axes, feature_names):
    sns.boxplot(data=cancer, x='Survival_Status', y=column, ax=panel)

In [None]:
# Violin plots: one panel per feature, each split by survival status

fig, axes = plt.subplots(1, 3, figsize=(15,5))
sns.set_style('whitegrid')
feature_names = list(cancer.columns)[:-1]  # every column except the target
for panel, column in zip(axes, feature_names):
    sns.violinplot(data=cancer, x='Survival_Status', y=column, ax=panel)

In [None]:
# Percentage of all patients with axillary nodes less than or equal to 10 who survived

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
anp = len(cancer[(cancer['Axil_Nodes'] <= 10) & (cancer['Survival_Status'] == 'yes')])
p = anp/len(cancer['Axil_Nodes'])*100
print(p)

In [None]:
# Percentage of all patients with axillary nodes greater than 10 who survived

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
anp = len(cancer[(cancer['Axil_Nodes'] > 10) & (cancer['Survival_Status'] == 'yes')])
p = anp/len(cancer['Axil_Nodes'])*100
print(p)

In [None]:
# Percentage of all patients with axillary nodes less than or equal to 10 who could not survive

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
anp = len(cancer[(cancer['Axil_Nodes'] <= 10) & (cancer['Survival_Status'] == 'no')])
p = anp/len(cancer['Axil_Nodes'])*100
print(p)

In [None]:
# Percentage of all patients with axillary nodes greater than 10 who could not survive

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
anp = len(cancer[(cancer['Axil_Nodes'] > 10) & (cancer['Survival_Status'] == 'no')])
p = anp/len(cancer['Axil_Nodes'])*100
print(p)

In [None]:
# Percentage of all patients aged between 40 and 60 who survived.
# Both bounds are exclusive: patients aged exactly 40 or 60 are not counted.

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
sp = len(cancer[(cancer['Age'] > 40) & (cancer['Age'] < 60) & (cancer['Survival_Status'] == 'yes')])
abc = (sp/len(cancer['Age']))*100
print(abc)

In [None]:
# Percentage of all patients aged between 40 and 60 who could not survive.
# Both bounds are exclusive: patients aged exactly 40 or 60 are not counted.

# The matching rows are counted with len(). The previous
# int(df.count().unique()) only worked while every column had the same
# non-null count and relied on NumPy's deprecated array-to-scalar conversion.
sp = len(cancer[(cancer['Age'] > 40) & (cancer['Age'] < 60) & (cancer['Survival_Status'] == 'no')])
abc = (sp/len(cancer['Age']))*100
print(abc)

## **Observation**

1. Most of the affected patients are between 40 and 60 years old.
2. The years 1963-1966 had more affected cases.
3. Almost 19% of all patients had 10 or fewer axillary nodes but did not survive longer than 5 years.



In [None]:
# Bivariate analysis
# A pair plot shows the pair-wise relationships across the entire dataset,
# coloured by survival status

sns.pairplot(cancer, hue='Survival_Status')

In [None]:
# Joint kernel density estimate of patient age against axillary node count
sns.jointplot(data=cancer, x='Age', y='Axil_Nodes', kind='kde')

## **Conclusion**

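1. The feature columns <b>Age</b> and <b>Axil_Nodes</b> are much better for predicting the survival of the patients.
2. The number of deaths was highest among patients aged between 40 and 60.
3. Patients with fewer than 10 detected axillary nodes appeared to have a lower chance of surviving than patients with a higher number of detected nodes. Caveat: the percentages computed above are shares of all patients rather than survival rates within each group, so this comparison should be re-checked with within-group rates.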

