# EDA on Haberman Dataset

## About the Dataset
The dataset contains cases from a study that was conducted between 1958 and 1970 
at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery 
for breast cancer.

Attributes:

1. Age of patient at time of operation
2. Patient's year of operation
3. Number of positive axillary nodes detected
4. Survival status (class attribute)
      1. 1= the patient survived 5 years or longer
      2. 2= the patient died within 5 years


## Objective:
To explore which features help predict whether a patient will survive 5 years or longer after breast cancer surgery

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import kde


## 1. Loading Data

In [None]:
# Column labels to assign, in file order (the CSV is read with header=None).
column_name = ['Age', 'Operation_Year', 'axil_nodes', 'Surv_status']
# Load the Haberman data into a DataFrame using the labels above.
surv_data = pd.read_csv(
    '../input/haberman.csv',
    names=column_name,
    header=None,
)

In [None]:
# Preview the loaded data: head() with no argument shows the first 5 rows
surv_data.head()

In [None]:
# (rows, columns) of the frame: number of data points and number of columns
# (the columns are the features plus the Surv_status class label)
surv_data.shape

In [None]:
# Number of rows per class label in Surv_status
surv_data['Surv_status'].value_counts() # the two classes are imbalanced

In [None]:
# Share of patients in class 1 (survived 5 years or longer), derived from the
# loaded data instead of hard-coded counts so it cannot drift from the file.
survived_count = int((surv_data['Surv_status'] == 1).sum())
print('Total patient survived >= 5 years: {}%'.format((survived_count / len(surv_data)) * 100))


## Observation
1. The dataset has 306 data points and 3 features: Age, Operation_Year and axil_nodes (plus the class label Surv_status)
2. The dataset is imbalanced
3. Out of 306 patients, 81 survived less than 5 years and the remaining 225 survived 5 years or longer

In [None]:
# Class 1 subset: patients who survived 5 years or longer,
# followed by its summary statistics
surv_more = surv_data.loc[surv_data['Surv_status'].eq(1)]
surv_more.describe()

In [None]:
# Class 2 subset: patients who died within 5 years,
# followed by its summary statistics
surv_less = surv_data.loc[surv_data['Surv_status'].eq(2)]
surv_less.describe()

## Observation
1. The mean Age and mean Operation_Year of the two classes are almost the same, while the mean axil_nodes of the two classes differs by ~5 units
2. axil_nodes is lower for patients who survived than for patients who did not survive

# 2. Univariate analysis

Using histograms and PDFs to find a feature that is suitable for separating the two survival classes

In [None]:

# AGE
# Overlaid per-class distribution of Age, one colour per Surv_status value.
# `height` replaces the old `size` keyword, which current seaborn rejects.
sns.FacetGrid(surv_data, hue='Surv_status', height=5).map(sns.distplot, 'Age').add_legend()
plt.show()

In [None]:
# OPERATION YEAR
# Overlaid per-class distribution of Operation_Year, one colour per Surv_status value.
# `height` replaces the old `size` keyword, which current seaborn rejects.
sns.FacetGrid(surv_data, hue='Surv_status', height=4).map(sns.distplot, 'Operation_Year').add_legend()
plt.show()

In [None]:
# AXIL NODES
# Overlaid per-class distribution of axil_nodes, one colour per Surv_status value.
# `height` replaces the old `size` keyword, which current seaborn rejects.
sns.FacetGrid(surv_data, hue='Surv_status', height=4).map(sns.distplot, 'axil_nodes').add_legend()
plt.show()

# Observation:
1. There is a large overlap between the two classes in all three plots
2. The PDF of axil_nodes for patients who survived 5 years or longer is more concentrated near 0


In [None]:
# PDF and CDF of axil_nodes for each survival class.
# Each class is binned into 10 bins; the bin counts are normalised to
# fractions (PDF) and accumulated (CDF), and both are drawn against the
# right-hand bin edges.
for group, group_label in ((surv_more, 'surv >= 5 years'), (surv_less, 'surv < 5 years')):
    counts, bin_edges = np.histogram(group['axil_nodes'], bins=10)
    pdf = counts / counts.sum()
    print(pdf)
    print(bin_edges)
    cdf = np.cumsum(pdf)
    # Label the PDF curves as well, so the legend identifies all four lines.
    plt.plot(bin_edges[1:], pdf, label='pdf: ' + group_label)
    plt.plot(bin_edges[1:], cdf, label='cdf: ' + group_label)
plt.xlabel('axil_nodes')
plt.legend()
plt.show()



In [None]:
# PDF and CDF of Age for each survival class.
# Each class is binned into 10 bins; the bin counts are normalised to
# fractions (PDF) and accumulated (CDF), and both are drawn against the
# right-hand bin edges.
for group, group_label in ((surv_more, 'surv >= 5 years'), (surv_less, 'surv < 5 years')):
    counts, bin_edges = np.histogram(group['Age'], bins=10)
    pdf = counts / counts.sum()
    print(pdf)
    print(bin_edges)
    cdf = np.cumsum(pdf)
    # Label the PDF curves as well, so the legend identifies all four lines.
    plt.plot(bin_edges[1:], pdf, label='pdf: ' + group_label)
    plt.plot(bin_edges[1:], cdf, label='cdf: ' + group_label)
plt.xlabel('Age')
plt.legend()
plt.show()

In [None]:
# PDF and CDF of Operation_Year for each survival class.
# Each class is binned into 10 bins; the bin counts are normalised to
# fractions (PDF) and accumulated (CDF), and both are drawn against the
# right-hand bin edges.
for group, group_label in ((surv_more, 'surv >= 5 years'), (surv_less, 'surv < 5 years')):
    counts, bin_edges = np.histogram(group['Operation_Year'], bins=10)
    pdf = counts / counts.sum()
    print(pdf)
    print(bin_edges)
    cdf = np.cumsum(pdf)
    # Label the PDF curves as well, so the legend identifies all four lines.
    plt.plot(bin_edges[1:], pdf, label='pdf: ' + group_label)
    plt.plot(bin_edges[1:], cdf, label='cdf: ' + group_label)
plt.xlabel('Operation_Year')
plt.legend()
plt.show()

## Observation
1. From the CDF plot of axil_nodes, 83.55% of the patients who survived 5 years or longer had axil_nodes in the range 0-4.6
2. From the CDF plot of Age, approx. 84% of the patients who survived longer were aged between 30 and 48.8

In [None]:
# axil_nodes per survival class: box plot in the left panel,
# violin plot in the right panel of a 1x2 grid
for panel, draw in enumerate((sns.boxplot, sns.violinplot), start=1):
    axis = plt.subplot(1, 2, panel)
    draw(x='Surv_status', y='axil_nodes', data=surv_data, ax=axis)
plt.show()

In [None]:
# Age per survival class: box plot in the left panel,
# violin plot in the right panel of a 1x2 grid
for panel, draw in enumerate((sns.boxplot, sns.violinplot), start=1):
    axis = plt.subplot(1, 2, panel)
    draw(x='Surv_status', y='Age', data=surv_data, ax=axis)
plt.show()

In [None]:
# Operation_Year per survival class: box plot in the left panel,
# violin plot in the right panel of a 1x2 grid
for panel, draw in enumerate((sns.boxplot, sns.violinplot), start=1):
    axis = plt.subplot(1, 2, panel)
    draw(x='Surv_status', y='Operation_Year', data=surv_data, ax=axis)
plt.show()

# Multivariate analysis

Using a pair plot to identify the pair of features that separates the two classes best

In [None]:
# Pair plot
# Pairwise plots of the three features, coloured by Surv_status.
# `height` replaces the old `size` keyword, which current seaborn rejects.
plt.close()
sns.set_style('whitegrid')
sns.pairplot(surv_data, hue='Surv_status', vars=['Age', 'Operation_Year', 'axil_nodes'], height=3)
plt.show()

# Observation
1.  The scatter plot of axil_nodes vs. Operation_Year separates the two classes comparatively better than the other plots

In [None]:
# 2-D density (KDE) of axil_nodes against Age, restricted to surv_more
sns.jointplot(
    data=surv_more,
    x='axil_nodes',
    y='Age',
    kind='kde',
)
plt.show()

In [None]:
# 2-D density (KDE) of Operation_Year against axil_nodes, restricted to surv_more
plt.close()
sns.jointplot(
    data=surv_more,
    x='Operation_Year',
    y='axil_nodes',
    kind='kde',
)
plt.show()

## Final Conclusion
1. We can use axil_nodes to identify the class
2. Patients who survived mostly have lower axil_nodes values
3. Younger patients have a higher chance of survival