### EDA of Haberman's Survival Data
https://www.kaggle.com/gilsousa/habermans-survival-data-set/data

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the Haberman survival data and label its four columns explicitly.
# NOTE(review): passing `names` assumes the CSV has no header row — confirm.
data = pd.read_csv(
    '../input/haberman.csv',
    names=['age', 'operation_year', 'positive_nodes', 'survival_status'],
)

In [None]:
# Preview the first five rows (five is the default row count of head()).
data.head()

In [None]:
# Dimensions of the data set as (number of rows, number of columns).
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Column labels that were assigned when the file was loaded.
data.columns

#### Unique values of fields

In [None]:
# Show the distinct values of every column, one labelled block per field.
for heading, column in (
    ("age\n", "age"),
    ("operation year\n", "operation_year"),
    ("survival status\n", "survival_status"),
    ("positive nodes\n", "positive_nodes"),
):
    print(heading, data[column].unique())

#### The unique values show that survival_status is a categorical field with two values, 1 and 2
- 1 = the patient survived 5 years or longer
- 2 = the patient died within 5 years

## Our objective is to find out how patient survival relates to age, the operation year and the number of positive axillary nodes detected

In [None]:
# Number of patients in each survival class.
data['survival_status'].value_counts()

## Dividing survival_status data

In [None]:
# Split the patients by outcome: status 1 = survived, status 2 = did not survive.
survived_data = data[data['survival_status'].eq(1)]
non_survived_data = data[data['survival_status'].eq(2)]

In [None]:
# The 12 most frequent ages (value_counts sorts by descending count by default).
data['age'].value_counts().head(12)

In [None]:
# Number of operations per year, most frequent year first (the default order).
data['operation_year'].value_counts()

In [None]:
# The 12 most frequent positive-node counts (descending count is the default order).
data['positive_nodes'].value_counts().head(12)

### Pair Plot

In [None]:
# Scatter-plot every pair of features, coloured by survival status.
plt.close()  # close the current figure, if any, before drawing a new one
sns.set_style('whitegrid')
# `height` is the current name of the per-facet size argument; the older
# `size` keyword is no longer accepted by recent seaborn releases.
sns.pairplot(
    data,
    hue='survival_status',
    vars=['age', 'operation_year', 'positive_nodes'],
    height=4,
)
plt.show()

### None of the feature pairs separates the two survival classes cleanly. The points are mostly intermixed, so we can't decide anything based on these plots alone.

### Analysis of operation year and survival status

In [None]:
# Distribution of operation year for each survival class, overlaid on one axis.
sns.set_style('whitegrid')
# `height` replaces the old `size` keyword, which recent seaborn releases reject.
# NOTE(review): sns.distplot is deprecated in newer seaborn; consider
# sns.histplot(..., kde=True, stat='density') when upgrading.
plot = (
    sns.FacetGrid(data, hue='survival_status', height=4)
    .map(sns.distplot, 'operation_year', bins=10)
    .add_legend()
)
plot.set_axis_labels('operation year', 'density')

#### Here the survived people and unsurvived people's data are overlapping each other, so let's use boxplot

In [None]:
# Box plot of operation year, one box per survival class.
sns.boxplot(
    data=data,
    x='survival_status',
    y='operation_year',
)

#### From the box plot we can only tentatively say that most patients who did not survive were operated on between 1959 and 1965 and that the death rate dropped after 1963, while most patients who survived were operated on between 1960 and 1966.
#### Does this mean that operations got better after 1960?

In [None]:
# Violin plot of operation year, one violin per survival class.
sns.violinplot(
    data=data,
    x='survival_status',
    y='operation_year',
)

#### The violin plot shows that, among the patients who did not survive, operations are most concentrated between 1961 and 1965 and become less frequent afterwards.
#### Among the patients who survived, the density is highest around 1960 and then drops slowly and steadily.

In [None]:
# Survivors (survival_status == 1) split at operation year 60:
# strictly before 60 versus 60 or later.
operation_year_less_than_60_who_survived = data[
    (data.survival_status == 1) & (data.operation_year < 60)
]
operation_year_greater_than_60_who_survived = data[
    (data.survival_status == 1) & (data.operation_year >= 60)
]

In [None]:
# Non-survivors (survival_status == 2) split at operation year 60:
# strictly before 60 versus 60 or later.
operation_year_less_than_60_who_did_not_survive = data[
    (data.survival_status == 2) & (data.operation_year < 60)
]
operation_year_greater_than_60_who_did_not_survive = data[
    (data.survival_status == 2) & (data.operation_year >= 60)
]

In [None]:
# Report the size of each survival-status / operation-year group.
# The "greater_than_60" frames are built with `operation_year >= 60`, so the
# messages say "60 or later" to match the rows that are actually counted.
print("The total people who survived for the operation year less than 60 is %d"% operation_year_less_than_60_who_survived.operation_year.count())
print("The total people who survived for the operation year 60 or later is %d"% operation_year_greater_than_60_who_survived.operation_year.count())
print("The total people who did not survive for the operation year less than 60 is %d"% operation_year_less_than_60_who_did_not_survive.operation_year.count())
print("The total people who did not survive for the operation year 60 or later is %d"% operation_year_greater_than_60_who_did_not_survive.operation_year.count())

#### Of all patients, 60% survived and were operated on in 1960 or later, and 14% survived and were operated on before 1960.
#### Of all patients, 20% did not survive and were operated on in 1960 or later, and 7% did not survive and were operated on before 1960.
#### Survival was therefore somewhat more common among patients operated on in 1960 or later, which could mean that medical facilities got better after 1960.

#### Overall, 73% of the patients survived and 27% did not.

### Analysis of age and survival status

In [None]:
# Distribution of age for each survival class, overlaid on one axis.
sns.set_style('whitegrid')
# `height` replaces the old `size` keyword, which recent seaborn releases reject.
# NOTE(review): sns.distplot is deprecated in newer seaborn; consider
# sns.histplot(..., kde=True, stat='density') when upgrading.
plot = (
    sns.FacetGrid(data, hue='survival_status', height=4)
    .map(sns.distplot, 'age', bins=10)
    .add_legend()
)
plot.set_axis_labels('age', 'density')

#### Here also the survived people and unsurvived people's data are overlapping each other, so let's use boxplot

In [None]:
# Box plot of age, one box per survival class.
sns.boxplot(
    data=data,
    x='survival_status',
    y='age',
)

#### From the box plot we can only tentatively say that more patients died once they were past 45 years of age.

In [None]:
# Violin plot of age, one violin per survival class.
sns.violinplot(
    data=data,
    x='survival_status',
    y='age',
)

#### The violin plot here shows that the patients who did not survive are most concentrated around the age of 50.
#### The patients who survived are most concentrated around the age of 53.

In [None]:
# Survivors (survival_status == 1) split at age 52:
# younger than 52 versus 52 or older.
age_less_than_52_who_survived = data[
    (data.survival_status == 1) & (data.age < 52)
]
age_greater_than_52_who_survived = data[
    (data.survival_status == 1) & (data.age >= 52)
]

In [None]:
# Non-survivors (survival_status == 2) split at age 52:
# younger than 52 versus 52 or older.
age_less_than_52_who_did_not_survive = data[
    (data.survival_status == 2) & (data.age < 52)
]
age_greater_than_52_who_did_not_survive = data[
    (data.survival_status == 2) & (data.age >= 52)
]

In [None]:
# Report the size of each survival-status / age group.
# The "greater_than_52" frames are built with `age >= 52`, so the messages
# say "52 or more" to match the rows that are actually counted.
print("The total people who survived for the age less than 52 is %d"% age_less_than_52_who_survived.age.count())
print("The total people who survived for the age 52 or more is %d"% age_greater_than_52_who_survived.age.count())
print("The total people who did not survive for the age less than 52 is %d"% age_less_than_52_who_did_not_survive.age.count())
print("The total people who did not survive for the age 52 or more is %d"% age_greater_than_52_who_did_not_survive.age.count())

#### Of all patients, 35% survived and were younger than 52, and 39% survived and were 52 or older.
#### Of all patients, 11% did not survive and were younger than 52, and 15% did not survive and were 52 or older.
#### So about 26% of the patients (11% + 15%) did not survive.

#### It is clear that age is not the factor to decide if a patient survived or not

### Analysis of positive axillary nodes and survival status

In [None]:
# Distribution of positive axillary nodes for each survival class, overlaid on one axis.
sns.set_style('whitegrid')
# `height` replaces the old `size` keyword, which recent seaborn releases reject.
# NOTE(review): sns.distplot is deprecated in newer seaborn; consider
# sns.histplot(..., kde=True, stat='density') when upgrading.
plot = (
    sns.FacetGrid(data, hue='survival_status', height=4)
    .map(sns.distplot, 'positive_nodes', bins=10)
    .add_legend()
)
# Axis label spelled as two words, matching the 'operation year' label style.
plot.set_axis_labels('positive nodes', 'density')

#### Can't see any difference except survival status has more values

In [None]:
# Box plot of positive axillary nodes, one (wide) box per survival class.
sns.boxplot(
    data=data,
    x='survival_status',
    y='positive_nodes',
    width=0.9,
)

#### The plot above is the only one that shows a visible difference between the two survival classes, here in the number of positive axillary nodes detected.
#### Most patients who survived had fewer than 5 positive axillary nodes.

In [None]:
# Survivors (survival_status == 1) split at 4 positive nodes:
# fewer than 4 versus 4 or more.
nodes_less_than_4_who_survived = data[
    (data.survival_status == 1) & (data.positive_nodes < 4)
]
nodes_greater_than_4_who_survived = data[
    (data.survival_status == 1) & (data.positive_nodes >= 4)
]

In [None]:
# Non-survivors (survival_status == 2) split at 4 positive nodes:
# fewer than 4 versus 4 or more.
nodes_less_than_4_who_did_not_survive = data[
    (data.survival_status == 2) & (data.positive_nodes < 4)
]
nodes_greater_than_4_who_did_not_survive = data[
    (data.survival_status == 2) & (data.positive_nodes >= 4)
]

In [None]:
# Report the size of each survival-status / positive-node group.
# The "greater_than_4" frames are built with `positive_nodes >= 4`, so the
# messages say "4 or more" to match the rows that are actually counted.
print("The total people who survived for the axillary nodes less than 4 is %d"% nodes_less_than_4_who_survived.positive_nodes.count())
print("The total people who survived for the axillary nodes 4 or more is %d"% nodes_greater_than_4_who_survived.positive_nodes.count())
print("The total people who did not survive for the axillary nodes less than 4 is %d"% nodes_less_than_4_who_did_not_survive.positive_nodes.count())
print("The total people who did not survive for the axillary nodes 4 or more is %d"% nodes_greater_than_4_who_did_not_survive.positive_nodes.count())

#### Patients who survived and had fewer than 4 positive axillary nodes alone make up almost 59% of the data.
#### This means that the number of positive axillary nodes can also be used to help decide whether a patient survived or not.

## The final conclusion is that the patients who survived were mostly operated on in 1960 or later and had fewer than 4 positive axillary nodes.