In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available in the Kaggle input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

## High level statistics of the dataset: number of points, number of features, number of classes, data-points per class.

In [None]:
# Load the Haberman survival dataset.
# The column labels used throughout this notebook ('30', '64', '1', '1.1') are
# the values of the file's first record, which a default read_csv promotes to a
# header (the duplicated '1' is what gets mangled to '1.1'). Reading with
# header=None keeps that record as data; passing the same labels via `names`
# means every later cell that indexes by '30', '64', '1' or '1.1' still works.
haberman = pd.read_csv(
    '../input/haberman.csv',
    header=None,
    names=['30', '64', '1', '1.1'],
)
print('Number of points, number of attributes: {}'.format(haberman.shape))

In [None]:
# Print the raw column labels, followed by a legend mapping each label to the
# attribute it stands for.
print('Features and classes: {}'.format(haberman.columns))
column_legend = (
    '30 : Age',
    '64 : Op_Year',
    '1 : axil_nodes_det',
    '1.1 : Surv_status (Class attribute) 1 = the patient survived 5 years or longer; 2 = the patient died within 5 year',
)
for legend_line in column_legend:
    print(legend_line)

In [None]:
# Background note on what a "positive axillary lymph node" is; printed as a
# heading line followed by the explanation.
domain_notes = (
    'Domain Knowledge:',
    'Positive axillary lymph node: A positive axillary lymph node is a lymph node in the area of the armpit (axilla) to which cancer has spread. This spread is determined by surgically removing some of the lymph nodes and examining them under a microscope to see whether cancer cells are present.',
)
print(*domain_notes, sep='\n')

In [None]:
# Tally how many patients fall into each survival class (column '1.1') and
# print the tally under a heading.
count = haberman['1.1'].value_counts()
print('Count of survived(1)/unsurvived(2) patients:', count, sep='\n')

## Objective: Classify whether a new patient will survive(1) or die within 5 years(2) given the 3 features: age, Op_year and axil_nodes_det.

## <u>Univariate analysis</u>

### Histogram and PDF: 30(Age)

In [None]:
# Overlay, per survival class ('1.1'), the distribution plot of column '30' (age).
age_grid = sns.FacetGrid(haberman, hue='1.1', height=5)
age_grid.map(sns.distplot, '30')
age_grid.add_legend()
plt.show()

The PDF for the age of patients shows quite a bit of overlap between the survived(blue) patients and the patients who died(orange). It's hard to come up with any pattern/model out of this distribution. Let's see what the Op_Year feature tells us.

### Histogram and PDF: 64(Op_Year)

In [None]:
# Overlay, per survival class ('1.1'), the distribution plot of column '64'
# (year of operation).
op_year_grid = sns.FacetGrid(haberman, hue='1.1', height=5)
op_year_grid.map(sns.distplot, '64')
op_year_grid.add_legend()
plt.show()

Well, even the PDF for the feature Op_Year shows a huge overlap between the survived(1) and unsurvived(2) patients. But if we go a little off track here, the distribution of unsurvived patients shows an interesting fact: there is a dip from the year 59 to around year 61, and then it increases again from 61 to around 65. So, for some reason, there seems to be a decline in the death rate for the period 1959 to 1961.

### Histogram and PDF: 1(axil_nodes_det)

In [None]:
# Overlay, per survival class ('1.1'), the distribution plot of column '1'
# (number of positive axillary nodes detected).
nodes_grid = sns.FacetGrid(haberman, hue='1.1', height=5)
nodes_grid.map(sns.distplot, '1')
nodes_grid.add_legend()
plt.show()

Once again, when using the number of positive axillary nodes as the feature, we still get a big overlap, which makes it difficult to build any model using just this variable. But looking at the individual plots, both of them appear to be skewed.

So, it says that the count of patients who survived is higher when the number of positive axillary nodes detected is lower.
But the same goes for the unsurvived patients: the count seems to be higher when the number of axillary nodes detected is lower, but this distribution is a _little_ less skewed compared with the distribution for survived patients.

### PDF and CDF: 30(Age)

In [None]:
# Binned PDF and CDF of column '30' (age), one pair of curves per survival class.
# Classes are drawn in the original order (1, then 2; PDF before CDF) so each
# curve keeps the colour it had before; labels and a legend are added so the
# four curves can be told apart without relying on the colour cycle.
for status, status_name in ((1, 'survived'), (2, 'died')):
    values = haberman.loc[haberman['1.1'] == status, '30']
    # Raw bin counts divided by their total give the probability mass per bin
    # (same result as normalising the density over equal-width bins).
    hist, bin_edges = np.histogram(values)
    pdf = hist / hist.sum()
    cdf = np.cumsum(pdf)
    # Curves are plotted against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf, label='PDF ({})'.format(status_name))
    plt.plot(bin_edges[1:], cdf, label='CDF ({})'.format(status_name))

plt.xlabel('30 (Age)')
plt.legend()
plt.grid(True)
plt.show()

The CDF of the feature age shows that only 5% of the patients who died(red) were 40 years old or younger, while around 18% of the patients who survived(orange) are in the same age group.
Moving on, around 21% of the patients who died are 45 years old or younger, while around 30% of the patients who survived are in the same age group.
The two curves meet at the age of around 48 years at a value of around 36%, i.e. around 36% of the patients who survived and 36% of the patients who died are 48 years old or younger. Beyond that there is considerable overlap.

Comparing the PDFs, there seems to be a higher probability of survival(blue) than of death(green) if the age of the patient is less than 44. There is a higher probability of death than of survival for the age bracket 44 to 56. For the rest of the age values the probabilities more or less overlap.

### PDF and CDF: 64(Op_Year)

In [None]:
# Binned PDF and CDF of column '64' (year of operation), one pair of curves per
# survival class. Classes are drawn in the original order (1, then 2; PDF before
# CDF) so each curve keeps the colour it had before; labels and a legend are
# added so the four curves can be told apart.
for status, status_name in ((1, 'survived'), (2, 'died')):
    values = haberman.loc[haberman['1.1'] == status, '64']
    # Raw bin counts divided by their total give the probability mass per bin
    # (same result as normalising the density over equal-width bins).
    hist, bin_edges = np.histogram(values)
    pdf = hist / hist.sum()
    cdf = np.cumsum(pdf)
    # Curves are plotted against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf, label='PDF ({})'.format(status_name))
    plt.plot(bin_edges[1:], cdf, label='CDF ({})'.format(status_name))

plt.xlabel('64 (Op_Year)')
plt.legend()
plt.grid(True)
plt.show()

### PDF and CDF: 1(axil_nodes_det)

In [None]:
# Binned PDF and CDF of column '1' (positive axillary nodes detected), one pair
# of curves per survival class. Classes are drawn in the original order (1, then
# 2; PDF before CDF) so each curve keeps the colour it had before; labels and a
# legend are added so the four curves can be told apart.
for status, status_name in ((1, 'survived'), (2, 'died')):
    values = haberman.loc[haberman['1.1'] == status, '1']
    # Raw bin counts divided by their total give the probability mass per bin
    # (same result as normalising the density over equal-width bins).
    hist, bin_edges = np.histogram(values)
    pdf = hist / hist.sum()
    cdf = np.cumsum(pdf)
    # Curves are plotted against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf, label='PDF ({})'.format(status_name))
    plt.plot(bin_edges[1:], cdf, label='CDF ({})'.format(status_name))

plt.xlabel('1 (axil_nodes_det)')
plt.legend()
# Explicit True: a bare plt.grid() toggles the current grid state instead of
# switching it on, unlike the other PDF/CDF cells.
plt.grid(True)
plt.show()

Again there is quite a bit of overlap between the PDF and CDF distributions for the feature axillary nodes detected.

If we compare the PDFs of survived(blue) and died(green), one interesting observation is that there is a higher probability of survival than of death if the number of axillary nodes detected is below 8. But if the number of axillary nodes detected is between 8 and 31, the probability of death is higher than the probability of survival.
Beyond a count of 31, there seems to be a reasonable overlap.

The CDF distribution of survived(orange) and died(red) says that around 82% of the people who survived were found with 5 or fewer axillary nodes, while only around 57% of the people who died had the same number of axillary nodes.
Similarly, 90% of survived patients had an axillary node count equal to or less than 10, while 70% of the patients who died had the same count.
The distribution continues in a similar fashion until the two curves meet at an axillary count of around 26, where around 99% of both the survived and died patients have an axillary node count equal to or less than 26. After that there is considerable overlap.

Studying the PDFs and CDFs of all the features, the feature operation year seems to be the least useful. The feature age and the feature axillary node count both also show quite a bit of overlap. If I had to choose one, I would go with the feature axillary node count and would write a model based on the observation of its PDF distribution.

### Boxplot: 30(Age)

In [None]:
# Box plot of column '30' (age), grouped by survival class ('1.1').
_, age_box_ax = plt.subplots()
sns.boxplot(data=haberman, x='1.1', y='30', ax=age_box_ax)
plt.show()

The above boxplot says that the 25th percentile value of age for the survived patients is around 42 years old. The 50th percentile(median) value is 51 years old. And the 75th percentile value is 60 years old. The minimum age of a survived patient is 30 years old and the maximum age is 76 years old.

Similarly 25th percentile value of age for unsurvived patients is around 45 years old. 50th percentile(median) value is around 52. And 75th percentile value is 61. Minimum age for unsurvived patients is around 34 years old and maximum age for unsurvived patient is beyond 80 years old.

Which means that out of all survived patients whose age spreads from 30 years old to 76 years old around 50% are in the age group of 42 years to 60 years old.

Similarly, out of all unsurvived patients, whose age spreads from 34 years old to 80 plus years old, around 50% are in the age group of 45 years to 61 years old.

Seems like quite a big overlap.

### Boxplot: 64(Op_Year)

In [None]:
# Box plot of column '64' (year of operation), grouped by survival class ('1.1').
_, op_year_box_ax = plt.subplots()
sns.boxplot(data=haberman, x='1.1', y='64', ax=op_year_box_ax)
plt.show()

### Boxplot: 1(axil_nodes_det)

In [None]:
# Box plot of column '1' (positive axillary nodes detected), grouped by
# survival class ('1.1').
_, nodes_box_ax = plt.subplots()
sns.boxplot(data=haberman, x='1.1', y='1', ax=nodes_box_ax)
plt.show()

The box plot of number of axillary nodes detected for survived patients shows that the minimum value, 25th percentile and 50th percentile all lies around 0. And the 75th percentile is around 2.5. Which means that around 75% of the survived patients are those who had the count of axillary nodes less than 3. The problem with this box plot is that there are quite a lot of outliers beyond the maximum axillary count of 6. Which means many a times patients with quite high number of axillary nodes also survive.

For unsurvived patients there are few outliers beyond the maximum value of 24 axillary nodes. The minimum value is 0, with the 25th percentile at 1 and the median at 3 (which exceeds the 75th percentile value of axillary node count for survived patients). The 75th percentile value for unsurvived patients is 11. This means 50% of the total number of unsurvived patients have an axillary node count from 1 to 11.

This plot looks considerably better than the plots for the features age and op_year, except for the fact that there are quite a number of outliers for survived patients.

### Violin plot: 30(Age)

In [None]:
# Violin plot of column '30' (age), grouped by survival class ('1.1').
_, age_violin_ax = plt.subplots()
sns.violinplot(data=haberman, x='1.1', y='30', ax=age_violin_ax)
plt.show()

### Violin plot: 64(Op_Year)

In [None]:
# Violin plot of column '64' (year of operation), grouped by survival class ('1.1').
_, op_year_violin_ax = plt.subplots()
sns.violinplot(data=haberman, x='1.1', y='64', ax=op_year_violin_ax)
plt.show()

### Violin plot: 1(axil_nodes_det)

In [None]:
# Violin plot of column '1' (positive axillary nodes detected), grouped by
# survival class ('1.1').
_, nodes_violin_ax = plt.subplots()
sns.violinplot(data=haberman, x='1.1', y='1', ax=nodes_violin_ax)
plt.show()

The violin plot for axillary node count is wider for survived patients at lower values than it is for unsurvived patients, i.e. it says there are comparatively more people who survive than those who don't when the number of axillary nodes found is low.

## <u>Bi-variate analysis</u>

In [None]:
# Pairwise scatter plots of the three features, coloured by survival class ('1.1').
# `height` is the current name of the per-facet size argument (formerly `size`),
# matching the `height=` already used by the FacetGrid cells in this notebook.
sns.pairplot(haberman, hue='1.1', vars=['30', '64', '1'], height=3)
plt.show()

There is complete overlap in all three pairs. Bivariate analysis is not very helpful on this dataset.