# EDA On Haberman Dataset

## Initial setup

In [None]:
# Import packages: numerical/data libraries (numpy, pandas) and plotting
# libraries (matplotlib, seaborn).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid style and the 'Set1' colour palette for the plots.
sns.set(style='whitegrid', palette='Set1')
import warnings
# NOTE(review): this silences every warning, including library deprecation
# warnings -- consider narrowing the filter.
warnings.filterwarnings("ignore")
import os
# Show which files are present in the input directory before loading data.
print(os.listdir("../input"))

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load and prepare dataset

In [None]:
# Column names to apply to the data frame (the last one is the class label).
col_head = ['Age', 'Op_year','Axil_nodes','Survived_more_than_5years']
# header=None: the first row of the CSV is read as data, not as column names.
df_haberman = pd.read_csv('../input/haberman.csv', header= None)

In [None]:
# Replace the default integer column labels with the descriptive names in col_head.
df_haberman.columns = col_head

In [None]:
# Preview the first five rows to check the column names and values.
df_haberman.head(5)

In [None]:
# Summarise column dtypes and non-null counts.
df_haberman.info()

<b>Observations:</b><br /> 
    1. All columns have values (there are no nulls). <br />
    2. Convert the Survived_more_than_5years column to a categorical label for better readability: <br />
    1 -> Survived more than 5 years after the operation,<br /> 2 -> Survived less than 5 years after the operation

In [None]:
# Recode the class label from its numeric codes (1, 2) to readable strings.
# `replace` leaves values that are not keys of the mapping untouched, so running
# this cell a second time is a no-op. `.map` would instead turn the
# already-recoded 'Yes'/'No' values into NaN on a re-run.
df_haberman['Survived_more_than_5years'] = (
    df_haberman['Survived_more_than_5years'].replace({1: 'Yes', 2: 'No'})
)
# Confirm the column's new dtype and that no values were lost.
df_haberman.info()

## High Level Statistics of dataset 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_haberman.describe()

In [None]:
# 25th, 50th (median), 75th and 100th (maximum) percentiles of the node count.
np.percentile(df_haberman['Axil_nodes'],[25,50,75,100])

In [None]:
# 25th, 50th (median), 75th and 100th (maximum) percentiles of the operation year.
np.percentile(df_haberman['Op_year'],[25,50,75,100])

In [None]:
# Mean of each numeric feature per survival class.
# The feature columns are selected explicitly (every name in col_head except the
# class label) so the cell still works when it is re-run after a non-numeric
# column has been added to df_haberman.
df_haberman.groupby('Survived_more_than_5years')[col_head[:-1]].mean()

In [None]:
# Median of each numeric feature per survival class.
# The feature columns are selected explicitly (every name in col_head except the
# class label) so the cell still works when it is re-run after a non-numeric
# column has been added to df_haberman.
df_haberman.groupby('Survived_more_than_5years')[col_head[:-1]].median()

In [None]:
# Bar chart of how many patients fall into each survival class.
# `kind` is passed by keyword: recent pandas versions reject positional
# arguments to Series.plot().
class_counts = df_haberman['Survived_more_than_5years'].value_counts()
ax = class_counts.plot(kind='bar', title='Histogram for Class Variable')
ax.set_xlabel('Survived_more_than_5years')
ax.set_ylabel('Count')
# Render the figure (an argument-less plt.plot() draws nothing).
plt.show()

In [None]:
# One histogram per feature, side by side.
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(16, 4)
fig.suptitle('Histograms for Features')

# (column in df_haberman, x-axis label) for each panel, left to right.
panels = [
    ('Age', 'Age'),
    ('Op_year', 'Year of Operation'),
    ('Axil_nodes', 'No. of Nodes'),
]
for axis, (column, label) in zip(ax, panels):
    axis.hist(df_haberman[column])
    axis.set_xlabel(label)
    axis.set_ylabel('Count')

# Show the figure without echoing the repr of the last matplotlib call.
plt.show()

<b>Observations:</b><br> 
Total data points: 306<br> 

Independent variables: 3<br> 
-> Age (Mean: ~53 years and Median: ~53)<br> 
-> Year of operation (between 1958 and 1969)<br> 
-> Nodes (Mean: ~4 and Median: 1. 75% of values are at or below 4, though the max value is 52)<br> 

Dependent variable: 1 -> Survived more than 5 years [Data is imbalanced, with 225 'Yes' and 81 'No'] 


# Objective

<b>To predict whether or not a patient survives for more than 5 years, based on the patient's age, year of operation and number of nodes found</b>

## Univariate Analysis

### Distribution Plots

In [None]:
# Distribution plots help to visualise the spread of the data points.
# -> The histogram bars show how many values fall in each bin: the higher the
#    bar, the more values in that bin.
# -> The smooth curve is a kernel density estimate (KDE) of the PDF. Its height
#    at a given value is a density, not a probability; probabilities correspond
#    to areas under the curve.
# -> Because the total probability of all outcomes is 1, the area under each
#    KDE curve is 1.
#
# One figure per feature, with the two survival classes overlaid by colour.
# The feature list comes from col_head (all names except the class label), so
# it does not change if columns are later added to df_haberman.
for col in col_head[:-1]:
    # `height` is the current name of the parameter formerly called `size`.
    grid = sns.FacetGrid(df_haberman, hue='Survived_more_than_5years', height=5)
    grid.map(sns.distplot, col).add_legend()

<b>Observation:</b><br> 
1. No feature clearly separates patients who survived more than 5 years after the operation from those who did not.<br> 
2. Most patients who survived more than 5 years have fewer than 5 nodes, as can be seen in the 3rd plot.

### PDF and CDF

In [None]:
# --> The "PDF" line shows, for each histogram bin, the fraction of the
#     selected patients whose value falls in that bin.
# --> The "CDF" line shows the fraction of the selected patients whose value is
#     less than or equal to the right edge of each bin (i.e. the percentile).


def plot_pdf_cdf(survival_label, title, bins=10):
    """Plot the binned PDF and CDF of every feature for one survival class.

    Parameters
    ----------
    survival_label : str
        Value of 'Survived_more_than_5years' selecting the rows to plot
        ('Yes' or 'No').
    title : str
        Figure title.
    bins : int, default 10
        Number of equal-width histogram bins per feature.
    """
    features = col_head[:-1]  # every column name except the class label
    subset = df_haberman[df_haberman['Survived_more_than_5years'] == survival_label]

    fig, axes = plt.subplots(1, len(features), squeeze=False)
    fig.set_size_inches(16, 4)
    fig.suptitle(title, fontsize=12)

    for axis, col in zip(axes[0], features):
        counts, bin_edges = np.histogram(subset[col], bins=bins)
        # Normalise the counts to per-bin fractions that sum to 1; with
        # equal-width bins this equals normalising a density histogram.
        pdf = counts / counts.sum()
        cdf = np.cumsum(pdf)
        # Plot against the right edge of each bin.
        axis.plot(bin_edges[1:], pdf)
        axis.plot(bin_edges[1:], cdf)
        axis.set_xlabel(col)
        axis.legend(['PDF', 'CDF'])


plot_pdf_cdf('No', 'PDF & CDF charts - Not Survived more than five years')
plot_pdf_cdf('Yes', 'PDF & CDF charts - Survived more than five years')
plt.show()

<b>Observation:</b><br> 
1. Around 82% of patients who survived more than 5 years have fewer than 5 nodes

### Box Plot

In [None]:
# A box plot summarises a distribution with:
#   Q1  = 25th percentile (bottom of the box)
#   Q2  = 50th percentile, i.e. the median (line inside the box)
#   Q3  = 75th percentile (top of the box)
#   IQR = Q3 - Q1
#   lower whisker = smallest value that is >= Q1 - 1.5*IQR
#   upper whisker = largest value that is <= Q3 + 1.5*IQR
# Points beyond the whiskers are drawn individually as outliers.
fig, ax = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Box Plots for Features')
# One panel per feature; x/y are passed by keyword because recent seaborn
# versions no longer accept them positionally.
for axis, feature in zip(ax, col_head[:-1]):
    sns.boxplot(x='Survived_more_than_5years', y=feature, data=df_haberman, ax=axis)

### Violin Plot

In [None]:
# A violin plot combines a box plot with a density (PDF) estimate drawn on
# either side of it.
fig, ax = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('Vilon Plots for Features')
# One panel per feature; x/y are passed by keyword because recent seaborn
# versions no longer accept them positionally.
for axis, feature in zip(ax, col_head[:-1]):
    sns.violinplot(x='Survived_more_than_5years', y=feature, data=df_haberman, ax=axis)

<b> Observations-</b><br> 
1. From the above plots, we can see that<br>  
After 1965, more patients survived for more than 5 years<br> 
Before 1959, fewer patients survived for more than 5 years



## Bivariate Analysis

### Pair Plot

In [None]:
# A pair plot draws a scatter plot for every pair of the selected columns,
# with each column's own distribution on the diagonal.
# `height` is the current name of the parameter formerly called `size`.
pair_grid = sns.pairplot(
    df_haberman,
    hue='Survived_more_than_5years',
    vars=['Age', 'Op_year', 'Axil_nodes'],
    height=3,
)
pair_grid.fig.suptitle('Pairplot of Features');

<b> Observation:</b> <br> 
No clear separation is found between survival of more or less than 5 years based on any two features in the pair plots 

### Joint Plot

In [None]:
# KDE joint plot for every unordered pair of features, restricted to patients
# who survived more than 5 years.
# sns.jointplot creates its own figure, so no plt.figure() call is needed
# (one would only leave an empty figure in the output).
features = col_head[:-1]  # every column name except the class label
survivors = df_haberman[df_haberman['Survived_more_than_5years'] == 'Yes']

for position, col1 in enumerate(features):
    # Pair col1 only with the features after it so each pair is drawn once.
    for col2 in features[position + 1:]:
        # Keyword arguments: recent seaborn versions reject positional x/y/data.
        sns.jointplot(x=col1, y=col2, data=survivors, kind='kde')
                         
                            

<b>Observation:</b><br>
-> The density of patients who survived is highest at approximately 0 to 5 nodes, or when the year of operation is approximately between 1959<br>
and 1962 and the age of the patient is approximately between 49 and 54 years
    

### Plotly 3-D Chart

In [None]:
def colx(x):
    """Return the marker colour for a survival label.

    'Yes' maps to 'green' and 'No' maps to 'orchid'; any other value
    yields None.
    """
    return 'green' if x == 'Yes' else ('orchid' if x == 'No' else None)

In [None]:
# Derive a per-row marker colour from the survival label.
# Note: this adds a 'color' column to df_haberman in place.
df_haberman['color'] = df_haberman['Survived_more_than_5years'].map(colx)

In [None]:
import plotly.offline as offline
import plotly.graph_objs as go

# Enable plotly rendering inside the notebook.
offline.init_notebook_mode()

# 3-D scatter of the three features; each marker takes its colour from the
# 'color' column of df_haberman. No colorscale is set because the colours are
# explicit names, not numeric values.
trace1 = go.Scatter3d(
    x=df_haberman['Age'],
    y=df_haberman['Op_year'],
    z=df_haberman['Axil_nodes'],
    mode='markers',
    marker=dict(
        symbol='circle',
        color=df_haberman['color'],
        opacity=0.8,
    ))
data = [trace1]
# Label the axes with the plotted column names; plotly's defaults are x/y/z.
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0),
    scene=dict(
        xaxis=dict(title='Age'),
        yaxis=dict(title='Op_year'),
        zaxis=dict(title='Axil_nodes'),
    ),
)
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')

<b>Observation:</b><br>
    No clear separation between patients who did and did not survive beyond 5 years is seen 

# <b>Final Observations</b>

1. The chances of survival are lower for a patient with more than 5 nodes.<br>
2. The chances of a patient surviving 5+ years are higher if the operation took place in 1965 or later.<br>
3. The chances of a patient surviving 5+ years are higher if the patient's age at the time of operation is between 49 and 54.<br>