# Exploratory Data Analysis of the Haberman Dataset

### Importing packages for EDA

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Read csv file and assigning column labels

In [None]:
# Column labels applied to the CSV, in file order.
columns = [
    'PatientAge',
    'PatientYear',
    'AxillaryNodes',
    'SurvivalStatus',
]

# With explicit `names`, pandas treats the first line of the file as data
# (header=None); it is spelled out here so the intent is visible.
haberman = pd.read_csv('../input/haberman.csv', header=None, names=columns)

Description of the Haberman Dataset:
    1. Age of patient at time of operation (Numerical)
    2. Patient's year of operation (year - 1900, numerical)
    3. Number of positive axillary nodes detected (numerical)
    4. Survival status (class attribute)
            1 = the patient survived 5 years or longer
            2 = the patient died within 5 years

### Head and tail of the data

In [None]:
# First five rows, rendered as the cell's rich output.
haberman.head(n=5)

In [None]:
# Last five rows, rendered as the cell's rich output.
haberman.tail(n=5)

### Dimensions of the data set 

In [None]:
# Show the raw (rows, columns) tuple, then the same numbers in a sentence.
dataset_shape = haberman.shape
print(dataset_shape)
print("Haberman Dataset contains {} rows and {} columns".format(*dataset_shape))

In [None]:
# Column labels as assigned when the CSV was loaded.
column_labels = haberman.columns
print(column_labels)

### Value counts for the SurvivalStatus feature

In [None]:
# Distinct class codes present in the target column.
print("Unique values in the SurvivalStatus are {}".format(haberman['SurvivalStatus'].unique()))

# Number of patients per class (displayed as the cell output).
haberman['SurvivalStatus'].value_counts()

Observations:

1. The PatientYear column gives the last two digits of the year of operation for each patient.
2. There are 306 observations in the dataset.
3. The dataset is classified into two classes.
4. 225 patients belong to class 1 (survived 5 years or longer) and 81 patients belong to class 2 (died within 5 years).

### Datatype of each feature

In [None]:
separator = '*' * 50

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
haberman.info()
print(separator)
print(haberman.describe())
print(separator)

# Report the Python type of the first stored value of each column.
# Index by column (not by row position) so the reported type really belongs
# to the column named on the same line, and iterate over the actual columns
# instead of a hard-coded count.
for column in haberman.columns:
    print("Class of {} is {}".format(column, type(haberman[column].iloc[0])))

### Observations:

1. There are no missing values in this dataset, so no values need to be filled in or changed.
2. The categorical SurvivalStatus column is stored as an integer and has to be converted to a categorical datatype.
3. Values are mapped as follows: 
   1 : yes
   2 : no

In [None]:
# Readable labels for the raw class codes (1 = survived 5+ years, 2 = died within 5 years).
status_labels = {1: "yes", 2: "no"}

# Only convert while the column still holds the raw codes. Without this guard,
# re-running the cell would map 'yes'/'no' through the dict again and silently
# replace the whole target column with NaN.
if haberman['SurvivalStatus'].isin(list(status_labels)).all():
    haberman['SurvivalStatus'] = (
        haberman['SurvivalStatus'].map(status_labels).astype('category')
    )

print(haberman.head())

# Class proportions (displayed as the cell output).
haberman['SurvivalStatus'].value_counts(normalize = True)

### Observations

The age of the patients varies from 30 to 83, with a median of 52.

Although the maximum number of positive lymph nodes observed is 52, nearly 75% of the patients have fewer than 5 positive lymph nodes and nearly 25% of the patients have no positive lymph nodes.

The target column is imbalanced: 73% of the values are 'yes'.

## Univariate Analysis

In [None]:
# One distribution plot per feature, coloured by survival class.
# The last column is the target, so it is excluded from the features.
for column in list(haberman.columns)[:-1]:
    # `height` replaces the old `size` keyword, which current seaborn
    # releases reject with a TypeError.
    fg = sns.FacetGrid(haberman, hue='SurvivalStatus', height=5)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider migrating to sns.histplot(..., kde=True) once the minimum
    # seaborn version for this notebook is settled.
    fg.map(sns.distplot, column).add_legend()
    plt.show()

In [None]:
# Empirical PDF and CDF of each feature over 10 equal-width bins.
# The last column is the target, so it is excluded from the features.
for column in list(haberman.columns)[:-1]:
    print(column)
    counts, edges = np.histogram(haberman[column], bins=10, density=True)
    # Normalise so the per-bin values sum to 1, which makes the cumulative
    # sum below end at 1.
    pdf = counts/sum(counts)
    print("PDF: {}".format(pdf))
    cdf = np.cumsum(pdf)
    print("CDF: {}".format(cdf))
    # Both curves are drawn against the right-hand bin edges; label them so
    # the figure can be read on its own.
    plt.plot(edges[1:], pdf, label='PDF')
    plt.plot(edges[1:], cdf, label='CDF')
    plt.xlabel(column)
    plt.legend()
    plt.show()

## Mean, Median, Std. Deviation

In [None]:
# Split the patients by outcome; these two frames are reused by later cells.
survivalstatus_one = haberman.loc[haberman["SurvivalStatus"] == 'yes']
survivalstatus_two = haberman.loc[haberman["SurvivalStatus"] == 'no']

# (section header, statistic name, reducer) for every statistic reported.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
summary_statistics = [
    ("Means:", "Mean", np.mean),
    ("Medians:", "Median", np.median),
    ("Std Deviations: ", "Std. Deviation", np.std),
]

# Mean, median and standard deviation of every feature, per survival class.
# The last column is the target, so it is excluded from the features.
for header, statistic_name, statistic in summary_statistics:
    print(header)
    for column in list(haberman.columns)[:-1]:
        print("{} of {} for Survival Status == yes is {} ".format(
            statistic_name, column, statistic(survivalstatus_one[column])))
        print("{} of {} for Survival Status == no is {} ".format(
            statistic_name, column, statistic(survivalstatus_two[column])))
        print('*'*50)



### Box Plot

In [None]:
# One box plot per feature, side by side, split by survival class.
figure, axes = plt.subplots(1, 3, figsize=(20, 5))
for axis, column in zip(axes, list(haberman.columns)[:-1]):
    sns.boxplot(x='SurvivalStatus', y=column, data=haberman, ax=axis)
plt.show()

In [None]:
from statsmodels import robust

# 0th/25th/50th/75th percentiles, the 90th percentile and the median absolute
# deviation of every feature, reported separately for each survival group.
quartile_points = np.arange(0, 100, 25)
groups = (
    ("\nQuantiles for Status survival type 1", survivalstatus_one),
    ("\nQuantiles for Status survival type 2", survivalstatus_two),
)

for heading, group in groups:
    print(heading)
    for column in list(haberman.columns)[:-1]:
        values = group[column]
        print("Quantiles of {} are {}".format(column, np.percentile(values, quartile_points)))
        print("90th Quantile of {} is {}".format(column, np.percentile(values, 90)))
        print("MAD of {} is {}".format(column, robust.mad(values)))
        print('*'*50)



### Violin Plot

In [None]:
# One violin plot per feature, side by side, split by survival class.
figure, axes = plt.subplots(1, 3, figsize=(20, 5))
for axis, column in zip(axes, list(haberman.columns)[:-1]):
    sns.violinplot(x='SurvivalStatus', y=column, data=haberman, ax=axis)
plt.show()

### Observations

A violin plot is more informative than a plain box plot. While a box plot only shows summary statistics such as the median and the interquartile range, the violin plot shows the full distribution of the data. The difference is particularly useful when the data distribution is multimodal (more than one peak). In this case a violin plot clearly shows the presence of different peaks, their position and relative amplitude. 

1. In the box plot we can only observe the summary statistics, whereas in the violin plot it is very clear that Patient Age and Patient Year are bimodal in nature.
2. Axillary Nodes has more outliers and is right (positively) skewed, so its mean is strongly influenced by those points.

## Multivariate Analysis

### Scatter Plot

In [None]:
# Scatter plot of age against positive axillary nodes, coloured by survival class.
sns.set_style("whitegrid")

# `height` replaces the old `size` keyword, which current seaborn releases
# reject with a TypeError.
scatter_grid = sns.FacetGrid(haberman, hue="SurvivalStatus", height=4)
scatter_grid.map(plt.scatter, "PatientAge", "AxillaryNodes")
scatter_grid.add_legend()
plt.show()

### Pair Plot

In [None]:
# Pairwise scatter plots of all features, coloured by survival class.
# `height` replaces the old `size` keyword, which current seaborn releases
# reject with a TypeError.
sns.pairplot(haberman, hue='SurvivalStatus', height=4)
plt.show()

### Observation

With the data currently available, the scatter plot of Patient Year against Axillary Nodes separates the two classes more visibly than the other scatter plots.