   # Haberman's Dataset Exploratory Data Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings 
# NOTE: this silences every warning for the rest of the notebook,
# including deprecation notices raised by the plotting libraries.
warnings.filterwarnings("ignore")

# The CSV ships without a header row, so the column names are supplied here.
labels = ['age', 'year', 'nodes', 'status']
csv_path = "../input/haberman.csv"
# Read the file into a pandas DataFrame, naming the columns in file order.
haberman = pd.read_csv(csv_path, header=None, names=labels)

In [None]:
# How many rows and columns does the Haberman DataFrame have?
n_rows_cols = haberman.shape
print(n_rows_cols)

In [None]:
# What are the column names of the Haberman DataFrame?
# The frame keeps the names it was given when the CSV was loaded; no renaming
# is applied in this notebook, so no alternative label list is defined here.
print(haberman.columns)

In [None]:
# Number of data points in each class (status 1 and status 2)
status_counts = haberman['status'].value_counts()
status_counts

**Observation(s):**

The dataset is imbalanced: the number of data points for status 1 and status 2 is not equal.

In [None]:
# Print pandas' summary statistics for the DataFrame.
summary_stats = haberman.describe()
print(summary_stats)

**Observation(s):**

The standard deviation of 10.80 for age shows that age has a larger spread (more heterogeneity) than the other features.

# 2-D Scatter plot

In [None]:
# Plain (uncoloured) scatter plot of year against age
haberman.plot.scatter(x='age', y='year', title='Year vs Age')
plt.show()

**Observation(s):**

Relation between age and year is not clearly visible with this plot.

Max age is 83, min age is 39, max year is 69, and min year is 58.

The points are relatively denser in the middle region of the graph.



In [None]:
# Scatter plot colour-coded by status, to make some relevant observations
sns.set_style("whitegrid")
# `height` is the current name of the facet-size keyword; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by recent releases.
(
    sns.FacetGrid(haberman, hue="status", height=5)
    .map(plt.scatter, "age", "year")
    .add_legend()
)
plt.title("Age vs Year")
plt.show()


**Observation(s):**

The classification of status is not possible based on age and year relationship.


## Pair plots

In [None]:
# Pair-wise scatter plots of the three features, coloured by status
plt.close()
sns.set_style("whitegrid")

# `height` (per-facet size) replaces the `size` keyword, which seaborn
# renamed in 0.9 and no longer accepts in recent releases.
sns.pairplot(haberman, hue="status", vars=['age', 'year', 'nodes'], height=3)
plt.suptitle("pair plots of age, year and nodes")

plt.show()


**Observation(s):**

- Almost all the points have fewer than 30 nodes.
- There is no clear separation between status 1 and status 2.

# Histogram, PDF, CDF

In [None]:
# Split the data by class; these two frames are reused by later cells.
# (numpy is already imported as `np` in the notebook's first cell.)
haberman_survived = haberman[haberman["status"] == 1]
haberman_notsurvived = haberman[haberman["status"] == 2]

# 1-D view of the node counts for each class. The y value is only a
# placeholder row per class (1 for survived, 0 for not survived) so the two
# groups do not sit on top of each other; it is not the status code itself.
plt.plot(haberman_survived["nodes"], np.ones_like(haberman_survived["nodes"]), '^', label='survived')
plt.plot(haberman_notsurvived["nodes"], np.zeros_like(haberman_notsurvived["nodes"]), 'o', label='not survived')

plt.xlabel("nodes")
plt.ylabel("status")
plt.legend()
plt.title("Survival vs nodes graph")

plt.show()
    

**Observation(s):**

Most of the points for the two classes overlap, so no conclusion for classification can be drawn from this variable alone.

In [None]:
# Per-status distribution (histogram + density curve) of the node count.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and no
# longer accepts in recent releases.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; it is kept so
# the figure stays the same, but should eventually move to histplot/displot.
(
    sns.FacetGrid(haberman, hue="status", height=5)
    .map(sns.distplot, "nodes")
    .add_legend()
)
plt.ylabel("prob. dist.")
plt.title("Probability Distribution Function vs nodes")
plt.show()

 **Observation(s):**

This shows that the chances of survival are higher when the number of nodes lies between 0 and 5.

In [None]:
# Per-status distribution (histogram + density curve) of age.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and no
# longer accepts in recent releases.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; it is kept so
# the figure stays the same, but should eventually move to histplot/displot.
(
    sns.FacetGrid(haberman, hue="status", height=5)
    .map(sns.distplot, "age")
    .add_legend()
)
plt.ylabel("prob. dist.")
plt.title("PDF for age")
plt.show()

**Observation(s):**

There is no significant difference between the age distributions of survivors and non-survivors, so age alone cannot be used to classify survival.

# Mean, Variance and Std-dev


In [None]:
# Features are reported in this fixed order for every group.
feature_cols = ['age', 'year', 'nodes']

# Per-class means: one header line, three values, then a blank separator.
print("Means:")
for group_name, group_df in (("survived", haberman_survived),
                             ("not survived", haberman_notsurvived)):
    print(group_name)
    for col in feature_cols:
        print(np.mean(group_df[col]))
    print()

# Standard deviation of each feature over the whole dataset.
print("std:")
for col in feature_cols:
    print(np.std(haberman[col]))

**Observation(s):**

Large std for age shows the large spread and the heterogeneity in age

#  Median, Percentile, Quantile, IQR, MAD

In [None]:
# MAD comes from statsmodels; imported first so the cell's dependency is visible.
from statsmodels import robust

# Columns are reported in this fixed order in every section below.
stat_cols = ["age", "nodes", "year"]

print("\nMedians:")
for col in stat_cols:
    print(np.median(haberman[col]))


print("\nQuantiles:")
# np.arange excludes its stop value, so a stop of 101 is needed for the list
# to be 0, 25, 50, 75, 100 and therefore include the maximum as well as the
# minimum (a stop of 100 silently drops the 100th percentile).
quantile_points = np.arange(0, 101, 25)
for col in stat_cols:
    print(np.percentile(haberman[col], quantile_points))

print("\n90th Percentiles:")
for col in stat_cols:
    print(np.percentile(haberman[col], 90))

print("\nMedian Absolute Deviation")
for col in stat_cols:
    print(robust.mad(haberman[col]))

**Observation(s):**

The median absolute deviation for age shows that the heterogeneity in age is greater than in the other features.

The IQR of age (75th percentile - 25th percentile) shows that the middle 50 percent of ages lie in the range 44 - 52

# Box plot and whiskers

In [None]:
# Left panel: distribution of node counts for each status
nodes_ax = plt.subplot(1, 2, 1)
sns.boxplot(x='status', y='nodes', data=haberman, hue='status', ax=nodes_ax)
nodes_ax.set_title("boxplot for status and nodes")

# Right panel: distribution of age for each status
age_ax = plt.subplot(1, 2, 2)
sns.boxplot(x='status', y='age', data=haberman, hue='status', ax=age_ax)
age_ax.set_title("boxplot for status and age")

plt.show()

**Observation(s):**

50 percent of survivors have between 0 and 4 nodes

50 percent of non-survivors have between 2 and 11 nodes

50 percent of survivors are aged 44 to 60

50 percent of non-survivors are aged 46 to 62

# Violin Plots

In [None]:
# Violin plot of node counts for each status; seaborn returns the Axes it
# drew on, which is then used to set the title.
violin_ax = sns.violinplot(data=haberman, x='status', y='nodes', hue='status')
violin_ax.set_title("Violin plot for nodes vs status")
plt.show()

**Observation(s):**

50 percent of survivors have between 0 and 4 nodes

The wider spread for the non-survivors (status 2) shows that their range of nodes is larger than that of the survivors

# Observations / Summary

The data overlaps in most regions, hence the classes are not separable using these features

The chances of survival are higher for nodes ranging from 0 to 4

50 percent of survivors have between 0 and 4 nodes

50 percent of non-survivors have between 2 and 11 nodes

50 percent of survivors are aged 44 to 60

50 percent of non-survivors are aged 46 to 62
