# Exploratory data analysis on Haberman dataset

## Loading the dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the Haberman data. header=None tells pandas that the first line of the
# file is data rather than column names, so the names are supplied explicitly.
column_names = ['Age', 'Op_year', 'axil_nodes', 'surv_status']
haberman = pd.read_csv(
    '../input/haberman.csv',
    header=None,
    names=column_names,
)
# Preview the first few rows as the cell output.
haberman.head()

## Objective:

Perform EDA to find the best feature(s) for reasonable classification.

## High level statistics

In [None]:
# List the column labels of the loaded frame.
column_index = haberman.columns
print(column_index)

Observation: 'surv_status' is the class label.

In [None]:
# Number of data points (rows) and number of columns in the frame.
n_rows, n_cols = haberman.shape
print((n_rows, n_cols))

Observation: The dataset has 305 rows and 4 columns.

In [None]:
# Count the rows per class label, i.e. how many patients lived more than
# 5 years and how many did not.
class_counts = haberman["surv_status"].value_counts()
class_counts

Observation: It's an imbalanced dataset.

In [None]:
# 'surv_status' is read in as numbers, but it is the class label, so it is
# recoded to the categorical labels "yes" (code 1) and "no" (code 2).
#
# The dtype guard keeps this cell safe to re-run: pushing the already
# converted "yes"/"no" labels through the same mapping a second time would
# match no key and silently turn the whole column into NaN.
if pd.api.types.is_numeric_dtype(haberman['surv_status']):
    haberman['surv_status'] = (
        haberman['surv_status']
        .map({1: "yes", 2: "no"})
        .astype('category')
    )

In [None]:
# Summary statistics produced by DataFrame.describe().
summary_stats = haberman.describe()
print(summary_stats)

## Bivariate Analysis

In [None]:
# Plain scatter plot of operation year against age (no class colouring).
haberman.plot.scatter(x='Age', y='Op_year')
plt.show()

Observation: The plot does not give any valuable information

In [None]:
# Same Age vs Op_year scatter, but with each point coloured by its class label.
sns.set_style("whitegrid")
class_grid = sns.FacetGrid(haberman, hue="surv_status", height=4)
class_grid.map(plt.scatter, "Age", "Op_year")
class_grid.add_legend()
plt.show()

Observation: Yet again, we are unable to make a meaningful classification based on the plot.

In [None]:
# Pair plots: one panel per pair of variables, coloured by class label.
plt.close()  # close the current figure before drawing the grid
sns.set_style("whitegrid")
pair_grid = sns.pairplot(haberman, hue="surv_status", height=3)
plt.show()

Observations:
1. The plots do not give much information to make a reasonable classification
2. However, the plot on the basis of 'Op_year' and 'axil_nodes' is somewhat reasonable
3. In the second-to-last and third-to-last plots (for any 'Op_year' and 'Age', respectively), if the number of axil nodes is less than 9, the patient will live for more than 5 years.

## Univariate Analysis

In [None]:
# Split the frame by class label; both subsets are reused by later cells.
survived_mask = haberman["surv_status"] == "yes"
not_survived_mask = haberman["surv_status"] == "no"
haberman_yes = haberman.loc[survived_mask]
haberman_no = haberman.loc[not_survived_mask]

# 1-D scatter per feature: every value is drawn on the line y = 0, first for
# the "yes" subset and then for the "no" subset, one figure per feature.
for feature, marker in (("Age", 'o'), ("Op_year", '^'), ("axil_nodes", 'D')):
    for subset in (haberman_yes, haberman_no):
        values = subset[feature]
        plt.plot(values, np.zeros_like(values), marker)
    plt.show()



In [None]:
# Plot the per-class distribution (histogram + density estimate) of every
# feature, one figure per feature.
#
# FacetGrid's figure-size keyword is `height`; the older `size` spelling was
# renamed and is rejected by newer seaborn releases. The other FacetGrid and
# pairplot cells in this notebook already use `height`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figures stay the same — consider histplot/kdeplot later.
for feature in ("Age", "Op_year", "axil_nodes"):
    sns.FacetGrid(haberman, hue="surv_status", height=5) \
       .map(sns.distplot, feature) \
       .add_legend()
    plt.show()



Observations:
1. The first and the second plots are messy.
2. If the number of axil nodes lies between 0 and 2, the probability of the patient living more than 5 years is significantly higher.

In [None]:
# PDF and CDF of each feature, computed on the "yes" subset (haberman_yes) only.
for feature in ('Age', 'Op_year', 'axil_nodes'):
    # 10-bin histogram; the counts are re-normalised so they sum to 1.
    counts, bin_edges = np.histogram(haberman_yes[feature], bins=10,
                                     density=True)
    pdf = counts / sum(counts)
    print(pdf)
    print(bin_edges)

    # The CDF is the running total of the per-bin probabilities.
    cdf = np.cumsum(pdf)

    # Both curves are drawn against the right-hand edge of each bin.
    right_edges = bin_edges[1:]
    plt.plot(right_edges, pdf)
    plt.plot(right_edges, cdf)
    plt.show()

#### 25th, 50th and 75th percentile

In [None]:
# Box-and-whisker plot of each feature, grouped by class label.
for feature in ('Age', 'Op_year', 'axil_nodes'):
    sns.boxplot(x='surv_status', y=feature, data=haberman)
    plt.show()

In [None]:
# Violin plot of each feature, grouped by class label.
#
# The former `size=8` argument was dropped: `size` is not a violinplot
# parameter. Older seaborn releases ignore it, while newer ones forward
# unknown keywords to matplotlib, where it is not a valid artist property.
for feature in ("Age", "Op_year", "axil_nodes"):
    sns.violinplot(x="surv_status", y=feature, data=haberman)
    plt.show()

Observation:
1. The box plots and violin plots on 'Age' and 'Op_year' are unable to give a reasonable classification.
2. If the number of axil nodes is more than 9, the patient won't survive more than 5 years.

## Conclusion

EDA on the Haberman dataset shows the following things:
1. The features 'Age' and 'Op_year' are unable to serve the purpose of a reasonable classification.
2. The important feature is 'axil_nodes'. If the number of axil nodes is more than 9, the patient won't live more than 5 years.