In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the Haberman survival data set from the notebook's input directory.
haberman = pd.read_csv('../input/haberman.csv/haberman.csv')

# High level statistics

In [None]:
# Dimensions of the loaded frame as (rows, columns).
haberman.shape

In [None]:
# Column labels of the loaded frame.
haberman.columns

In [None]:
# Distinct raw codes stored in the class column.
haberman['status'].unique()

In [None]:
# Recode the numeric class labels into readable ones.
# `replace` (rather than `map`) leaves values that are not keys of the mapping
# untouched, so re-running this cell on already-recoded data is a no-op instead
# of turning every 'yes'/'no' into NaN.
status_labels = {1: 'yes', 2: 'no'}
haberman['status'] = haberman['status'].replace(status_labels)

# Class balance after recoding.
haberman['status'].value_counts()

In [None]:
# Per-column dtype and non-null count; used to check for missing values.
haberman.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
haberman.describe()

## Observations
1. A total of 306 data points are available, and there are no missing values.
2. **Features:** age, year and nodes.
3. **Class:** status, 2 classes (*yes means the patient survived 5 years or more, while no means the patient survived less than 5 years.*)
4. 81 patients survived less than 5 years and 225 patients survived 5 years or more, so it is an imbalanced data set.
5. 50% of the patients operated on are between 44 and 60 years of age.
6. 75% of the people have 4 or less lymph nodes.
7. **The real objective is to find whether a person survives more than 5 years or not**.

# Univariate Analysis

In [None]:
# One figure per feature: overlaid histograms (with KDE curves) of that feature,
# coloured by survival status.
for feature in ('age', 'nodes', 'year'):
    grid = sns.FacetGrid(haberman, hue="status", height=5)
    grid.map(sns.histplot, feature, kde=True)
    pretty_name = feature.capitalize()
    plt.xlabel(pretty_name)
    plt.title('Distribution plot for ' + pretty_name)
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Per-bin probability ("PDF") and its running total ("CDF") of the node count,
# drawn separately for each survival status.
for outcome in ('yes', 'no'):
    node_counts = haberman.nodes[haberman.status == outcome]
    density, edges = np.histogram(node_counts, density=True)
    bin_prob = density / sum(density)   # rescale so the bins sum to 1
    right_edges = edges[1:]             # each bin is plotted at its right edge
    plt.plot(right_edges, bin_prob, label="PDF for status=" + outcome)
    plt.plot(right_edges, np.cumsum(bin_prob), label="CDF for status=" + outcome)

plt.title('PDF and CDF of nodes')
plt.xlabel('Nodes')
plt.ylabel('Probability')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Node-count distribution per survival status as a box plot.
ax = sns.boxplot(data=haberman, x='status', y='nodes')
ax.set(title='Box plot of nodes', xlabel='Status', ylabel='Nodes')
ax.grid()
plt.show()

In [None]:
# Summary statistics of the node count for patients labelled 'yes'.
haberman.loc[haberman['status'] == 'yes', 'nodes'].describe()

In [None]:
# Summary statistics of the node count for patients labelled 'no'.
haberman.loc[haberman['status'] == 'no', 'nodes'].describe()

In [None]:
# Node-count distribution per survival status as a violin plot.
ax = sns.violinplot(data=haberman, x='status', y='nodes')
ax.set(title='Violin plot of nodes', xlabel='Status', ylabel='Nodes')
ax.grid()
plt.show()

In [None]:
# Individual node counts per survival status, drawn as a strip plot.
sns.stripplot(x='status', y='nodes', data=haberman)
# Title now names the plot actually drawn; it previously said "Swarm plot",
# copied from the swarm-plot cell.
plt.title('Strip plot of nodes')
plt.xlabel('Status')
plt.ylabel('Nodes')
plt.grid()
plt.show()

In [None]:
# Individual node counts per survival status as a swarm plot; the small marker
# size is passed through unchanged.
ax = sns.swarmplot(data=haberman, x='status', y='nodes', size=1)
ax.set(title='Swarm plot of nodes', xlabel='Status', ylabel='Nodes')
ax.grid()
plt.show()

## Observations


1.   Patients with 1 node or fewer are more likely to survive.
2.   Survival chances are lower if the patient is older than 40.
3.   82% of the people who survived have 4 or fewer nodes.
4.   75% of the people who survived have 3 or fewer nodes.
5.   50% of the people who did not survive have 4 or more nodes.



# Bi-variate Analysis

In [None]:
# Pairwise relationships between all numeric columns, coloured by status.
pair_grid = sns.pairplot(haberman, hue='status')
# y > 1 places the figure title above the top row of panels.
pair_grid.fig.suptitle("Pair plot for haberman data", y=1.05)
plt.show()

In [None]:
# Age vs. node count, coloured by survival status.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# newer releases reject them with a TypeError.
sns.scatterplot(data=haberman, x='age', y='nodes', hue='status')
plt.title('Scatter plot of age vs nodes')
plt.xlabel('Age')
plt.ylabel('Nodes')
# No plt.legend() here: seaborn already draws the hue legend, and rebuilding it
# through pyplot would replace that legend and drop its title.
plt.grid()
plt.show()

## Observations


1.   Only the **age** vs **nodes** plot shows a little separation between the classes.
2.   The lower the number of nodes, the greater the chance of survival.



# Conclusion


1.   Patients younger than 40 years are more likely to survive.
2.   Patients with fewer nodes are more likely to survive.
3.   However, both the survived and the not-survived groups are concentrated at low node counts, because 75% of all patients have 4 or fewer nodes.
4.   The data set is imbalanced, and none of the features, alone or combined, clearly separates the two classes, so we cannot build a simple rule-based classifier for this problem.

