In [None]:
#Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset. The file is read with header=None, so its first line is
# treated as data and the column names below are applied instead.
columns = ['age', 'year', 'nodes', 'status']
df = pd.read_csv(
    '../input/habermans-survival-data-set/haberman.csv',
    header=None,
    names=columns,
)
print(df.head())

In [None]:
# Summary of the dataset: per-column dtype and non-null count, plus memory usage.
df.info()

### Observation

* Since there are no missing values, no imputation is needed.
* The status column holds integer codes that should be converted into a categorical value, i.e. "yes" or "no".

In [None]:
# Replace the integer status codes with readable labels (1 -> 'yes', 2 -> 'no').
# Series.replace builds a new column rather than writing strings into an
# integer column in place, and it leaves already-converted labels untouched,
# so re-running this cell is harmless.
df['status'] = df['status'].replace({1: 'yes', 2: 'no'})
# info is a method and must be called; printing `df.info` without parentheses
# only shows the bound-method object instead of the summary.
df.info()

In [None]:
# Summary statistics for the numeric columns.
# Printed explicitly: a bare expression is only displayed by the notebook when
# it is the last statement of a cell, and this one is followed by more code.
print(df.describe())

# Number of records for each status value ('status' is the last column).
print(df['status'].value_counts())

### Observation

* Even though the maximum number of positive nodes is 52, 75% of patients have 5 or fewer positive nodes
* Since there are 225 "yes" and 81 "no" records, the data is imbalanced

# Analysis

In [None]:
# A probability density function (PDF) describes the relative likelihood of a
# continuous variable near a value x; the probability of falling inside a range
# is the area under the curve over that range.
# Distribution of age, one overlaid curve per status value.
# `height` is the FacetGrid argument for facet size (the old `size` name was
# renamed in seaborn 0.9 and is no longer accepted by current releases).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/kdeplot once the environment's seaborn version is confirmed.
sns.FacetGrid(df, hue="status", height=5) \
   .map(sns.distplot, "age") \
   .add_legend();
plt.show();

In [None]:
# Distribution of year, one overlaid curve per status value.
# `height` is the FacetGrid argument for facet size (the old `size` name was
# renamed in seaborn 0.9 and is no longer accepted by current releases).
sns.FacetGrid(df, hue="status", height=5) \
   .map(sns.distplot, "year") \
   .add_legend();
plt.show();

In [None]:
# Distribution of nodes, one overlaid curve per status value.
# `height` is the FacetGrid argument for facet size (the old `size` name was
# renamed in seaborn 0.9 and is no longer accepted by current releases).
sns.FacetGrid(df, hue="status", height=5) \
   .map(sns.distplot, "nodes") \
   .add_legend();
plt.show();

In [None]:
# The cumulative distribution function (CDF) is the probability that the variable takes a value less than or equal to x.
# Histogram of nodes over 10 equal-width bins. density=True returns densities;
# dividing by their total rescales them to per-bin probabilities that sum to 1.
counts, bin_edges = np.histogram(df['nodes'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin probabilities gives the CDF at each bin's right edge.
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.xlabel('nodes')
plt.ylabel('probability')
plt.legend()
plt.show();

In [None]:
# Histogram of age over 10 equal-width bins. density=True returns densities;
# dividing by their total rescales them to per-bin probabilities that sum to 1.
counts, bin_edges = np.histogram(df['age'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin probabilities gives the CDF at each bin's right edge.
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.xlabel('age')
plt.ylabel('probability')
plt.legend()
plt.show();

In [None]:
# Histogram of year over 10 equal-width bins. density=True returns densities;
# dividing by their total rescales them to per-bin probabilities that sum to 1.
counts, bin_edges = np.histogram(df['year'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin probabilities gives the CDF at each bin's right edge.
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.xlabel('year')
plt.ylabel('probability')
plt.legend()
plt.show();

In [None]:
# Box plot: spread of age within each status group
sns.boxplot(data=df, x="status", y="age")
plt.show()

In [None]:
# Box plot: spread of nodes within each status group
sns.boxplot(data=df, x="status", y="nodes")
plt.show()

In [None]:
# Box plot: spread of year within each status group
sns.boxplot(data=df, x="status", y="year")
plt.show()

In [None]:
# Violin plot of age for each status group.
# The former `size=5` argument is dropped: violinplot has no `size` parameter,
# so it was either ignored or passed on as an invalid artist keyword.
sns.violinplot(x="status", y="age", data=df)
plt.show()

In [None]:
# Violin plot of nodes for each status group.
# The former `size=5` argument is dropped: violinplot has no `size` parameter,
# so it was either ignored or passed on as an invalid artist keyword.
sns.violinplot(x="status", y="nodes", data=df)
plt.show()

In [None]:
# Violin plot of year for each status group.
# The former `size=5` argument is dropped: violinplot has no `size` parameter,
# so it was either ignored or passed on as an invalid artist keyword.
sns.violinplot(x="status", y="year", data=df)
plt.show()

In [None]:
# Pair plot: pairwise 2-D relationships between the columns, coloured by status.
# `height` sets the size of each facet (the old `size` name was renamed in
# seaborn 0.9 and is no longer accepted by current releases).
sns.pairplot(df, hue="status", height=4);
plt.show()

## Observation

* We can see that almost 80% of the patients have only 0-5 positive lymph nodes (PDF and CDF of Nodes).
* Patients treated after 1966 have a slightly higher chance of surviving compared to the rest (Box Plot of Years).
