In [None]:
import pandas as pd
# The raw CSV ships without a header row, so read it as-is and then attach
# descriptive column labels.
csv_path = "../input/habermans-survival-data-set/haberman.csv"
column_labels = ['age', 'year', 'nodes', 'status']
data = pd.read_csv(csv_path, header=None)
data.columns = column_labels

Show the first five rows to get a feel for the structure of the data

In [None]:
# Preview the first five records to check that the columns loaded as expected.
data.head(n=5)


Here there are 3 features and 1 class attribute

age   : Age of patient at time of operation

year  : Patient's year of operation 

nodes : Number of positive axillary nodes detected in body

status: 1 if the patient survived 5 years or longer, 2 if the patient did not survive 5 years


In [None]:
# Report the dimensions of the frame and the labels of its columns.
frame_shape = data.shape
frame_columns = data.columns
print("Shape of Dataframe is ", frame_shape)
print("Columns of dataframes are ", frame_columns)

In [None]:
# Tally the missing entries column by column; the resulting Series is the
# cell's displayed output.
print("Number of Null values in each columns")
null_counts = data.isna().sum()
null_counts

* Number of Points for each class

In [None]:

# Number of rows per value of the `status` class column.
class_sizes = data['status'].value_counts()
print(class_sizes)
print("Here dataset is imbalanced because around 2.8 times more people are survived !")

* The table below describes how the data is spread (count, mean, standard deviation, min, quartiles, max)

In [None]:
# Per-column summary statistics; shown as the cell's rich output.
summary_table = data.describe()
summary_table

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise scatter plots of the three features, coloured by survival class.
sns.set_style("whitegrid")
# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted by
# current releases. `vars` limits the grid to the actual features so the
# class label is used only for colouring, never as a plotted axis.
sns.pairplot(data, vars=["age", "year", "nodes"], hue="status", height=3)
plt.show()


Conclusion :
1. If the age is above 80, there is a high probability that the patient will not survive.
2. If the age is close to 30, the patient will most probably survive.
3. All patients are aged 30 or above.
4. More patients survived than did not survive.


* Histogram, PDF, CDF

In [None]:
# Split the rows by class label: status 1 -> survived, status 2 -> not survived.
is_status_one = data["status"] == 1
is_status_two = data["status"] == 2
survived = data.loc[is_status_one]
not_survived = data.loc[is_status_two]

(1) Histogram of age

In [None]:
# Overlaid histogram (with KDE curve) of age, one colour per survival class.
# `height` replaces the renamed/removed `size` argument, and `histplot`
# replaces the deprecated `distplot`; stat="density" keeps the y-axis on a
# density scale as `distplot` did.
grid = sns.FacetGrid(data, hue="status", height=5)
grid.map(sns.histplot, "age", kde=True, stat="density")
grid.add_legend()
plt.ylabel("Distribution")
plt.show()

Here it seems that age has an approximately Gaussian (normal) distribution for both classes

(2) Histogram of year

In [None]:
# Overlaid histogram (with KDE curve) of operation year, one colour per
# survival class.
# `height` replaces the renamed/removed `size` argument, and `histplot`
# replaces the deprecated `distplot`; stat="density" keeps the y-axis on a
# density scale as `distplot` did.
grid = sns.FacetGrid(data, hue="status", height=5)
grid.map(sns.histplot, "year", kde=True, stat="density")
grid.add_legend()
plt.ylabel("Distribution")
plt.show()

Here the distributions of year for the two classes overlap each other.

(3) Histogram of nodes

In [None]:
# Overlaid histogram (with KDE curve) of positive node count, one colour per
# survival class.
# `height` replaces the renamed/removed `size` argument, and `histplot`
# replaces the deprecated `distplot`; stat="density" keeps the y-axis on a
# density scale as `distplot` did.
grid = sns.FacetGrid(data, hue="status", height=5)
grid.map(sns.histplot, "nodes", kde=True, stat="density")
grid.add_legend()
plt.ylabel("Distribution")
plt.show()

Here nodes has a positively skewed distribution

 * PDF and CDF
 
 (1) PDF and CDF of Age

In [None]:

import numpy as np
# Bin the ages into 10 equal-width bins and rescale the bin heights so they
# sum to 1, i.e. each value is the share of patients falling in that bin.
counts, bin_edges = np.histogram(data['age'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin shares is the empirical CDF evaluated at
# each bin's right edge, which is why both curves are drawn at bin_edges[1:].
cdf = np.cumsum(pdf)

plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel("age")
plt.ylabel("Probability")
plt.legend()
plt.show()

(2) PDF and CDF of years

In [None]:

import numpy as np
# Bin the operation years into 10 equal-width bins and rescale the bin
# heights so they sum to 1, i.e. each value is the share of patients in
# that bin.
counts, bin_edges = np.histogram(data['year'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin shares is the empirical CDF evaluated at
# each bin's right edge, which is why both curves are drawn at bin_edges[1:].
cdf = np.cumsum(pdf)

plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel("year")
plt.ylabel("Probability")
plt.legend()
plt.show()

(3) PDF and CDF of nodes

In [None]:

import numpy as np
# Bin the node counts into 10 equal-width bins and rescale the bin heights
# so they sum to 1, i.e. each value is the share of patients in that bin.
counts, bin_edges = np.histogram(data['nodes'], bins=10, density=True)
pdf = counts / counts.sum()
# The running total of the per-bin shares is the empirical CDF evaluated at
# each bin's right edge, which is why both curves are drawn at bin_edges[1:].
cdf = np.cumsum(pdf)

plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel("nodes")
plt.ylabel("Probability")
plt.legend()
plt.show()

Here it seems that 85% of patients have nodes value below 10

Plot PDF and CDF for survived and not_survived.

(1) for year


In [None]:
# Overlay the binned PDF and CDF of operation year for each survival class.
# Both classes go through the same steps, so iterate instead of duplicating
# the code; the plotting order (survived first) is unchanged.
for class_name, class_frame in (("survived", survived),
                                ("not_survived", not_survived)):
    counts, bin_edges = np.histogram(class_frame['year'], bins=10, density=True)
    # Rescale so the bin heights sum to 1 (share of the class in each bin).
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    plt.plot(bin_edges[1:], pdf, label="PDF_" + class_name)
    plt.plot(bin_edges[1:], cdf, label="CDF_" + class_name)

plt.xlabel("year")
plt.ylabel("Probability")
plt.legend()
plt.show()

(2) for age

In [None]:
# Overlay the binned PDF and CDF of age for each survival class.
# Both classes go through the same steps, so iterate instead of duplicating
# the code; the plotting order (survived first) is unchanged.
for class_name, class_frame in (("survived", survived),
                                ("not_survived", not_survived)):
    counts, bin_edges = np.histogram(class_frame['age'], bins=10, density=True)
    # Rescale so the bin heights sum to 1 (share of the class in each bin).
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    plt.plot(bin_edges[1:], pdf, label="PDF_" + class_name)
    plt.plot(bin_edges[1:], cdf, label="CDF_" + class_name)

plt.xlabel("age")
plt.ylabel("Probability")
plt.legend()
plt.show()

(3) for nodes

In [None]:

# Overlay the binned PDF and CDF of positive node count for each survival
# class. Both classes go through the same steps, so iterate instead of
# duplicating the code; the plotting order (survived first) is unchanged.
for class_name, class_frame in (("survived", survived),
                                ("not_survived", not_survived)):
    counts, bin_edges = np.histogram(class_frame['nodes'], bins=10, density=True)
    # Rescale so the bin heights sum to 1 (share of the class in each bin).
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    plt.plot(bin_edges[1:], pdf, label="PDF_" + class_name)
    plt.plot(bin_edges[1:], cdf, label="CDF_" + class_name)

plt.xlabel("nodes")
# This cell previously had no y-axis label, unlike the year and age plots.
plt.ylabel("Probability")
plt.legend()
plt.show()

* Mean, variance and standard deviation

In [None]:
# Per-class arithmetic mean of every feature.
print("Means:")
for group_title, group_frame in (("Survived", survived), ("Not survived", not_survived)):
    print(group_title)
    for column in ("age", "year", "nodes"):
        print("Average of " + column, np.mean(group_frame[column]))


In [None]:
# Median of every feature over the whole dataset.
for column in ("age", "year", "nodes"):
    # The caption keeps its trailing space so the printed text is unchanged.
    print("Median of " + column + " ", np.median(data[column]))

In [None]:
# 0th, 25th, 50th and 75th percentiles of every feature over the whole dataset.
percentile_points = np.arange(0, 100, 25)
for column in ("age", "year", "nodes"):
    print("Quantiles of " + column, np.percentile(data[column], percentile_points))

In [None]:
# 90th percentile of every feature over the whole dataset.
for column in ("age", "year", "nodes"):
    print("90th percentile of " + column, np.percentile(data[column], 90))

In [None]:
from statsmodels import robust
# Median absolute deviation of every feature, via statsmodels' robust.mad.
# Captions are listed verbatim because their trailing whitespace differs.
mad_rows = (
    ("MAD of age ", "age"),
    ("MAD of year", "year"),
    ("MAD of nodes", "nodes"),
)
print ("Median Absolute Deviation")
for caption, column in mad_rows:
    print(caption, robust.mad(data[column]))

* Boxplot

In [None]:

# Box plot of operation year, one box per survival class.
sns.boxplot(data=data, x='status', y='year')
plt.show()

Here both classes have almost the same distribution

In [None]:
# Box plot of age, one box per survival class.
sns.boxplot(data=data, x='status', y='age')
plt.show()

Here both classes have almost the same distribution

In [None]:
# Box plot of positive node count, one box per survival class.
sns.boxplot(data=data, x='status', y='nodes')
plt.show()

The nodes feature has a larger spread for the not_survived class.

* Violin plot

In [None]:

# Violin plot of age, one violin per survival class.
# The former `size=8` argument was dropped: `violinplot` has no such
# parameter, so it was either ignored or rejected depending on the seaborn
# version.
sns.violinplot(x="status", y="age", data=data)
plt.show()

In [None]:
# Violin plot of operation year, one violin per survival class.
# The former `size=8` argument was dropped: `violinplot` has no such
# parameter, so it was either ignored or rejected depending on the seaborn
# version.
sns.violinplot(x="status", y="year", data=data)
plt.show()

In [None]:
# Violin plot of positive node count, one violin per survival class.
# The former `size=8` argument was dropped: `violinplot` has no such
# parameter, so it was either ignored or rejected depending on the seaborn
# version.
sns.violinplot(x="status", y="nodes", data=data)
plt.show()

Conclusion :
1. If the age is above 80, there is a high probability that the patient will not survive.
2. If the age is close to 30, the patient will most probably survive.
3. All patients are aged 30 or above.
4. More patients survived than did not survive.
5. Age has an approximately Gaussian (normal) distribution.
6. Nodes has a positively skewed distribution.
7. It seems that 85% of patients have a nodes value below 10.
