In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# library deprecation warnings - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load haberman.csv into a pandas DataFrame. Explicit column names are passed to
# read_csv, so the first line of the file is read as data rather than as a header.
column_names = ['Patient_age', 'Year_of_Operation', 'Auxilary_Node', 'Survival_status']
haberman = pd.read_csv("../input/habermans-survival-data-set/haberman.csv",
                       names=column_names)

In [None]:
# Dataset dimensions: (number of data points, number of columns).
n_rows, n_columns = haberman.shape
print((n_rows, n_columns))

In [None]:
# Column labels of the dataset.
column_index = haberman.columns
print(column_index)

In [None]:
# Preview the first five rows.
haberman.head(n=5)

# Attribute Information:
1. Age of patient at time of operation.
2. Patient's year of operation.
3. Number of positive axillary nodes detected.
4. Survival status (class attribute): 1 = the patient survived 5 years or longer, 2 = the patient died within 5 years.

In [None]:
# Print a concise summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
haberman.info()

# Observation

1. There are 306 rows and 4 columns.
2. All columns have the int64 datatype.

In [None]:
# Haberman is an imbalanced dataset: the number of data points is not the same
# for every class.
survival_counts = haberman["Survival_status"].value_counts()
survival_counts

# 2-D Scatter Plot

In [None]:
# Scatter plot of node count against patient age for all rows (classes are not
# distinguished in this plot).
ax = haberman.plot.scatter(x='Patient_age', y='Auxilary_Node')
ax.set_title('Scatter Plot of Auxilary_Node vs Patient_age')
plt.show()

In [None]:
# Scatter plot of node count against patient age, coloured by survival class.
sns.set_style("whitegrid")
# `height` is the FacetGrid argument that replaced `size` (renamed in seaborn 0.9);
# current seaborn releases no longer accept `size`.
grid = sns.FacetGrid(haberman, hue="Survival_status", height=5)
grid.map(plt.scatter, "Patient_age", "Auxilary_Node")
grid.add_legend()
plt.show()

# Pair-plot

In [None]:
# Pairwise scatter plots of the features, coloured by survival class.
plt.close()
sns.set_style("whitegrid")
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
sns.pairplot(haberman, hue="Survival_status", height=3)
plt.show()

# Histogram, PDF, CDF

In [None]:
# Split the data by class; these two frames are reused by later cells.
haberman_long_survived = haberman.loc[haberman['Survival_status'] == 1]
haberman_short_survived = haberman.loc[haberman['Survival_status'] == 2]

# 1-D scatter: every node count is drawn on the y = 0 line, one colour per class.
# Each series is labelled so the legend tells the two classes apart.
plt.plot(haberman_long_survived['Auxilary_Node'],
         np.zeros_like(haberman_long_survived['Auxilary_Node']), 'o',
         label='Survival_status = 1')
plt.plot(haberman_short_survived['Auxilary_Node'],
         np.zeros_like(haberman_short_survived['Auxilary_Node']), 'o',
         label='Survival_status = 2')

plt.xlabel('Auxilary_Node')
plt.title('Scatter Plot of Survival vs Auxilary_Node')
plt.legend()
plt.show()

# PDF of Patient_age

In [None]:
# Per-class distribution of Patient_age.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour of
# histplot/displot - switch once the target seaborn version is confirmed.
grid = sns.FacetGrid(haberman, hue='Survival_status', height=5)
grid.map(sns.distplot, 'Patient_age')
grid.add_legend()
plt.title('Patient_Survival_morethan_5Years vs Patient_age')
plt.show()

# PDF of year_of_Operation

Here we plot the PDF of the year of operation. It is very difficult to draw any conclusion from it because most of the data points of the two survival classes overlap.

In [None]:
# Per-class distribution of Year_of_Operation.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour of
# histplot/displot - switch once the target seaborn version is confirmed.
grid = sns.FacetGrid(haberman, hue='Survival_status', height=5)
grid.map(sns.distplot, 'Year_of_Operation')
grid.add_legend()
plt.title('Patient_Survival_morethan_5Years vs Year_of_Operation')
# Render the figure explicitly, as the other plotting cells do.
plt.show()

# PDF of Auxilary_Node

In [None]:
# Per-class distribution of Auxilary_Node.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour of
# histplot/displot - switch once the target seaborn version is confirmed.
grid = sns.FacetGrid(haberman, hue='Survival_status', height=5)
grid.map(sns.distplot, 'Auxilary_Node')
grid.add_legend()
plt.title('Patient_Survival_morethan_5Years vs Auxilary_Node')
# Render the figure explicitly, as the other plotting cells do.
plt.show()

# CDF of Long_Survived

In [None]:
# Histogram of Auxilary_Node for the Survival_status == 1 subset, over 10 bins.
counts, bin_edges = np.histogram(haberman_long_survived['Auxilary_Node'], bins=10,
                                 density=True)
# Rescale so the per-bin values sum to 1; their running total is then the CDF.
pdf = counts / counts.sum()
print(pdf)
print(bin_edges)

cdf = np.cumsum(pdf)
# Both curves are plotted against the right edge of each bin.
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
# The two curves are the PDF and the CDF of one class, not the two class ids.
plt.legend()
plt.xlabel('Auxilary_Node')
plt.title('PDF and CDF of Auxilary_Node (Survival_status = 1)')
plt.show()

# CDF of Short_survived

In [None]:
# Histogram of Auxilary_Node for the Survival_status == 2 subset, over 10 bins.
counts, bin_edges = np.histogram(haberman_short_survived['Auxilary_Node'], bins=10,
                                 density=True)
# Rescale so the per-bin values sum to 1; their running total is then the CDF.
pdf = counts / counts.sum()
print(pdf)
print(bin_edges)

cdf = np.cumsum(pdf)
# Both curves are plotted against the right edge of each bin.
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
# The two curves are the PDF and the CDF of one class, not the two class ids.
plt.legend()
plt.xlabel('Auxilary_Node')
plt.title('PDF and CDF of Auxilary_Node (Survival_status = 2)')
plt.show()

In [None]:
# Distinct class labels present in Survival_status.
survival_classes = haberman['Survival_status'].unique()
print(survival_classes)

In [None]:
# Non-null count of every column, grouped by class.
per_class_counts = haberman.groupby('Survival_status').count()
print(per_class_counts)

In [None]:
# Per-class summary statistics from DataFrame.describe() (count, mean, std, min,
# quartiles, max). status_yes / status_no are reused by the following cell.
is_status_one = haberman["Survival_status"] == 1
is_status_two = haberman["Survival_status"] == 2
status_yes = haberman[is_status_one]
status_no = haberman[is_status_two]

yes_summary = status_yes.describe()
no_summary = status_no.describe()

print("SURVIVAL STATUS : YES -> STATISTICS :")
print(yes_summary)
print("\n****************************************************************************\n")
print("SURVIVAL STATUS : NO -> STATISTICS :")
print(no_summary)


# Analysis of Dataset through Medians, quantiles, median absolute deviation :

In [None]:
from statsmodels import robust

# The two class subsets and the two features summarized in every section below.
groups = [("Survival Status : Yes", status_yes), ("Survival Status : No", status_no)]
features = ["Patient_age", "Auxilary_Node"]
separator = "\n**********************************************\n"

print("Medians: \n")
for feature in features:
    for label, group in groups:
        print(label, "-", feature, ":", np.median(group[feature]))

print(separator)

print("Quantiles: \n")
# 0th, 25th, 50th and 75th percentiles of every feature for every class.
# The previous version printed the yes/Patient_age and no/Auxilary_Node rows twice
# and never printed no/Patient_age or yes/Auxilary_Node.
quantile_points = np.arange(0, 100, 25)
for feature in features:
    for label, group in groups:
        print(label, "-", feature, ":", np.percentile(group[feature], quantile_points))

print(separator)

print("MEDIAN ABSOLUTE DEVIATION :\n")
for label, group in groups:
    for feature in features:
        print(label, "-", feature, ":", robust.mad(group[feature]))

# Box plot and Whiskers

In [None]:
# One box plot per feature, split by survival class: (column, figure title).
boxplot_specs = [
    ("Patient_age", "Box plot for survival_status and Age"),
    ("Year_of_Operation", "Box plot for survival_status and Year_of_Operation"),
    ("Auxilary_Node", "Box plot for survival_status and Auxillary Nodes"),
]
for column, title in boxplot_specs:
    sns.boxplot(x="Survival_status", y=column, data=haberman)
    plt.title(title)
    plt.show()

#  Violin plots

In [None]:
# One violin plot per feature, split by survival class: (column, figure title).
violin_specs = [
    ("Patient_age", "Violin plot for survival_status and Age"),
    ("Year_of_Operation", "Violin plot for survival_status and Year_of_Operation"),
    ("Auxilary_Node", "Violin plot for survival_status and Auxillary Node"),
]
for column, title in violin_specs:
    sns.violinplot(x="Survival_status", y=column, data=haberman)
    plt.title(title)
    plt.show()

In [None]:
# Contour plots: a 2-D KDE joint plot for every pair of features, as (x, y) columns.
feature_pairs = [
    ("Patient_age", "Year_of_Operation"),
    ("Patient_age", "Auxilary_Node"),
    ("Year_of_Operation", "Auxilary_Node"),
]
for x_column, y_column in feature_pairs:
    sns.jointplot(x=x_column, y=y_column, data=haberman, kind='kde')
    plt.show()

# TOTAL OBSERVATIONS :

1. This is a binary classification problem: we need to predict whether the patient will survive for 5 years or more after the operation, based on the patient's age, the year of operation and the number of positive axillary nodes.
2. Dataset is not balanced but complete as no single value is missing.
3. People with age range 40-60 have survived the most.
4. Operation year 60 had highest survival rate.
5. Operation year having range 63-66 had lowest survival rate.
6. Patients with 0 positive axillary nodes have the highest survival rate.
7. Patients between age of range 30-34 survived after the treatment.
8. Patients with age > 77 were not able to survive.
9. Patients with age < 40 and axillary nodes < 30 have higher chances of survival.
10. Patients with age > 50 and axillary nodes > 10 have lower chances of survival.
11. We can conclude from all the Pair Plots that they are not linearly Separable.
