In [None]:
#The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of 
#Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw Haberman survival data.
# header=None keeps the first line of the file as a data row; with the default
# header inference that first patient record would be consumed as column labels
# and then lost when the column names are assigned in a later cell.
# NOTE(review): assumes haberman.csv ships without a header line - confirm against the file.
df = pd.read_csv('/kaggle/input/habermans-survival-data-set/haberman.csv', header=None)

In [None]:
# Preview the first rows of the frame as loaded (column names are assigned in a later cell).
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Give the four columns readable names, in file order.
column_names = ['Age', 'Year', 'Axil_nodes', 'Surv_status']
df.columns = column_names

In [None]:
# Preview the first rows again to check the column labels.
df.head()

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

<h2>Univariate Analysis</h2>

In [None]:
# Histogram with a KDE overlay (an estimate of the PDF) of Age,
# one overlaid layer per survival class.
# FacetGrid's `size` argument was renamed to `height`, and the deprecated
# distplot is replaced by histplot(kde=True, stat="density"), which draws the
# same density-normalised histogram plus KDE curve.
age_grid = sns.FacetGrid(df, hue='Surv_status', height=7)
age_grid.map(sns.histplot, "Age", kde=True, stat="density")
age_grid.add_legend()
plt.show()

In [None]:
# Histogram with a KDE overlay of Year, one overlaid layer per survival class.
# `height` replaces FacetGrid's renamed `size` argument, and histplot replaces
# the deprecated distplot.
year_grid = sns.FacetGrid(df, hue="Surv_status", height=6)
year_grid.map(sns.histplot, "Year", kde=True, stat="density")
year_grid.add_legend()
# Render the figure explicitly so the cell does not echo the FacetGrid repr.
plt.show()

In [None]:
# Histogram with a KDE overlay of Axil_nodes, one overlaid layer per survival class.
# `height` replaces FacetGrid's renamed `size` argument, and histplot replaces
# the deprecated distplot.
nodes_grid = sns.FacetGrid(df, hue="Surv_status", height=7)
nodes_grid.map(sns.histplot, "Axil_nodes", kde=True, stat="density")
nodes_grid.add_legend()
plt.show()

In [None]:
# Empirical PDF and CDF of Age over 10 equal-width bins.
counts, bin_edges = np.histogram(df['Age'], bins=10, density=True)

# Rescale the per-bin values so they sum to 1; the running total is then the CDF.
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(bin_edges)

# Both curves are plotted against the right-hand edge of each bin.
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel('Age')
plt.ylabel('Probability')
plt.legend()
plt.show()

In [None]:
# Empirical PDF and CDF of Year over 10 equal-width bins.
counts, bin_edges = np.histogram(df['Year'], bins=10, density=True)

# Rescale the per-bin values so they sum to 1; the running total is then the CDF.
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
print(pdf)
print(bin_edges)

# Both curves are plotted against the right-hand edge of each bin.
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')
plt.xlabel('Year')
plt.ylabel('Probability')
plt.legend()
plt.show()

In [None]:
# Print mean, median and standard deviation of the three feature columns.
# Within each section the values appear in the order Age, Year, Axil_nodes.
feature_cols = ("Age", "Year", "Axil_nodes")
summaries = (
    ("Means:", np.mean),
    ("\nMedian:", np.median),
    ("\nStandard Deviation:", np.std),
)

for heading, statistic in summaries:
    print(heading)
    for col in feature_cols:
        print(statistic(df[col]))

<h2>Multivariate Analysis</h2>

In [None]:
# Scatter plot of axillary node count against age.
df.plot.scatter(x='Age', y='Axil_nodes');

In [None]:
# Pairwise plots of the features, coloured by survival status.
# Diagonals are pdf for each of the features.
# Survival status (class attribute):
#   1 = the patient survived 5 years or longer
#   2 = the patient died within 5 years
# (The legend above is written as comments: a bare string as the cell's last
# expression would be echoed as cell output.)
sns.pairplot(df, hue="Surv_status")
plt.show()

In [None]:
# Pairwise correlation matrix of the columns, using pandas' default method.
df.corr()

<b>Boxplot</b>

In [None]:
# Box plot of axillary node count for each survival class.
_, ax = plt.subplots()
sns.boxplot(data=df, x='Surv_status', y='Axil_nodes', ax=ax)
plt.show()

In [None]:
# Box plot of Year for each survival class.
_, ax = plt.subplots()
sns.boxplot(data=df, x='Surv_status', y='Year', ax=ax)
plt.show()

In [None]:
# Box plot of age for each distinct axillary node count.
_, ax = plt.subplots()
sns.boxplot(data=df, x='Axil_nodes', y='Age', ax=ax)
plt.show()

<b>Violin plots</b>

In [None]:
# Violin plot of axillary node count for each survival class.
_, ax = plt.subplots()
sns.violinplot(data=df, x='Surv_status', y="Axil_nodes", ax=ax)
plt.show()

In [None]:
# Violin plot of age for each survival class.
_, ax = plt.subplots()
sns.violinplot(data=df, x='Surv_status', y="Age", ax=ax)
plt.show()

In [None]:
# Violin plot of Year for each survival class.
_, ax = plt.subplots()
sns.violinplot(data=df, x='Surv_status', y="Year", ax=ax)
plt.show()

<b>Contour plot</b>

In [None]:
# Joint KDE (contour) plot of Age against survival status; the trailing ';' hides the returned object's repr.
# NOTE(review): Surv_status is a two-valued class label (1/2), so the KDE along the
# x axis smooths between two discrete classes - confirm this is the intended view.
sns.jointplot(x='Surv_status',y='Age',data=df,kind='kde');

Observation:  
- Patients who did not survive more than 5 years were mostly between 40 and 65 years old.
- Patients who did not survive more than 5 years had more axillary nodes than patients who survived for 5 years or longer.
    
    
Conclusion:   
- The dataset is imbalanced: most of the records are of patients who survived more than 5 years.
- Patients with more than 4 axillary nodes are more likely to die within 5 years.



Thank you!