# Exploratory data analysis (EDA) on Haberman Data set

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Quick first look at the raw file.
# The CSV appears to have no header row (the next cell supplies names= and
# treats every row as data), so header=None stops pandas from consuming the
# first record as column labels and under-counting the rows by one.
df=pd.read_csv('../input/haberman.csv', header=None)
print(df.shape)     # (number of rows, number of columns)
print(df.columns)   # the file has 4 columns (features)


In [None]:
# Reload the file with readable column labels, then count the patients in
# each survival class.
column_names = ['Age', 'Op_Year', 'axil_nodes_det', 'Surv_status']
df = pd.read_csv('../input/haberman.csv', names=column_names)
df.Surv_status.value_counts()

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()  # The data set has no missing values.

In [None]:
# Average age within each survival class
# (Surv_status 1 = survived more than 5 years, 2 = did not).
ages_survived = df.loc[df['Surv_status'] == 1, 'Age']
ages_not_survived = df.loc[df['Surv_status'] == 2, 'Age']
print('Mean of age of people who survived more than 5 yaers is '+str(np.mean(ages_survived)))
print("Mean of age of people who can't survived more than 5 yaers is "+str(np.mean(ages_not_survived)))

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Univariate analysis

In [None]:
# 1st objective: distribution of Age, one overlaid curve per axillary-node count.
# `height` is the current name of FacetGrid's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
sns.FacetGrid(df,hue='axil_nodes_det',height=5) \
    .map(sns.distplot,'Age').add_legend()
plt.show()

In [None]:
# 2nd objective: distribution of Age for each survival class.
# `height` is the current name of FacetGrid's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
sns.FacetGrid(df,hue='Surv_status',height=5) \
    .map(sns.distplot,'Age').add_legend()
plt.show()

Patients younger than 41 are more likely to survive, as most of them have survival status 1.

In [None]:
# 3rd objective: distribution of the positive axillary node count for each
# survival class. Author's observation: patients with fewer axillary nodes
# detected have the higher survival rate.
# `height` is the current name of FacetGrid's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
sns.FacetGrid(df,hue='Surv_status',height=5) \
    .map(sns.distplot,'axil_nodes_det').add_legend()
plt.show()

From the above graph we can see that at around 3 or 4 nodes (i.e. fewer than 5) the survival rate is higher. We can conclude that patients with fewer than 5 positive axillary nodes have a high survival rate.

### PDF and CDF

In [None]:
# Split the patients by outcome so the two groups can be compared:
# Surv_status 1 = survived, 2 = did not survive.
# df_survived / df_not_survived are reused by later cells.
df_survived = df[df['Surv_status']==1]
df_not_survived = df[df['Surv_status']==2]
cols=df.columns[:3]   # the three feature columns (everything before Surv_status)


def _pdf_cdf(values, bins=20):
    """Return (right bin edges, pdf, cdf) for a binned sample.

    pdf[k] is the fraction of observations that fall in bin k, so the pdf sums
    to 1 and the cdf is its running total.
    """
    counts, bin_edges = np.histogram(values, bins=bins)
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    return bin_edges[1:], pdf, cdf


# A single figure with one row per feature; each row overlays the pdf and cdf
# of both outcome groups so they can be compared directly.
fig, axes = plt.subplots(len(cols), 1, figsize=(10, 15))
for j, (i, ax) in enumerate(zip(cols, axes), start=1):
    edges, pdf, cdf = _pdf_cdf(df_survived[i])
    ax.plot(edges, pdf, color='orange', label='pdf of survived')
    ax.plot(edges, cdf, color='red', label='cdf of survived')

    edges, pdf, cdf = _pdf_cdf(df_not_survived[i])
    ax.plot(edges, pdf, color='black', label='pdf of not survived')
    ax.plot(edges, cdf, color='blue', label='cdf of not survived')

    ax.set_title(str(j) + ". " + i)
    ax.set_xlabel(i)
    ax.set_ylabel('probability')
    ax.grid()
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Column-wise extremes of each outcome group, each table preceded by a blank
# separator. Order: survived min, not-survived min, not-survived max, survived max.
extremes = (
    df_survived.min(),
    df_not_survived.min(),
    df_not_survived.max(),
    df_survived.max(),
)
for table in extremes:
    print("\n")
    print(table)

In [None]:
# Robust summary of every column for the patients who survived:
# median, quartile points, 90th percentile and median absolute deviation.
from statsmodels import robust

cols=df.columns
print('Survived \n')
for name in cols:
    values = df_survived[name]
    # percentiles at 0, 25, 50 and 75
    quartiles = np.percentile(values, np.arange(0, 100, 25))
    print(name)
    print("Medians:"+str(np.median(values)))
    print("Quantiles:"+str(quartiles))
    print("90th Percentiles:"+str(np.percentile(values, 90)))
    print ("Median Absolute Deviation:"+str(robust.mad(values)))
    print()


In [None]:

# Same robust summary, this time for the patients who did not survive.
# `robust` (statsmodels) is imported in the preceding cell.
cols=df.columns
print('Not Survived \n')
quartile_points = np.arange(0, 100, 25)   # 0, 25, 50, 75
for column in cols:
    sample = df_not_survived[column]
    print(column)
    print("Medians:"+str(np.median(sample)))
    print("Quantiles:"+str(np.percentile(sample, quartile_points)))
    print("90th Percentiles:"+str(np.percentile(sample, 90)))
    print ("Median Absolute Deviation:"+str(robust.mad(sample)))
    print()

## Box Plots

In [None]:
# Spread of the positive axillary node count within each survival class.
sns.boxplot(data=df, y='axil_nodes_det', x='Surv_status')
plt.show()

In [None]:

# Two panels side by side:
#   left  - Age per axillary-node count, split by survival class
#   right - Age per survival class
fig, (ax_by_nodes, ax_by_status) = plt.subplots(1, 2, figsize=(30, 5))
sns.boxplot(data=df, x='axil_nodes_det', y='Age', hue='Surv_status', ax=ax_by_nodes)
sns.boxplot(data=df, x='Surv_status', y='Age', ax=ax_by_status)
plt.show()

## Violin plots

In [None]:
# One violin plot per feature, split by survival class, laid out side by side.
# Each violin is drawn on its own Axes (ax=...), so nothing else ends up on the
# panels and the axis limits are set by the data alone.
features = list(df.columns[:3])   # every column before Surv_status
fig, axes = plt.subplots(1, len(features), figsize=(18, 5))
for ax, col in zip(axes, features):
    sns.violinplot(x='Surv_status', y=col, data=df, ax=ax)
    ax.set_title(col + ' by survival status')
fig.tight_layout()
plt.show()

## 2-D scatter plots

In [None]:
# Age against the positive axillary node count, all patients in one colour.
df.plot.scatter(x='Age', y='axil_nodes_det');
plt.show()

In [None]:
sns.set_style("whitegrid");
# Age vs. axillary node count, coloured by survival status so the two classes
# can be told apart.
# `height` is the current name of FacetGrid's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
sns.FacetGrid(df, hue="Surv_status", height=4) \
   .map(plt.scatter, "Age", "axil_nodes_det") \
   .add_legend();
plt.show();

In [None]:
sns.set_style("whitegrid");
# Age vs. year of operation, coloured by survival status.
# `height` is the current name of FacetGrid's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
sns.FacetGrid(df, hue="Surv_status", height=4) \
   .map(plt.scatter, "Age", "Op_Year") \
   .add_legend();
plt.show();

## Pair plots

In [None]:
# Pairwise scatter plots of all columns, coloured by survival status.
# `height` is the current name of pairplot's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
plt.close();
sns.set_style("whitegrid");
sns.pairplot(df, hue="Surv_status", height=3);
plt.show()

In [None]:
# Pairwise scatter plots of all columns, coloured by axillary node count.
# `height` is the current name of pairplot's old `size` argument, which has
# been deprecated since seaborn 0.9 and is no longer accepted by recent releases.
plt.close();
sns.set_style("whitegrid");
sns.pairplot(df, hue="axil_nodes_det", height=4);
plt.show()

## Multivariate plots

In [None]:
# Joint KDE of Age against every other column, one figure per column.
# Column names are compared with `==` (value equality); `is` tests object
# identity and is not a reliable way to compare strings.
for col in df.columns:
    if col == 'Age':
        continue
    sns.jointplot(x="Age", y=col, data=df, kind="kde");
    plt.show()

# Conclusions


Objective: 1) Can we link age to the number of positive axillary nodes detected?
           2) Is age in any way related to survival status?
           3) Can we classify survival status based on the number of positive axillary nodes?
           
           
1. The data set is imbalanced (the two survival classes are not equally represented). The data set has no missing values.

The mean age of people who survived more than 5 years is 52.01.
The mean age of people who did not survive more than 5 years is 53.68.

Univariate analysis summary :

2. We can observe that patients with fewer axillary nodes have a higher survival rate.
3. We can conclude that patients with fewer than 5 positive axillary nodes have a high survival rate.
    (The above 2 points address the 3rd point of the objective.)
    
4. Patients younger than 41 are more likely to survive: most of them have survival status 1 rather than survival status 2.
    (The above point addresses the 2nd point of the objective.)
    
5. There is no visible relation between the number of axillary nodes detected and age.

PDF / CDF histogram conclusions:

    a) From the 1st histogram we can conclude that patients aged 30-34 all survived and patients older than 77 did not survive.
    b) From the 2nd histogram we can't draw any conclusion, as most of the data overlaps.
    c) From the 3rd histogram we can conclude that patients with more than 46 axillary nodes did not survive.


Bivariate analysis summary:
6. We can't establish a relation between age and the number of axillary nodes detected, as most of the points overlap.
    (So we can't draw a conclusion for the 1st point of the objective.)
7. In both pair plots (coloured by survival status and by axillary nodes detected) the classes overlap heavily.

Multivariate analysis summary:
8. From the above graph we can conclude that people aged between 50 and 60 show the highest density of survival status 1.

For a basic understanding of the topic and of how to carry out the EDA, I referred to this notebook: https://www.kaggle.com/dristantanirola/eda-on-haberman-cancer-survival-dataset
