In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# The Haberman CSV has no header row. If pandas infers one, the first patient
# record (30, 64, 1, 1) becomes the column labels and is dropped from the data.
# header=None keeps that record as data; the names passed here are exactly the
# labels pandas would have inferred, so every later cell can keep addressing the
# columns as '30' (age), '64' (operation year), '1' (positive nodes) and
# '1.1' (survival status).
df = pd.read_csv("../input/haberman.csv", header=None, names=['30', '64', '1', '1.1'])

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
df.shape  # (rows, columns): the author reports 306 data points and 3 features (the remaining column is the class label)
          # NOTE(review): the class counts quoted further down (224 + 81) add up to 305, not 306 - confirm the row count.

In [None]:
df.columns  # column labels; the cells below address the columns as '30', '64', '1' and '1.1'

In [None]:
# Attribute Information:

# column[0]: Age of patient at time of operation (numerical)
# column[1]: Patient's year of operation (year - 1900, numerical)
# column[2]: Number of positive axillary nodes detected (numerical)
# column[3]: Survival status (class attribute): 1 = the patient survived 5 years or longer, 2 = the patient died within 5 years

In [None]:
df['1.1'].value_counts()  # Imbalanced dataset: counts the rows in each survival class.
                # There are two classes, '1' and '2'.
                # Class '1': 224 data points, i.e. 224 patients survived 5 years or longer.
                # Class '2': 81 data points, i.e. 81 patients died within 5 years.
                # NOTE(review): 224 + 81 = 305, one fewer than the 306 rows quoted for df.shape - confirm.

In [None]:
# OBJECTIVE

# Our objective is to determine whether a patient belongs to the class "survived 5 years or longer" or to the class "died within 5 years".
# We are to predict column[3] from the other columns of the dataset,
# and column[2] is expected to play the most important role in reaching that conclusion.

In [None]:
#Univariate Analysis 

In [None]:
# 1-D scatter of patient age (column '30'), one colour per survival class.
survived = df.loc[df["1.1"] == 1]  # class 1: survived 5 years or longer
died = df.loc[df["1.1"] == 2]      # class 2: died within 5 years

# Every point is placed on y = 0 so that only the spread along the x-axis is compared.
plt.plot(survived['30'], np.zeros_like(survived['30']), 'ro', label='survived (1)')
# 'bo' draws one marker per patient; a bare 'b' would draw a connected line instead.
plt.plot(died['30'], np.zeros_like(died['30']), 'bo', label='died (2)')
plt.xlabel('Age of patient at time of operation')
plt.legend()
plt.show()

In [None]:
#if age<33:
#    patient survived
#if age>78:
#    patient died
#because the two classes overlap, age alone cannot decide the outcome for 33<=age<=78

In [None]:
# 1-D scatter of the year of operation (column '64'), one colour per survival class.
survived = df.loc[df["1.1"] == 1]  # class 1: survived 5 years or longer
died = df.loc[df["1.1"] == 2]      # class 2: died within 5 years

# The zero baseline is built from the same column that is plotted, so the x and y
# arrays always match; every point sits on y = 0.
plt.plot(survived['64'], np.zeros_like(survived['64']), 'ro', label='survived (1)')
# 'bo' draws one marker per patient; a bare 'b' would draw a connected line instead.
plt.plot(died['64'], np.zeros_like(died['64']), 'bo', label='died (2)')
plt.xlabel("Patient's year of operation (year - 1900)")
plt.legend()
plt.show()

In [None]:
#The patient's year of operation is not a good feature to select for univariate analysis

In [None]:
# 1-D scatter of the positive-node count (column '1'), one colour per survival class.
survived = df.loc[df["1.1"] == 1]  # class 1: survived 5 years or longer
died = df.loc[df["1.1"] == 2]      # class 2: died within 5 years

# The zero baseline is built from the same column that is plotted, so the x and y
# arrays always match; every point sits on y = 0.
plt.plot(survived['1'], np.zeros_like(survived['1']), 'ro', label='survived (1)')
# 'bo' draws one marker per patient; a bare 'b' would draw a connected line instead.
plt.plot(died['1'], np.zeros_like(died['1']), 'bo', label='died (2)')
plt.xlabel('Number of positive axillary nodes detected')
plt.legend()
plt.show()

In [None]:
#if Number of positive axillary nodes detected > 30:
#    then the patient died

In [None]:
#So column "1", i.e. 'Number of positive axillary nodes detected', is the best single feature for univariate analysis.

In [None]:
#PDF

In [None]:
# Per-class distribution (histogram + KDE) of patient age, column '30'.
# FacetGrid's `size` argument was renamed to `height`, and
# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated distplot.
sns.FacetGrid(df, hue='1.1', height=5).map(sns.histplot, '30', kde=True, stat='density').add_legend()
plt.show()  #There's a lot of overlapping in it

In [None]:
# Per-class distribution (histogram + KDE) of the year of operation, column '64'.
# FacetGrid's `size` argument was renamed to `height`, and
# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated distplot.
sns.FacetGrid(df, hue='1.1', height=5).map(sns.histplot, '64', kde=True, stat='density').add_legend()
plt.show()  #There's a lot of overlapping in it

In [None]:
# Per-class distribution (histogram + KDE) of the positive-node count, column '1'.
# FacetGrid's `size` argument was renamed to `height`, and
# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated distplot.
sns.FacetGrid(df, hue="1.1", height=5).map(sns.histplot, '1', kde=True, stat='density').add_legend()
plt.show()

In [None]:
# PDF,CDF

In [None]:
# Empirical PDF and CDF of the positive-node count (column '1') for each class.
# Relies on the `survived` / `died` subsets created in the univariate cells.
for subset, colour, name in ((survived, "r", "survived"), (died, "g", "died")):
    counts, bin_edges = np.histogram(subset['1'], bins=10)
    # Fraction of the class that falls into each bin; its running total is the CDF.
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    print(pdf)
    print(bin_edges)
    # Each value is plotted at the right-hand edge of its bin.
    plt.plot(bin_edges[1:], pdf, colour + "o-", label="PDF (" + name + ")")
    plt.plot(bin_edges[1:], cdf, colour + "*-", label="CDF (" + name + ")")

plt.xlabel('Number of positive axillary nodes detected')
plt.ylabel('Fraction of patients')
plt.legend()
plt.show()


In [None]:
# Switch seaborn to the "darkgrid" theme, then flush any figure that is still open.
sns.set_style(style="darkgrid")
plt.show()

In [None]:
#if Number of positive axillary nodes detected > 46:
#    Patient died

In [None]:
# boxplot

In [None]:
# Box plot of the positive-node count (column '1') for each survival class (column '1.1').
# boxplot returns the Axes it drew on, so the labels are set on that object directly.
box_ax = sns.boxplot(data=df, x='1.1', y='1')
box_ax.set_xlabel('Survival status ')
box_ax.set_ylabel('Number of positive auxillary nodes detected')
plt.show()

In [None]:
# 50% of the patients who died had fewer than 4 positive axillary nodes detected
# 75% of the patients who survived had fewer than 4 positive axillary nodes detected

In [None]:
#If we take the whole blue box as the decision region, the rule reads as follows

#if axillary nodes< 4:
#    Patient belongs to category 1
#else:
#    Patient belongs to category 2

#With the whole blue box as the decision region, the error will be around 40%

In [None]:
# violinplot

In [None]:
# Violin plot of the positive-node count (column '1') for each survival class (column '1.1').
# violinplot has no `size` parameter, so no such keyword is passed: depending on the
# seaborn version it is either ignored or forwarded to matplotlib and rejected.
sns.violinplot(x='1.1', y='1', data=df)
plt.xlabel("Survival Status")
plt.ylabel('Number of positive auxillary nodes detected')
plt.show()

In [None]:
# The bell-shaped bulge of the violin is larger for the patients who survived than for the patients who died

In [None]:
#Bivariate Analysis

In [None]:
# Scatter of patient age (column '30') against positive-node count (column '1'),
# coloured by survival class. FacetGrid's `size` argument was renamed to `height`.
sns.set_style("whitegrid")
(
    sns.FacetGrid(df, hue='1.1', height=4)
    .map(plt.scatter, '30', '1')
    .set_axis_labels('Age of patient', 'Number of positive axillary nodes')
    .add_legend()
)
plt.show()

In [None]:
#if age<40:
#    patient survived
#and we cannot separate the classes for age>40

In [None]:
# Scatter of positive-node count (column '1') against year of operation (column '64'),
# coloured by survival class. FacetGrid's `size` argument was renamed to `height`.
sns.set_style("whitegrid")
(
    sns.FacetGrid(df, hue='1.1', height=4)
    .map(plt.scatter, '1', '64')
    .set_axis_labels('Number of positive axillary nodes', 'Year of operation (year - 1900)')
    .add_legend()
)
plt.show()

In [None]:
# Scatter of patient age (column '30') against year of operation (column '64'),
# coloured by survival class. FacetGrid's `size` argument was renamed to `height`.
sns.set_style("whitegrid")
(
    sns.FacetGrid(df, hue='1.1', height=4)
    .map(plt.scatter, '30', '64')
    .set_axis_labels('Age of patient', 'Year of operation (year - 1900)')
    .add_legend()
)
plt.show()

In [None]:
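#if age<40 or age>70:
#    patient survived
# NOTE(review): the univariate age plot concluded that patients older than 78 died - reconcile the two rules.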

In [None]:
# Pairwise scatter matrix of all features, coloured by survival class ('1.1').
# Any figure left open by an earlier cell is closed first.
plt.close()
# pairplot's `size` argument was renamed to `height` (height of each facet, in inches).
sns.pairplot(df, hue="1.1", height=2)
plt.show()