# EDA on Haberman Cancer Survival dataset

## Objective - From the given dataset, determine whether the status of a patient can be classified from the given observations

## Steps 

### 1. importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

### 2. Reading the file

In [None]:
# Location of the Haberman CSV, relative to the notebook's working directory.
DATA_PATH = "../input/haberman.csv/haberman.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows as a quick check on the parsed columns.
df.head()

### 3. Analyzing the dataset

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Per-column dtype and non-null count summary (used to check for missing values).
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Number of rows per status value, to gauge the class balance.
df["status"].value_counts()

##### Observation - 
1. It has 306 rows and 4 columns.
2. There are 0 null values in each column and every column is of type integer.
3. There are 3 independent variables and one dependent variable which is Status.
4. There are 2 categories in status variable. 1&2.
5. The ratio of the Status variable is 225:81, which tells us that the data is imbalanced. We would need to balance the dataset for model creation.
6. Status 1 has more records than status 2. According to the dataset description, status 1 marks patients who survived 5 years or longer after surgery and status 2 marks patients who did not survive 5 years, so the survival rate is 225/306 ≈ 73.53%.

### 4. Uni-variate Analysis

#### Age

##### PDF - CDF

In [None]:
# Age distribution over all patients, estimated from a 10-bin histogram.
counts, bins = np.histogram(df["age"], bins=10)
pdf = counts / counts.sum()  # fraction of patients falling in each bin
cdf = pdf.cumsum()           # running total of those fractions

# Both curves are drawn against the right-hand edge of each bin.
right_edges = bins[1:]
fig, ax = plt.subplots()
ax.plot(right_edges, pdf, label="PDF")
ax.plot(right_edges, cdf, label="CDF")
ax.set_title("Age - PDF & CDF")
ax.set_xlabel("Age")
ax.set_ylabel("Probability")
ax.legend()
plt.show()

In [None]:
# Age histograms computed separately for each status value.
# The names bins1/pdf1/cdf1 and bins2/pdf2/cdf2 are read again by the
# side-by-side plotting cell that follows, so they are kept as-is.

# Status 1
age_status1 = df.loc[df["status"] == 1, "age"]
counts1, bins1 = np.histogram(age_status1, bins=10)
pdf1 = counts1 / counts1.sum()
cdf1 = pdf1.cumsum()

# Status 2
age_status2 = df.loc[df["status"] == 2, "age"]
counts2, bins2 = np.histogram(age_status2, bins=10)
pdf2 = counts2 / counts2.sum()
cdf2 = pdf2.cumsum()

# Overlay all four curves on one set of axes; each is drawn at its own
# right-hand bin edges.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(bins1[1:], pdf1, label="PDF- Status- 1")
ax.plot(bins1[1:], cdf1, label="CDF- Status- 1")
ax.plot(bins2[1:], pdf2, label="PDF- Status- 2")
ax.plot(bins2[1:], cdf2, label="CDF- Status- 2")
ax.set_title("Age - PDF & CDF - Separated view")
ax.set_xlabel("Age")
ax.set_ylabel("Probability")
ax.legend()
plt.show()

In [None]:
# Side-by-side view of the per-status age curves, one panel per status.
# Reads bins1/pdf1/cdf1 and bins2/pdf2/cdf2 computed in the previous cell.
panels = [
    (1, bins1, pdf1, cdf1),
    (2, bins2, pdf2, cdf2),
]

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, (status, edges, pdf_s, cdf_s) in zip(axes, panels):
    # Curves are drawn against the right-hand edge of each bin.
    ax.plot(edges[1:], pdf_s, label=f"PDF- Status- {status}")
    ax.plot(edges[1:], cdf_s, label=f"CDF- Status- {status}")
    ax.set_xlabel("Age")
    ax.set_ylabel("Probability")
    # A descriptive title instead of a bare "1"/"2" so each panel stands alone.
    ax.set_title(f"Status {status}")
    ax.legend()

fig.suptitle("Age - PDF & CDF - Compared view")
plt.show()

##### Boxplot

In [None]:
# Age spread for each status value, drawn as a box plot.
ax = sns.boxplot(data=df, x="status", y="age")
ax.set_title("Age - Boxplot")
plt.show()

##### Violinplot

In [None]:
# Age distribution for each status value, drawn as a violin plot.
ax = sns.violinplot(data=df, x="status", y="age")
ax.set_title("Age - Violinplot")
plt.show()

###### observations:
 1. All women younger than 38 survived.
 2. All women older than 78 did not survive breast cancer.
 3. Women in the 50 - 56 age group are prone to suffering from breast cancer.
 
 * Age is not a great feature for classification

#### Year

##### PDF - CDF

In [None]:
# Year distribution over all patients, estimated from a 10-bin histogram.
counts, bins = np.histogram(df["year"], bins=10)
pdf = counts / counts.sum()  # fraction of patients falling in each bin
cdf = pdf.cumsum()           # running total of those fractions

# Both curves are drawn against the right-hand edge of each bin.
right_edges = bins[1:]
fig, ax = plt.subplots()
ax.plot(right_edges, pdf, label="PDF")
ax.plot(right_edges, cdf, label="CDF")
ax.set_title("Year - PDF & CDF")
ax.set_xlabel("Year")
ax.set_ylabel("Probability")
ax.legend()
plt.show()


In [None]:
# Year histograms computed separately for each status value.
# The names bins1/pdf1/cdf1 and bins2/pdf2/cdf2 are read again by the
# side-by-side plotting cell that follows, so they are kept as-is.

# Status 1
year_status1 = df.loc[df["status"] == 1, "year"]
counts1, bins1 = np.histogram(year_status1, bins=10)
pdf1 = counts1 / counts1.sum()
cdf1 = pdf1.cumsum()

# Status 2
year_status2 = df.loc[df["status"] == 2, "year"]
counts2, bins2 = np.histogram(year_status2, bins=10)
pdf2 = counts2 / counts2.sum()
cdf2 = pdf2.cumsum()

# Overlay all four curves on one set of axes; each is drawn at its own
# right-hand bin edges.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(bins1[1:], pdf1, label="PDF- Status- 1")
ax.plot(bins1[1:], cdf1, label="CDF- Status- 1")
ax.plot(bins2[1:], pdf2, label="PDF- Status- 2")
ax.plot(bins2[1:], cdf2, label="CDF- Status- 2")
ax.set_title("Year - PDF & CDF - Separated view")
ax.set_xlabel("Year")
ax.set_ylabel("Probability")
ax.legend()
plt.show()


In [None]:
# Side-by-side view of the per-status year curves, one panel per status.
# Reads bins1/pdf1/cdf1 and bins2/pdf2/cdf2 computed in the previous cell.
panels = [
    (1, bins1, pdf1, cdf1),
    (2, bins2, pdf2, cdf2),
]

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, (status, edges, pdf_s, cdf_s) in zip(axes, panels):
    # Curves are drawn against the right-hand edge of each bin.
    ax.plot(edges[1:], pdf_s, label=f"PDF- Status- {status}")
    ax.plot(edges[1:], cdf_s, label=f"CDF- Status- {status}")
    ax.set_xlabel("Year")
    ax.set_ylabel("Probability")
    # A descriptive title instead of a bare "1"/"2" so each panel stands alone.
    ax.set_title(f"Status {status}")
    ax.legend()

fig.suptitle("Year - PDF & CDF - Compared view")
plt.show()

##### Boxplot

In [None]:
# Year spread for each status value, drawn as a box plot.
ax = sns.boxplot(data=df, x="status", y="year")
ax.set_title("Year - Boxplot")
plt.show()

##### Violinplot

In [None]:
# Year distribution for each status value, drawn as a violin plot.
ax = sns.violinplot(data=df, x="status", y="year")
ax.set_title("Year - Violinplot")
plt.show()

##### Observations
1. After the 60s, the breast cancer rate decreased drastically
2. The survival rate also increased at that time
3. From year 1961 to 1965, the death rate increased more than the survival rate

* Year is not a great classifier for status

#### Nodes

##### PDF - CDF

In [None]:
# Nodes distribution over all patients, estimated from a 10-bin histogram.
counts, bins = np.histogram(df["nodes"], bins=10)
pdf = counts / counts.sum()  # fraction of patients falling in each bin
cdf = pdf.cumsum()           # running total of those fractions

# Both curves are drawn against the right-hand edge of each bin.
right_edges = bins[1:]
fig, ax = plt.subplots()
ax.plot(right_edges, pdf, label="PDF")
ax.plot(right_edges, cdf, label="CDF")
ax.set_title("Nodes - PDF & CDF")
ax.set_xlabel("Nodes")
ax.set_ylabel("Probability")
ax.legend()
plt.show()


In [None]:
# Nodes histograms computed separately for each status value.
# The names bins1/pdf1/cdf1 and bins2/pdf2/cdf2 are read again by the
# side-by-side plotting cell that follows, so they are kept as-is.

# Status 1
nodes_status1 = df.loc[df["status"] == 1, "nodes"]
counts1, bins1 = np.histogram(nodes_status1, bins=10)
pdf1 = counts1 / counts1.sum()
cdf1 = pdf1.cumsum()

# Status 2
nodes_status2 = df.loc[df["status"] == 2, "nodes"]
counts2, bins2 = np.histogram(nodes_status2, bins=10)
pdf2 = counts2 / counts2.sum()
cdf2 = pdf2.cumsum()

# Overlay all four curves on one set of axes; each is drawn at its own
# right-hand bin edges.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(bins1[1:], pdf1, label="PDF- Status- 1")
ax.plot(bins1[1:], cdf1, label="CDF- Status- 1")
ax.plot(bins2[1:], pdf2, label="PDF- Status- 2")
ax.plot(bins2[1:], cdf2, label="CDF- Status- 2")
ax.set_title("Nodes - PDF & CDF - Separated view")
ax.set_xlabel("Nodes")
ax.set_ylabel("Probability")
ax.legend()
plt.show()


In [None]:
# Side-by-side view of the per-status nodes curves, one panel per status.
# Reads bins1/pdf1/cdf1 and bins2/pdf2/cdf2 computed in the previous cell.
panels = [
    (1, bins1, pdf1, cdf1),
    (2, bins2, pdf2, cdf2),
]

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, (status, edges, pdf_s, cdf_s) in zip(axes, panels):
    # Curves are drawn against the right-hand edge of each bin.
    ax.plot(edges[1:], pdf_s, label=f"PDF- Status- {status}")
    ax.plot(edges[1:], cdf_s, label=f"CDF- Status- {status}")
    ax.set_xlabel("Nodes")
    ax.set_ylabel("Probability")
    # A descriptive title instead of a bare "1"/"2" so each panel stands alone.
    ax.set_title(f"Status {status}")
    ax.legend()

fig.suptitle("Nodes - PDF & CDF - Compared view")
plt.show()

##### Boxplot

In [None]:
# Nodes spread for each status value, drawn as a box plot.
ax = sns.boxplot(data=df, x="status", y="nodes")
ax.set_title("Nodes - Boxplot")
plt.show()

##### Violinplot

In [None]:
# Nodes distribution for each status value, drawn as a violin plot.
ax = sns.violinplot(data=df, x="status", y="nodes")
ax.set_title("Nodes - Violinplot")
plt.show()

##### observation
1. Most of the patients had cancer in fewer than 10 nodes
2. The fewer the nodes, the higher the chance of survival

* We can't classify status with nodes alone

### 5. Bi-variate Analysis

#### Age vs Year

In [None]:
# Scatter of age against year, with points coloured by status.
sns.set_style("whitegrid")

# `height` replaces the old `size` keyword, which seaborn deprecated in 0.9
# and later removed; passing `size` raises a TypeError on current releases.
grid = sns.FacetGrid(df, hue="status", height=6)
grid.map(plt.scatter, "age", "year")
grid.add_legend()

plt.title("Age vs. Year")
plt.show()

##### Observations
1. between the year 1965 and 1966 the survival rate was more than death rate

* We didn't find any pattern here to classify them into groups

#### Age vs Nodes

In [None]:
# Scatter of age against nodes, with points coloured by status.
sns.set_style("whitegrid")

# `height` replaces the old `size` keyword, which seaborn deprecated in 0.9
# and later removed; passing `size` raises a TypeError on current releases.
grid = sns.FacetGrid(df, hue="status", height=6)
grid.map(plt.scatter, "age", "nodes")
grid.add_legend()

plt.title("Age vs. Nodes")
plt.show()

##### Observation

* We didn't find any pattern here to classify them into status 

#### nodes vs Year

In [None]:
# Scatter of nodes against year, with points coloured by status.
sns.set_style("whitegrid")

# `height` replaces the old `size` keyword, which seaborn deprecated in 0.9
# and later removed; passing `size` raises a TypeError on current releases.
grid = sns.FacetGrid(df, hue="status", height=6)
grid.map(plt.scatter, "nodes", "year")
grid.add_legend()

plt.title("Nodes vs. Year")
plt.show()

##### Observations

* We didn't find any pattern here to classify them

#### Pairplot

In [None]:
# Pairwise scatter plots of every column pair, coloured by status.
sns.pairplot(df, hue="status")
# Title spelling corrected to the dataset's name ("Haberman"). y is set just
# above 1 so the title sits above the top row of panels.
plt.suptitle("Haberman - Pairplot", y=1.005)
plt.show()

###### Observation
1. We can see that status 1 and status 2 are not separable in any plot. We can't classify them in any of these plots

### 6. Multivariate Analysis

In [None]:
# Age vs. Nodes vs. Year, coloured by Status

# Status is a two-valued class label, so it is cast to str for plotting:
# plotly then draws a discrete legend instead of a continuous colour bar.
# `assign` returns a new frame, so `df` itself is left unchanged.
px.scatter_3d(
    df.assign(status=df["status"].astype(str)),
    x="age",
    y="nodes",
    z="year",
    color="status",
    title="Age vs. Nodes vs. Year",
)



###### Observation

* After plotting all the variables together we couldn't find any pattern to classify them into status

## Conclusion

* We can't classify the status based on these 3 variables. 
* We would need more variables to classify them.