In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides library deprecation warnings - consider narrowing the filter.
warnings.filterwarnings("ignore") 

In [None]:
# First look at the raw file using pandas' default header handling
# (the first row of the CSV is used for the column labels).
raw_csv_path = "../input/habermans-survival-data-set/haberman.csv"
haberman = pd.read_csv(raw_csv_path)

### Objective - To predict whether a patient will survive for 5 or more years after the operation, based on the patient's age, the year of operation, and the number of problematic nodes

In [None]:
# Show the column labels that the load above produced.
inferred_columns = haberman.columns
print(inferred_columns)

In [None]:
# Reload telling pandas there is no header row, so the first record is kept
# as data and the columns receive default integer labels.
haberman = pd.read_csv(
    "../input/habermans-survival-data-set/haberman.csv",
    header=None,
)

#### Col 0 = Age of Patient
#### Col 1 = Year of Operation
#### Col 2 = Number of problematic nodes detected
#### Col 3 = Survival Status - > 
####             Value is 1; if patient lived for 5 or more years
####             Value is 2; if patient lived for less than 5 years


In [None]:
# The CSV ships without a meaningful header row, so descriptive column names
# are supplied while loading.
column_names = [
    "Age of patient",
    "Year of operation",
    "No. of Problematic nodes",
    "Survived for 5 or more years",
]
haberman = pd.read_csv(
    "../input/habermans-survival-data-set/haberman.csv",
    header=None,
    names=column_names,
)

In [None]:
# (number of rows, number of columns) of the loaded frame.
haberman.shape

In [None]:
# Column dtypes and non-null counts (printed by info itself).
haberman.info()
# Summary statistics; kept as the cell's last expression so the notebook renders the table.
summary_stats = haberman.describe()
summary_stats

#### Observations:
#### 1. No null values present for any of the 4 columns in CSV file
#### 2. Average age of patient is 52 to 53 years
#### 3. Average number of problematic nodes per patient is 4
#### 4. Median (50th percentile) age is 52 years

In [None]:
# Recode the survival status from the dataset's numeric codes to labels
# (1 -> "yes": lived 5 or more years, 2 -> "no": lived less than 5 years).
# `replace` leaves values that are not keys of the mapping untouched, so
# re-running this cell once the column already holds "yes"/"no" is a no-op.
# With `map`, a second run would turn every value into NaN.
status_labels = {1: "yes", 2: "no"}
haberman['Survived for 5 or more years'] = haberman['Survived for 5 or more years'].replace(status_labels)

In [None]:
# Class balance: number of "yes" and "no" records.
# The survival status is taken as the last column of the frame.
survival_status = haberman.iloc[:, -1]
print(survival_status.value_counts())


#### Observation
#### 1. More yes than no. So, dataset is not balanced

# Objective - To predict whether a patient will survive for 5 or more years after the operation, based on the patient's age, the year of operation, and the number of problematic nodes

In [None]:
# Pair plot of the numeric columns against each other, coloured by survival status.

plt.close()  # close the current figure, if any, before drawing a new one
sns.set_style("whitegrid")
# `height` is the current name of the keyword seaborn used to call `size`;
# the old spelling is deprecated and the global warning filter hides the notice.
pairplotGraph = sns.pairplot(haberman, hue="Survived for 5 or more years", height=5)
pairplotGraph.fig.suptitle("Haberman 2d Plot", y=1)

plt.show()

#### Observation
#### 1. None of the parameter combinations gives a clear separation. Comparatively, Year of operation vs. No. of problematic nodes gives a better separation than the other pairs.


In [None]:

sns.set_style("whitegrid")
# Scatter of node count (y) against year of operation (x), one colour per class.
# `height` replaces FacetGrid's deprecated `size` keyword.
g = sns.FacetGrid(haberman, hue="Survived for 5 or more years", height=5)
g.map(plt.scatter, "Year of operation", "No. of Problematic nodes")
g.add_legend()
g.fig.suptitle("Problematic Nodes Vs Year of operation")

plt.show()


In [None]:
sns.set_style("whitegrid")
# Same scatter with the axes swapped: node count on x, year of operation on y.
# `height` replaces FacetGrid's deprecated `size` keyword.
g = sns.FacetGrid(haberman, hue="Survived for 5 or more years", height=5)
g.map(plt.scatter, "No. of Problematic nodes", "Year of operation")
g.add_legend()
g.fig.suptitle("Rotate - Problematic Nodes Vs Year of operation")
plt.show()

In [None]:
# Interactive 3-D scatter of the three features, coloured by survival status.
# `px` (plotly.express) is imported with the other dependencies at the top of the notebook.
fig = px.scatter_3d(
    haberman,
    x='Year of operation',
    y='No. of Problematic nodes',
    z='Age of patient',
    color='Survived for 5 or more years',
)

fig.show()

#### Observation on 3D plot -
#### 1. Not much help in identifying a clear separation
#### 2. The points are less scattered along the Year of operation axis when compared with the other two parameters

# Univariate Analysis

In [None]:
# Per-class distribution of patient age (histogram with KDE overlay).
# `height` replaces FacetGrid's deprecated `size` keyword.
# NOTE(review): sns.distplot is itself deprecated in recent seaborn; consider histplot/displot.
g = sns.FacetGrid(haberman, hue='Survived for 5 or more years', height=5)
g.map(sns.distplot, "Age of patient")
g.add_legend()
# distplot normalises the histogram when it draws a KDE, so the y axis is a density, not a count.
g.set_ylabels('Density')
g.fig.suptitle("Histogram - Age of Patient")
plt.show()

#### Observation on Histogram plot on Age of Patient: Massive overlapping between classes

In [None]:
# Per-class distribution of the year of operation (histogram with KDE overlay).
# `height` replaces FacetGrid's deprecated `size` keyword.
# NOTE(review): sns.distplot is itself deprecated in recent seaborn; consider histplot/displot.
g = sns.FacetGrid(haberman, hue='Survived for 5 or more years', height=5)
g.map(sns.distplot, "Year of operation")
g.add_legend()
# distplot normalises the histogram when it draws a KDE, so the y axis is a density, not a count.
g.set_ylabels('Density')
g.fig.suptitle("Histogram - Year of operation")
plt.show()

#### Observation on Histogram plot on Year of operation: Medium to high overlapping between classes

In [None]:
# Per-class distribution of the number of problematic nodes (histogram with KDE overlay).
# `height` replaces FacetGrid's deprecated `size` keyword.
# NOTE(review): sns.distplot is itself deprecated in recent seaborn; consider histplot/displot.
g = sns.FacetGrid(haberman, hue='Survived for 5 or more years', height=5)
g.map(sns.distplot, "No. of Problematic nodes")
g.add_legend()
# distplot normalises the histogram when it draws a KDE, so the y axis is a density
# (area under the curve is 1), not a probability per bar.
g.set_ylabels('Density')
g.fig.suptitle("Histogram - No. of Problematic Nodes")
plt.show()

#### Observation on Histogram plot on No. of Problematic nodes: Separation exists between classes. A good choice of parameter towards the objective

# PDF and CDF

In [None]:
# Split the data by outcome; the two frames are reused by later cells.
haberman_Survived_for_5_or_more_years = haberman.loc[haberman["Survived for 5 or more years"]=="yes"]
haberman_Died_in_5_or_less_years = haberman.loc[haberman["Survived for 5 or more years"]=="no"]

# Density histogram of the node count for survivors over 10 equal-width bins.
counts, bin_edges = np.histogram(
    haberman_Survived_for_5_or_more_years['No. of Problematic nodes'],
    bins=10,
    density=True,
)

print(counts)
print(counts.sum())

# Rescale the bin densities so they sum to 1, giving the share of patients per bin.
pdf = counts / counts.sum()
print(pdf)
print(bin_edges)

# Running total of the per-bin shares; both curves are plotted against the
# right-hand edge of each bin.
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:], pdf, label='PDF')
plt.plot(bin_edges[1:], cdf, label='CDF')

plt.ylabel('Probability')
plt.xlabel('No. of Problematic nodes')
plt.title('Pdf/Cdf - Survived for 5 or more years')
# Legend so the two curves can be told apart.
plt.legend()

plt.show()

#### Observations
#### 1. About 80% of the patients who survived 5 or more years have fewer than 4 problematic nodes - PDF
#### 2. About a further 10% of the patients who survived 5 or more years have between roughly 4 and 9 problematic nodes - PDF
#### 3. Around 95% of the patients who survived 5 or more years have fewer than 10 nodes - CDF

In [None]:
# Overlay the PDF and CDF of the node count for both outcome groups.
# Every curve carries a legend label so the four lines can be told apart.
outcome_groups = [
    ('Survived 5 or more years', haberman_Survived_for_5_or_more_years),
    ('Died within 5 years', haberman_Died_in_5_or_less_years),
]

for outcome_label, outcome_frame in outcome_groups:
    # Density histogram over 10 equal-width bins, rescaled so the bins sum to 1.
    counts, bin_edges = np.histogram(outcome_frame['No. of Problematic nodes'], bins=10, density=True)
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    # Both curves are plotted against the right-hand edge of each bin.
    plt.plot(bin_edges[1:], pdf, label='PDF - ' + outcome_label)
    plt.plot(bin_edges[1:], cdf, label='CDF - ' + outcome_label)

plt.ylabel('Probability')
plt.xlabel('No. of Problematic nodes')
plt.title('Pdf/Cdf- Haberman dataset')
plt.legend()

plt.show()

#### Observation:
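#### 1. About 70% of the patients who died within 5 years had 11 or fewer problematic nodes (read from the CDF). Note that this is the share of that class with up to 11 nodes, not the probability that a patient with 11 nodes dies within 5 years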

# Mean and Median

In [None]:
# Mean, then median, of the node count: survivors first, non-survivors second.
nodes_by_outcome = (
    haberman_Survived_for_5_or_more_years['No. of Problematic nodes'],
    haberman_Died_in_5_or_less_years['No. of Problematic nodes'],
)

for statistic in (np.mean, np.median):
    for node_counts in nodes_by_outcome:
        print(statistic(node_counts))

In [None]:
#### Observation on Mean
#### 1. Patients who survived for 5 or more years have an average of 3 problematic nodes
#### 2. Patients who died within 5 years have an average of 7 problematic nodes

#### Observation on Median(50th percentile)
#### 1. Patients who survived for 5 or more years have a median of 0 problematic nodes. This is without factoring in any outliers (the median is robust to them)
#### 2. Patients who died within 5 years have a median of 4 problematic nodes. This is without factoring in any outliers


# Boxplots and Whiskers

In [None]:
# Box-and-whisker view of the node count for the two outcome groups.
# hue repeats the x variable, so it only colours the boxes.
box_axes = sns.boxplot(
    data=haberman,
    x='Survived for 5 or more years',
    y='No. of Problematic nodes',
    hue='Survived for 5 or more years',
    dodge=False,
)
box_axes.set_title('Overlapping View with Boxplots/Whiskers')
plt.show()

#### Observations:
#### 1. If the rule is set at y less than 5 nodes, there will be mistakes in prediction
#### 2. If the rule is set at y=8, then the accuracy of predicting that a patient died within 5 years is high

# Violin Plots 

In [None]:
# Violin view of the node count for the two outcome groups.
# hue repeats the x variable, so it only colours the violins.
violin_axes = sns.violinplot(
    data=haberman,
    x='Survived for 5 or more years',
    y='No. of Problematic nodes',
    hue='Survived for 5 or more years',
    dodge=False,
)
violin_axes.set_title('Violin View')
plt.show()

In [None]:
# Observations
# 1. The curve for patients who survived for 5 or more years has its highest density at 0 problematic nodes

## Final Conclusion
### 1. Number of problematic nodes vs. Year of operation gives better separation than any other scatter plot.
### 2. The dataset is not balanced, so predictions are more likely to be accurate for the majority class (here, patients who survived 5 or more years).