# About Dataset: 

### The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

# Attribute Information:

### a. Age of patient at time of operation (numerical)
### b. Patient's year of operation (year - 1900, numerical)
### c. Number of positive axillary nodes detected (numerical)
### d. Survival status (class attribute):
###      1 = the patient survived 5 years or longer
###      2 = the patient died within 5 years

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [None]:
# The CSV ships without a header row, so the column names are supplied here.
haberman_columns = [
    "Age",
    "Operation_Year",
    "Axil_nodes",
    "Survival_Status_After_5_Years",
]
df_haberman = pd.read_csv(
    r"../input/habermans-survival-data-set/haberman.csv",
    names=haberman_columns,
)
# Preview the first rows as the cell output.
df_haberman.head()

### Mapping Survival_Status_After_5_Years values from 1/2 to Yes/No for better clarity

In [None]:
# Replace the numeric class codes with readable labels.
survival_labels = {1: "Yes", 2: "No"}
df_haberman["Survival_Status_After_5_Years"] = (
    df_haberman["Survival_Status_After_5_Years"].map(survival_labels)
)
df_haberman.head()

In [None]:
# Summary statistics of the dataframe, rendered as the cell output.
df_haberman.describe()

### Total surgeries = 306
### Age of patients varies from 30 to 83. Variance is highest for Age
### Operations from year 1958-1969 are considered
### Minimum positive axil nodes is 0 and maximum is 52. Maximum is much higher than the 75th percentile value

In [None]:
# Count the nulls per column once, then report and plot the same result.
missing_per_column = df_haberman.isnull().sum()
print("Missing values in Data :")
print(missing_per_column)
missing_per_column.plot()

### No missing values in any column

In [None]:
# Print how many patients fall into each survival class, then plot the counts.
print("No. of datapoints for each Survival_Status after 5 years class:")
dict_Survival = df_haberman.Survival_Status_After_5_Years.value_counts().to_dict()
for status, n_patients in dict_Survival.items():
    print(status, "-", n_patients)

# Name the column via x= and pass the frame via data=: a bare positional
# argument is interpreted as `data` by current seaborn releases, which would
# not draw the per-class counts intended here.
sns.countplot(x="Survival_Status_After_5_Years", data=df_haberman, color='blue')

### It can be seen that Dataset is imbalanced between the two classes

# Pairplot to visualize feature relationships

In [None]:
# Pairwise scatter plots of the numeric features, coloured by survival class.
# `height` is the current name of the former `size` argument (renamed in
# seaborn 0.9); it sets the height of each facet in inches.
sns.pairplot(df_haberman, hue="Survival_Status_After_5_Years", height=3)

### It can be seen that the classes of Survival_Status are not linearly separable if only two features are considered, and the pair plot does not give any idea of a decision boundary.
### Lot of overlap is found between the two classes.

# Univariate Analysis

# Plotting PDFs to get more information

In [None]:
print("PDF for Age Column :--")
# One overlaid distribution per survival class.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to sns.histplot(kde=True, stat="density").
(
    sns.FacetGrid(df_haberman, hue="Survival_Status_After_5_Years", height=5)
    .map(sns.distplot, "Age")
    .add_legend()
)

### Age feature shows lot of overlap between the two classes.
### It can be seen that people with age 45-60 have undergone the cancer surgery the most.
### PDF is almost normally distributed

In [None]:
print("PDF for Operation_Year Column :--")

# One overlaid distribution per survival class.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to sns.histplot(kde=True, stat="density").
(
    sns.FacetGrid(df_haberman, hue="Survival_Status_After_5_Years", height=5)
    .map(sns.distplot, "Operation_Year")
    .add_legend()
)

### Operation_Year feature shows lot of overlap between the two classes.
### The blue peak shows that the highest number of surgeries of people who survived were performed between 1958-1962.
### The orange peak shows that the highest number of surgeries of people who did not survive were performed between 1963-1967.

In [None]:
print("PDF for Axil_nodes Column :--")
# Bin edges every 5 nodes: 0, 5, ..., 55.
binn = list(range(0, 60, 5))
# One overlaid distribution per survival class.
# `height` replaces the former `size` argument (renamed in seaborn 0.9).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to sns.histplot(kde=True, stat="density").
(
    sns.FacetGrid(df_haberman, hue="Survival_Status_After_5_Years", height=5)
    .map(sns.distplot, "Axil_nodes", bins=binn)
    .add_legend()
)

### Axil_nodes feature shows lot of overlap between the two classes.
### It can be seen that most people who have survived/not survived after the surgery had 0-5 positive axil nodes
### Both the distributions are skewed

# Plotting CDF

In [None]:
# Draw one empirical CDF of Age per survival class, each in its own figure.
# Each entry pairs a class label with the x-axis caption of its figure.
age_cdf_figures = (
    ("Yes", "Age of people who survived for 5 or more years after Surgery -->"),
    ("No", "Age of people who did not survive even for 5 years after Surgery -->"),
)
for status, caption in age_cdf_figures:
    ages = df_haberman.loc[
        df_haberman["Survival_Status_After_5_Years"] == status, "Age"
    ]

    # 10-bin histogram; bins_count holds the 11 bin edges.
    count, bins_count = np.histogram(ages, bins=10, density=True)

    # Normalise the bin heights so they sum to 1, then accumulate into a CDF.
    pdf = count / sum(count)
    cdf = np.cumsum(pdf)

    # Each CDF value is plotted at the right edge of its bin.
    plt.plot(bins_count[1:], cdf, label="CDF")
    plt.xlabel(caption, fontdict={'fontsize': 10})
    plt.legend()
    plt.show()

### 80% of the people who had survived had age < 65 years
### 80% of the people who had not survived had age < 65 years

In [None]:
print("Operation year range :-")
survived_mask = df_haberman["Survival_Status_After_5_Years"] == "Yes"
died_mask = df_haberman["Survival_Status_After_5_Years"] == "No"
seriesYes = df_haberman.loc[survived_mask, "Operation_Year"]
seriesNo = df_haberman.loc[died_mask, "Operation_Year"]
# Years are stored as two-digit offsets from 1900, hence the "19" prefix.
print(f"People who survived: 19{seriesYes.min()} - 19{seriesYes.max()}")
print(f"People who did not survive: 19{seriesNo.min()} - 19{seriesNo.max()}")

# Draw one empirical CDF of Operation_Year per survival class.
operation_year_figures = (
    (seriesYes, "Operation_Year of people who survived for 5 or more years after Surgery -->"),
    (seriesNo, "Operation_Year of people who did not survive even for 5 years after Surgery -->"),
)
for operation_years, caption in operation_year_figures:
    # 10-bin histogram; bins_count holds the 11 bin edges.
    count, bins_count = np.histogram(operation_years, bins=10, density=True)

    # Normalise the bin heights so they sum to 1, then accumulate into a CDF.
    pdf = count / sum(count)
    cdf = np.cumsum(pdf)

    # Each CDF value is plotted at the right edge of its bin.
    plt.plot(bins_count[1:], cdf, label="CDF")
    plt.xlabel(caption, fontdict={'fontsize': 10})
    plt.legend()
    plt.show()

### Around 75% of the operations were done between 1958-1966 for the people who survived
### Around 82% of the operations were done between 1958-1966 for the people who did not survive

In [None]:
 # getting data of the histogram
# Draw one empirical CDF of Axil_nodes per survival class, each in its own figure.
axil_cdf_figures = (
    ("Yes", "Axil_nodes of people who survived for 5 or more years after Surgery -->"),
    ("No", "Axil_nodes of people who did not survive even for 5 years after Surgery -->"),
)
for status, caption in axil_cdf_figures:
    axil_nodes = df_haberman.loc[
        df_haberman["Survival_Status_After_5_Years"] == status, "Axil_nodes"
    ]

    # 10-bin histogram; bins_count holds the 11 bin edges.
    count, bins_count = np.histogram(axil_nodes, bins=10, density=True)

    # Normalise the bin heights so they sum to 1, then accumulate into a CDF.
    pdf = count / sum(count)
    cdf = np.cumsum(pdf)

    # Each CDF value is plotted at the right edge of its bin.
    plt.plot(bins_count[1:], cdf, label="CDF")
    plt.xlabel(caption, fontdict={'fontsize': 10})
    plt.legend()
    plt.show()

### As stated earlier, the first plot clearly shows that 85% of the people who survived for 5 or more years after surgery had <=5 positive axillary nodes.
### The second curve shows that 60% of the people who did not survive had around 0-5 positive axillary nodes.

# Outlier Analysis

In [None]:
# Box plot of Age, one box per survival class.
sns.boxplot(
    data=df_haberman,
    x="Survival_Status_After_5_Years",
    y="Age",
)
plt.show()

### No Outliers in Age column

In [None]:
# Box plot of Operation_Year, one box per survival class.
sns.boxplot(
    data=df_haberman,
    x="Survival_Status_After_5_Years",
    y="Operation_Year",
)
plt.show()

### No Outliers in Operation_Year column
### The patients treated after 1965 have a slightly higher chance to survive than the rest. The patients treated before 1960 have a slightly lower chance to survive than the rest.

In [None]:
# Box plot of Axil_nodes, one box per survival class.
sns.boxplot(
    data=df_haberman,
    x="Survival_Status_After_5_Years",
    y="Axil_nodes",
)
plt.show()

### Outliers are there in Axil_nodes column
### Patients having more than 8 positive axillary nodes have less probability of survival.

In [None]:
def detect_outlier_IQR(data):
    """Return how many values of *data* are NOT outliers under the 1.5*IQR rule.

    Despite the name, the result is the count of retained (inlier) values;
    callers subtract it from the total to obtain the number of outliers.

    A value is an outlier when it lies strictly below Q1 - 1.5*IQR or strictly
    above Q3 + 1.5*IQR, where Q1 and Q3 are the 25th and 75th percentiles.

    Args:
        data: pandas Series of numeric values.

    Returns:
        Number of values that fall within the fences.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    below_fence = data < (lower_quartile - whisker)
    above_fence = data > (upper_quartile + whisker)

    # Keep everything not flagged on either side.
    inliers = data[~(below_fence | above_fence)]
    return len(inliers)

In [None]:
# Axil_nodes of the patients who survived; filtered once and reused below.
axil_nodes_survived = df_haberman.loc[
    df_haberman["Survival_Status_After_5_Years"] == "Yes", "Axil_nodes"
]
beforeRemovingOutliersYes = axil_nodes_survived.count()
# detect_outlier_IQR returns the number of values kept, not the number removed.
afterRemovingOutliersYes = detect_outlier_IQR(axil_nodes_survived)
print(
    "No. of outliers in Axial Nodes column for people who survived for 5 or more years after surgery =",
    beforeRemovingOutliersYes - afterRemovingOutliersYes,
)

In [None]:
# Axil_nodes of the patients who died within 5 years; filtered once and reused below.
axil_nodes_died = df_haberman.loc[
    df_haberman["Survival_Status_After_5_Years"] == "No", "Axil_nodes"
]
beforeRemovingOutliersNo = axil_nodes_died.count()
# detect_outlier_IQR returns the number of values kept, not the number removed.
afterRemovingOutliersNo = detect_outlier_IQR(axil_nodes_died)
print(
    "No. of outliers in Axial Nodes column for people who died within 5 years after surgery =",
    beforeRemovingOutliersNo - afterRemovingOutliersNo,
)