In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis : *Haberman’s Cancer Survival Dataset*

# About Dataset : 

1. Title: Haberman's Survival Data

2. Sources:
   (a) Donor:   Tjen-Sien Lim (limt@stat.wisc.edu)
   (b) Date:    March 4, 1999

3. Past Usage:
   1. Haberman, S. J. (1976). Generalized Residuals for Log-Linear
      Models, Proceedings of the 9th International Biometrics
      Conference, Boston, pp. 104-122.
   2. Landwehr, J. M., Pregibon, D., and Shoemaker, A. C. (1984),
      Graphical Models for Assessing Logistic Regression Models (with
      discussion), Journal of the American Statistical Association 79:
      61-83.
   3. Lo, W.-D. (1993). Logistic Regression Trees, PhD thesis,
      Department of Statistics, University of Wisconsin, Madison, WI.

4. Relevant Information:
   The dataset contains cases from a study that was conducted between
   1958 and 1970 at the University of Chicago's Billings Hospital on
   the survival of patients who had undergone surgery for breast
   cancer.

5. Number of Instances: 306

6. Number of Attributes: 4 (including the class attribute)

7. Attribute Information:
   1. Age of patient at time of operation (numerical)
   2. Patient's year of operation (year - 1900, numerical)
   3. Number of positive axillary nodes detected (numerical)
   4. Survival status (class attribute)
         1 = the patient survived 5 years or longer
         2 = the patient died within 5 years

8. Missing Attribute Values: None

### Objective : *To predict the survival status of a patient who has undergone surgery for breast cancer*

# Importing the libraries that we will need

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")

### Loading the haberman.csv dataset into a pandas DataFrame and storing it in a variable (df_haberman).


In [None]:
# Read the CSV file into a DataFrame.
# Column names are passed explicitly because the dataset has no column names by default.
column_names = ['Age', 'Year', 'Axillary nodes dect', 'Survstatus']
df_haberman = pd.read_csv(
    '../input/habermans-survival-data-set/haberman.csv',
    names=column_names,
)

In [None]:
# Show the first 5 rows of the dataset.
df_haberman.head(5)

In [None]:
# Q) How many data points (rows) and features (columns) does the dataset have?
n_rows, n_columns = df_haberman.shape
(n_rows, n_columns)

In [None]:
# Q) What are the column (feature) names of the dataset?
feature_names = df_haberman.columns
feature_names

In [None]:
# Q) How many data points fall into each class of the survival status?
class_counts = df_haberman['Survstatus'].value_counts()
class_counts
# Survstatus has 2 classes:
#   - patients who survived after the surgery
#   - patients who died after the surgery

# Balanced vs imbalanced dataset
# Haberman is a partially imbalanced dataset because the number of data points
# is not the same for each class.
# What is an imbalanced dataset?
# Imagine a dataset with only 2 classes where one class has 900 data points and
# the other has only 100: such a dataset is called imbalanced.
# Example: a hospital dataset with the classes cancer / non-cancer, where most
# of the data points belong to non-cancer patients.

# Simple Plotting Tools

In [None]:
# 2-D scatter plot of Age against the number of detected axillary nodes.
df_haberman.plot.scatter(x='Age', y='Axillary nodes dect')

# Key point: always check the axis labels and the scale of a graph.
# This plot alone does not tell us much, because all points look the same.
# Next step: colour the points by their class label (survival status).

In [None]:
# The seaborn library can draw a similar plot.
# A jointplot combines the bivariate scatter plot with the univariate histograms
# of both variables in a single figure.
sns.jointplot(data=df_haberman, x='Age', y='Axillary nodes dect')

In [None]:
# 2-D scatter plot with one colour per survival status.
sns.set_style('whitegrid')
# `height` is the current name of the facet-size argument; the old `size`
# keyword was deprecated and is rejected by recent seaborn releases.
scatter_grid = sns.FacetGrid(df_haberman, hue='Survstatus', height=12)
scatter_grid.map(plt.scatter, 'Age', 'Axillary nodes dect')
scatter_grid.add_legend()
plt.show()
# Survstatus = 1 means survived (blue points)
# Survstatus = 2 means not survived (orange points)

# Observation :
1. *Using the Age and Axillary nodes dect features, we can't tell whether a patient with more axillary nodes survived or not.*
2. *One thing we can see is that patients aged between 30 and 40 have a higher chance of survival after the surgery.*
3. *Patients aged between 60 and 70 have a lower chance of survival after the surgery.*
4. *Patients with 0 nodes have a better chance of survival irrespective of their age.*

# Since we can't draw a 4-D scatter plot, we need a trick to visualize all the features at once
# One such trick is the Pair-Plot


In [None]:
# Pairwise scatter plots: Pair-Plot.
# One disadvantage of the pair plot: it can't be used when the number of features is high.
sns.set_style('whitegrid')
# `height` replaces the deprecated `size` keyword, which recent seaborn releases reject.
sns.pairplot(df_haberman, hue='Survstatus', height=7)
plt.show()

# Observation :
1. We can't conclude from the plots whether younger patients, or patients with fewer nodes, are the ones who survived.
2. Only the plot of Year against Axillary nodes is comparatively better.

# 1-D scatter plot using just one Feature

In [None]:
# 1-D scatter plot of "Age": every patient is drawn on the line y = 0.
# `survived` / `not_survived` hold the rows of each survival class.
survived = df_haberman.loc[df_haberman['Survstatus'] == 1]
not_survived = df_haberman.loc[df_haberman['Survstatus'] == 2]
for class_rows, marker_style, class_label in (
    (survived, 'ro', 'survived'),
    (not_survived, 'go', 'not_survived'),
):
    ages = class_rows['Age']
    plt.plot(ages, np.zeros_like(ages), marker_style, label=class_label)
plt.legend()
plt.show()
# Disadvantage: 1-D scatter plots are very hard to read because the points overlap a lot.

# Histogram and PDF
1. PDF (Probability Density Function):        
*In probability theory, a probability density function, or density of a continuous random variable, is a function whose value at any given sample in the sample space can be interpreted as providing a relative likelihood that the value of the random variable would equal that sample. It is the measure of the percentage of distribution for a certain range of values.*

In [None]:
# A histogram is easier to read than a 1-D scatter plot because it shows how
# many data points lie between two values.
age_grid = sns.FacetGrid(df_haberman, hue='Survstatus')
age_grid.map(sns.distplot, 'Age')
age_grid.add_legend()

# Observation:
1. The plot gives no clear view of the ages at which patients survive or do not survive.
2. We can only see that patients younger than 40 have a higher chance of survival.

In [None]:
# Same per-class histogram as for Age, now for the year of operation.
year_grid = sns.FacetGrid(df_haberman, hue='Survstatus')
year_grid.map(sns.distplot, 'Year')
year_grid.add_legend()

# Observation:
1. In this plot we can see that the two classes overlap each other. Hence we can't make any observation from it.

In [None]:
# Per-class histogram of the number of detected axillary nodes.
# `height` replaces the deprecated `size` keyword, which recent seaborn releases reject.
nodes_grid = sns.FacetGrid(df_haberman, hue='Survstatus', height=7)
nodes_grid.map(sns.distplot, 'Axillary nodes dect')
nodes_grid.add_legend()

# Observation:
1. Patients with 0 or 1 nodes have a higher chance of survival.
2. Patients with more than 20 nodes have a low chance of survival.

# CDF (Cumulative Distribution Function):                
*The Cumulative Distribution Function (CDF) is the probability that the variable takes a value less than or equal to x.*

In [None]:
# Plot the binned PDF and the CDF of patient Age for both survival classes
# on one set of axes, with every curve labelled by its class.
fig, ax = plt.subplots()
for group_name, group in (('survived', survived), ('not_survived', not_survived)):
    # np.histogram returns a density per bin; dividing by the total turns it
    # into the fraction of patients per bin, so the cumulative sum ends at 1.
    counts, bin_edges = np.histogram(group['Age'], bins=10, density=True)
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    # Each value is drawn at the right edge of its bin.
    ax.plot(bin_edges[1:], pdf, label='pdf (' + group_name + ')')
    ax.plot(bin_edges[1:], cdf, label='cdf (' + group_name + ')')
ax.set_title("Pdf and Cdf of patients' Age: survived vs not_survived")
ax.set_xlabel('Age')
ax.legend()
plt.show()

In [None]:
# Plot the binned PDF and the CDF of the number of axillary nodes for both
# survival classes on one set of axes, printing the numbers behind each curve.
fig, ax = plt.subplots()
groups = (('survived', survived), ('not_survived', not_survived))
for position, (group_name, group) in enumerate(groups):
    if position:
        # Separator between the printed numbers of the two classes.
        print('***********************************&********')
    # np.histogram returns a density per bin; dividing by the total turns it
    # into the fraction of patients per bin, so the cumulative sum ends at 1.
    counts, bin_edges = np.histogram(group['Axillary nodes dect'], bins=10, density=True)
    pdf = counts / counts.sum()
    print("pdf:", pdf)
    print("bins:", bin_edges)
    cdf = np.cumsum(pdf)
    print("cdf:", cdf)
    # Labels are attached to the curves themselves so the legend cannot get
    # out of sync with the plotting order.
    ax.plot(bin_edges[1:], pdf, label='pdf (' + group_name + ')')
    ax.plot(bin_edges[1:], cdf, label='cdf (' + group_name + ')')
ax.set_title("Pdf and Cdf of patients' Axillary nodes dect: survived vs not_survived")
ax.set_xlabel('Axillary nodes dect')
ax.legend()
plt.show()

# Observation :
1. 83.55% of the patients who survived had nodes in the range of 0-5.

#### Such information cannot be read off the pdf; reading the cdf values tells us how accurate a simple if-else model would be

# Box Plots:
### Box-plot with Whiskers : another simple method of visualizing the 1-D scatterplot more intuitively
### It uses the concept of mean , median ,Percentile and Quantile

In [None]:
# One box plot per feature, split by survival status; each is shown as its own figure.
for feature in ('Age', 'Year', 'Axillary nodes dect'):
    sns.boxplot(data=df_haberman, x='Survstatus', y=feature)
    plt.show()

# VIOLIN Plot
1. *Violinplot is the combination of the histogram with pdf and Box-Plot.*
1. *Denser region of the data are fatter and Sparser ones are thinner in violin plot.*

In [None]:
# One violin plot per feature, split by survival status; each is shown as its own figure.
# Note: violinplot has no `height` argument (figure size is controlled through
# matplotlib, e.g. plt.figure(figsize=...)), so the stray keyword was removed.
for feature in ('Age', 'Year', 'Axillary nodes dect'):
    sns.violinplot(x='Survstatus', y=feature, data=df_haberman)
    plt.show()

# Observations: From the above cdf and pdf plots.
1. Patients younger than 40 have a higher chance of survival. There are comparatively more patients aged between 45 and 60 who did not survive.
2. Patients with more than 1 node are less likely to survive. The more nodes a patient has, the lower the chance of survival.

# Conclusions:
1. A patient’s age and operation year alone are not deciding factors for his/her survival. Yet, patients younger than 35 have a better chance of survival.
2. The chance of survival decreases as the number of positive axillary nodes increases. We also saw that the absence of positive axillary nodes cannot always guarantee survival.
3. The objective of classifying the survival status of a new patient based on the given features is a difficult task as the data is imbalanced.