In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the Haberman survival CSV. header=None tells pandas the file has no header row,
# so the four column names are supplied explicitly.
data = pd.read_csv(
    "/kaggle/input/habermans-survival-data-set/haberman.csv",
    names=['age', 'year_of_treatment', 'positive_lymph_nodes', 'survival_5yr'],
    header=None,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's 
Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

In [None]:
# Shape of the dataset as a (rows, columns) tuple.
# We have four columns in total (3 features and 1 target column) and the sample size is 306.
data.shape

In [None]:
# Count missing values per column; there are no missing values in this dataset.
data.isna().sum()

In [None]:
# Print the per-column dtype summary.
# All the columns are of int datatype; there are no categorical values.
data.info()

## Feature information

Copied from prem's kaggle notebook:

age - Age of the patient at the time of the operation (numerical)

year_of_treatment - Year in which the operation took place (numerical)

positive_lymph_nodes - Number of positive axillary lymph nodes detected (numerical)

survival_5yr - Survival status: 1 = the patient survived 5 years or longer; 2 = the patient died within 5 years

In [None]:
# Preview the first five rows to see what the data looks like.
data.head(n=5)

In [None]:
# Class balance of the target: the dataset is somewhat imbalanced, more people survived.
survival_counts = data['survival_5yr'].value_counts()
survival_counts

In [None]:
# Report the number of distinct (non-missing) values in each column.
for column_name in data.columns:
    distinct_values = data[column_name].nunique()
    print(column_name, ":", distinct_values)
# year_of_treatment has comparatively few unique values;
# it will be interesting to see what impact it has on the target variable.

## Objective
Our objective is to determine whether, based on the data we have, we can classify whether a patient will survive or not.

In [None]:
# Quick summary statistics; we will explore more using visualization.
# Age and positive_lymph_nodes are highly dispersed, judging by their std.
data.describe()

In [None]:
# Recode the target from its numeric codes to readable labels:
#   1 -> "yes" (the patient survived 5 years or longer)
#   2 -> "no"  (the patient died within 5 years)
# The result is assigned back to the column rather than calling
# Series.replace(..., inplace=True) on data['survival_5yr']: that chained in-place form
# triggers a pandas warning and does not modify `data` when copy-on-write is active.
# The mapping is idempotent, so re-running this cell leaves existing labels untouched.
data['survival_5yr'] = data['survival_5yr'].replace({1: "yes", 2: "no"})
data['survival_5yr'].value_counts()

## Let's start with visualization
## Univariate analysis

In [None]:
# Bar chart of the target classes; again, it shows that the dataset is imbalanced.
count_ax = sns.countplot(x=data['survival_5yr'])
count_ax.set_title('Count Plot for Survival_5yr')
plt.show()

In [None]:
# Distribution of age per survival class: density-normalised histogram with a KDE overlay.
# sns.histplot(kde=True, stat="density") is used in place of sns.distplot, which is
# deprecated in seaborn and scheduled for removal; alpha keeps the overlapping classes readable.
(
    sns.FacetGrid(data, hue='survival_5yr', height=4)
    .map(sns.histplot, "age", kde=True, stat="density", alpha=0.4)
    .add_legend()
    .set_axis_labels("age", "Density")
)
plt.title('Distribution Plot for Age', y=1.05, fontsize=12)
plt.show()
# Looking at the distribution plot for age, not much can be inferred.

In [None]:
# Distribution of year_of_treatment per survival class: density-normalised histogram
# with a KDE overlay.
# sns.histplot(kde=True, stat="density") is used in place of sns.distplot, which is
# deprecated in seaborn and scheduled for removal; alpha keeps the overlapping classes readable.
(
    sns.FacetGrid(data, hue='survival_5yr', height=4)
    .map(sns.histplot, 'year_of_treatment', kde=True, stat="density", alpha=0.4)
    .add_legend()
    .set_axis_labels("year_of_treatment", "Density")
    .fig.suptitle('Distribution Plot for Year of Treatment')
)
plt.show()
# Looking at the distribution plot for year_of_treatment, not much can be inferred.

In [None]:
# Distribution of positive_lymph_nodes per survival class: density-normalised histogram
# with a KDE overlay.
# sns.histplot(kde=True, stat="density") is used in place of sns.distplot, which is
# deprecated in seaborn and scheduled for removal; alpha keeps the overlapping classes readable.
(
    sns.FacetGrid(data, hue='survival_5yr', height=4)
    .map(sns.histplot, 'positive_lymph_nodes', kde=True, stat="density", alpha=0.4)
    .add_legend()
    .set_axis_labels("positive lymph nodes", "Density")
    .fig.suptitle('Distribution Plot for Positive Lymph Nodes')
)
plt.show()
# Looking at the distribution plot for positive_lymph_nodes, around 55% of the patients
# who survived have a positive_lymph_nodes value less than or equal to 3.

In [None]:
# Looking at the dist plots above, not much can be inferred from the distribution plot of any single variable

## Bivariate analysis

In [None]:
# Let's first plot a 2-D scatter plot: age against year_of_treatment, coloured by survival class.
age_year_grid = sns.FacetGrid(data, hue='survival_5yr', height=4)
age_year_grid.map(plt.scatter, "age", "year_of_treatment")
age_year_grid.add_legend()
plt.title('Scatter Plot for year_of_treatment and age', y=1.05, fontsize=12)
plt.show()
# Clearly, these two variables alone do not give us a decision boundary.

In [None]:
# 2-D scatter plot of age against positive_lymph_nodes, coloured by survival class.
age_nodes_grid = sns.FacetGrid(data, hue='survival_5yr', height=4)
age_nodes_grid.map(plt.scatter, "age", "positive_lymph_nodes")
age_nodes_grid.add_legend()
plt.title('Scatter Plot positive_lymph_nodes and age', y=1.05, fontsize=12)
plt.show()
# Clearly, these two variables alone do not give us a decision boundary.

In [None]:
# 2-D scatter plot of year_of_treatment against positive_lymph_nodes, coloured by survival class.
year_nodes_grid = sns.FacetGrid(data, hue='survival_5yr', height=4)
year_nodes_grid.map(plt.scatter, "year_of_treatment", "positive_lymph_nodes")
year_nodes_grid.add_legend()
plt.title('Scatter Plot for positive_lymph_nodes and year_of_treatment', y=1.05, fontsize=12)
plt.show()
# Clearly, these two variables alone do not give us a decision boundary.

## Multi-variate analysis

In [None]:
# Now let's take a look at the pair plot of all numeric features, coloured by survival class.
sns.set_style("whitegrid")
pair_grid = sns.pairplot(data, hue="survival_5yr", height=3)
# Title the figure as a whole instead of positioning an axes title with hand-tuned offsets;
# y slightly above 1 keeps the title clear of the top row of subplots.
pair_grid.fig.suptitle('Pair Plot for Existing Features', y=1.02, fontsize=12)
plt.show()
# Again, there is no clear decision boundary in this plot, but we can somewhat draw a
# non-linear decision boundary based on the age and positive_lymph_nodes variables.

## Creating CDF, PDF

In [None]:
# Split the data by outcome: patients labelled "yes" (survived) and "no" (did not survive).
df_yes = data.loc[data['survival_5yr'].eq('yes')]
df_no = data.loc[data['survival_5yr'].eq('no')]

In [None]:
# Plot the PDF and CDF of age for each outcome group ("yes" first, then "no"),
# both derived from a 10-bin histogram of that group.
for subset in (df_yes, df_no):
    counts, bin_edges = np.histogram(subset['age'], bins=10, density=True)
    pdf = counts / sum(counts)  # rescale the bin heights so they sum to 1
    cdf = np.cumsum(pdf)        # running total of the PDF
    # Both curves are drawn against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf)

plt.title("pdf and cdf for age")
plt.xlabel("age")
plt.ylabel("% of person's")
plt.legend(['pdf for yes', 'cdf for yes', 'pdf for no', 'cdf for no'])
plt.show()
# Based on the CDF graph below, we can roughly say that younger patients (below around
# 48 years of age) are more likely to survive.

In [None]:
# Plot the PDF and CDF of year_of_treatment for each outcome group ("yes" first, then "no"),
# both derived from a 10-bin histogram of that group.
for subset in (df_yes, df_no):
    counts, bin_edges = np.histogram(subset['year_of_treatment'], bins=10, density=True)
    pdf = counts / sum(counts)  # rescale the bin heights so they sum to 1
    cdf = np.cumsum(pdf)        # running total of the PDF
    # Both curves are drawn against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf)

plt.title("pdf and cdf for year_of_treatment")
plt.xlabel("year_of_treatment")
plt.ylabel("% of person's")
plt.legend(['pdf for yes', 'cdf for yes', 'pdf for no', 'cdf for no'])
plt.show()
# We are not getting any good insight from this graph.

In [None]:
# Plot the PDF and CDF of positive_lymph_nodes for each outcome group ("yes" first, then "no"),
# both derived from a 10-bin histogram of that group.
for subset in (df_yes, df_no):
    counts, bin_edges = np.histogram(subset['positive_lymph_nodes'], bins=10, density=True)
    pdf = counts / sum(counts)  # rescale the bin heights so they sum to 1
    cdf = np.cumsum(pdf)        # running total of the PDF
    # Both curves are drawn against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf)
    plt.plot(bin_edges[1:], cdf)

plt.title("pdf and cdf for positive_lymph_nodes")
plt.xlabel("positive_lymph_nodes")
plt.ylabel("% of person's")
plt.legend(['pdf for yes', 'cdf for yes', 'pdf for no', 'cdf for no'])
plt.show()
# A lower value of positive_lymph_nodes clearly makes survival more probable.

## Box-plot and Violin plot

In [None]:
# Let's take a look using box plots, starting with age per survival class.
age_box_ax = sns.boxplot(data=data, x='survival_5yr', y='age')
age_box_ax.set_title('Box Plot for Age wrt Survival_5yr')
plt.show()
# Not a stark difference, but a lower age makes survival comparatively more probable.

In [None]:
# Box plot of year_of_treatment per survival class.
year_box_ax = sns.boxplot(data=data, x='survival_5yr', y='year_of_treatment')
year_box_ax.set_title('Box Plot for year_of_treatment wrt Survival_5yr')
plt.show()
# The later the year in which a patient was treated, the comparatively more probable they are to survive.

In [None]:
# Box plot of positive_lymph_nodes per survival class.
nodes_box_ax = sns.boxplot(data=data, x='survival_5yr', y='positive_lymph_nodes')
nodes_box_ax.set_title('Box Plot for positive_lymph_nodes wrt Survival_5yr')
plt.show()
# 75% of the patients who survived have fewer than around 3 positive_lymph_nodes.
# We can also roughly say that patients with fewer than around 3 positive_lymph_nodes generally survive.

In [None]:
# Let's take another look using violin plots, starting with age per survival class.
# No `size` argument is passed: it is not a violinplot option, and seaborn versions that
# forward extra keyword arguments to matplotlib are expected to reject it.
age_violin_ax = sns.violinplot(x='survival_5yr', y='age', data=data)
age_violin_ax.set_title('Violin Plot for Age wrt Survival_5yr')
plt.show()

In [None]:
# Violin plot of year_of_treatment per survival class.
# No `size` argument is passed: it is not a violinplot option, and seaborn versions that
# forward extra keyword arguments to matplotlib are expected to reject it.
year_violin_ax = sns.violinplot(x='survival_5yr', y='year_of_treatment', data=data)
year_violin_ax.set_title('Violin Plot for Year_of_treatment wrt Survival_5yr')
plt.show()

In [None]:
# Violin plot of positive_lymph_nodes per survival class.
# No `size` argument is passed: it is not a violinplot option, and seaborn versions that
# forward extra keyword arguments to matplotlib are expected to reject it.
nodes_violin_ax = sns.violinplot(x='survival_5yr', y='positive_lymph_nodes', data=data)
nodes_violin_ax.set_title('Violin Plot for positive_lymph_nodes wrt Survival_5yr')
plt.show()

## Contour plot

In [None]:
# KDE contour plot of age against positive_lymph_nodes, split by survival class.
joint_grid = sns.jointplot(
    x='age', y='positive_lymph_nodes', data=data, hue='survival_5yr', kind='kde'
)
# Title the figure as a whole instead of positioning an axes title with hand-tuned offsets;
# y slightly above 1 keeps the title clear of the top marginal plot.
joint_grid.fig.suptitle('Contour Plot for Positive_lymph_nodes and Age', y=1.02)
plt.show()
# The only thing I can infer is that a lower value of positive_lymph_nodes makes survival more probable.

## Final conclusions
-> Not much can be inferred based on the present data

-> Existing features are not helping us to linearly separate the data

-> The feature from which we can draw the most inference is positive_lymph_nodes: patients with fewer than around 3 positive lymph nodes are comparatively more likely to survive

-> We will have to apply a more sophisticated statistical technique to classify the patients; simple if-else logic will not be enough