In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objective** :
- To predict whether or not a patient will survive for 5 years or more, based on the patient's age, year of treatment, and number of positive lymph nodes.

In [None]:
# The file is read with header=None, so the column names are supplied explicitly.
column_names = ['age', 'operation_year', 'positive_lymph_nodes', 'survival_status_after_5_years']
heberman = pd.read_csv(
    "../input/habermans-survival-data-set/haberman.csv",
    header=None,
    names=column_names,
)
heberman.head()

In [None]:
# Show the (rows, columns) shape of the loaded DataFrame.
print(heberman.shape)

In [None]:
# Tally how many patients fall under each raw survival-status code.
status_codes = heberman['survival_status_after_5_years']
status_codes.value_counts()

In [None]:
# Class proportions of the target (the last column of the frame), addressed by name.
class_fractions = heberman['survival_status_after_5_years'].value_counts(normalize=True)
print(class_fractions)

- We can observe that our target variable is imbalanced: about 73% (225/306) of the values are 1 (later mapped to 'yes') and only about 27% (81/306) are 2 (later mapped to 'no').
- Our dataset contains a small number of records(306).

In [None]:
# List the distinct raw values stored in the target column.
heberman['survival_status_after_5_years'].unique()

In [None]:
# Summarise the column dtypes and non-null counts.
heberman.info()

- We can see that there are no missing values in the dataset, so there is no need for data imputation.
- Since this is a classification problem, we convert the 'survival_status_after_5_years' column to a categorical datatype.
- Let's map the values of the 'survival_status_after_5_years' column to 'yes' (survived) and 'no' (did not survive).

In [None]:
# Relabel the raw class codes: 1 -> 'yes' (survived), 2 -> 'no' (did not survive).
# `replace` leaves values that are not keys of the mapping untouched, so running this
# cell a second time is harmless; `map` would turn the already-relabelled values into NaN.
heberman['survival_status_after_5_years']= heberman['survival_status_after_5_years'].replace({1: 'yes', 2: 'no'})

In [None]:
# Store the target as a pandas categorical, then preview the first ten rows.
target_column = 'survival_status_after_5_years'
heberman[target_column] = heberman[target_column].astype('category')
heberman.head(10)

# Let's get a deeper insight:

In [None]:
# Summary statistics for the numeric columns; the observations below are read from this table.
heberman.describe()

**Observations:**
- Patients' ages vary from 30 to 83.
- Nearly 25% of patients have no positive lymph nodes.
- The maximum number of positive lymph nodes observed is 52.
- Def: If lymph nodes have some cancer cells in them, they are called positive.
- 75% of patients have fewer than 5 positive lymph nodes.

In [None]:
# Select seaborn's 'whitegrid' style for the plots drawn after this cell.
sns.set_style('whitegrid')

# 1- D scatter plot:

In [None]:
# 1-D scatter of patient age, one series per survival class.
one= heberman.loc[heberman['survival_status_after_5_years']== 'yes']
two= heberman.loc[heberman['survival_status_after_5_years']== 'no']
# Every point is drawn at y = 0 so that only the age axis carries information.
plt.plot(one['age'], np.zeros_like(one['age']), 'o', label= "survival_status_after_5_years, yes")
plt.plot(two['age'], np.zeros_like(two['age']), 'o', label= "survival_status_after_5_years, no")
plt.xlabel('age')
# The labels passed above are only rendered once a legend is requested.
plt.legend()
plt.show()


**Observation:**
- Patients aged between 41 and 70 tend to survive less often.

# 2-D scatter plot:

In [None]:
# Age vs. positive lymph nodes, coloured by survival status.
# `height` is the current name of the facet-size keyword (formerly `size`),
# matching the other FacetGrid cells in this notebook.
sns.FacetGrid(heberman, hue= 'survival_status_after_5_years', height= 6).map(plt.scatter,'age','positive_lymph_nodes').add_legend()

In [None]:
# Operation year vs. positive lymph nodes, coloured by survival status.
year_nodes_grid = sns.FacetGrid(heberman, hue='survival_status_after_5_years', height=8)
year_nodes_grid.map(plt.scatter, 'operation_year', 'positive_lymph_nodes').add_legend()

# Univariate Analysis:(Histogram, PDF, CDF)
- Histograms and Probability Density Functions (PDF) using KDE
- pdf stands for probability density function.
- cdf stands for cumulative distribution function. From it we can read off, for example, what percentage of patients have fewer than 5 positive lymph nodes.


In [None]:
# Per-class distribution plot of patient age.
# NOTE(review): recent seaborn releases flag distplot as deprecated - confirm before upgrading.
age_grid = sns.FacetGrid(heberman, hue='survival_status_after_5_years', height=5)
age_grid.map(sns.distplot, 'age').add_legend()

In [None]:
# Per-class distribution plot of the operation year.
# NOTE(review): recent seaborn releases flag distplot as deprecated - confirm before upgrading.
year_grid = sns.FacetGrid(heberman, hue='survival_status_after_5_years', height=5)
year_grid.map(sns.distplot, 'operation_year').add_legend()

In [None]:
# Per-class distribution plot of the positive-lymph-node count.
# NOTE(review): recent seaborn releases flag distplot as deprecated - confirm before upgrading.
nodes_grid = sns.FacetGrid(heberman, hue='survival_status_after_5_years', height=5)
nodes_grid.map(sns.distplot, 'positive_lymph_nodes').add_legend()

In [None]:
# PDF & CDF of patient age, estimated from a 10-bin histogram.
counts, bin_edges= np.histogram(heberman['age'], bins= 10, density= True)
# Dividing by the total turns the bin heights into fractions that sum to 1.
pdf= counts/(sum(counts))
print(pdf)
print(bin_edges)
# Running total of the per-bin fractions.
cdf= np.cumsum(pdf)
# Both curves are plotted against the right-hand edge of each bin; the labels
# and legend make them distinguishable.
plt.plot(bin_edges[1:], pdf, label= 'PDF')
plt.plot(bin_edges[1:], cdf, label= 'CDF')
plt.xlabel('age')
plt.legend()
plt.show()


In [None]:
# PDF & CDF of the operation year, estimated from a 10-bin histogram.
counts, bin_edges= np.histogram(heberman['operation_year'], bins= 10, density= True)
# Dividing by the total turns the bin heights into fractions that sum to 1.
pdf= counts/(sum(counts))
print(pdf)
print(bin_edges)
# Running total of the per-bin fractions.
cdf= np.cumsum(pdf)
# Both curves are plotted against the right-hand edge of each bin; the labels
# and legend make them distinguishable.
plt.plot(bin_edges[1:], pdf, label= 'PDF')
plt.plot(bin_edges[1:], cdf, label= 'CDF')
plt.xlabel('operation_year')
plt.legend()
plt.show()


In [None]:
# PDF & CDF of the positive-lymph-node count, estimated from a 10-bin histogram.
counts, bin_edges= np.histogram(heberman['positive_lymph_nodes'], bins= 10, density= True)
# Dividing by the total turns the bin heights into fractions that sum to 1.
pdf= counts/(sum(counts))
print(pdf)
print(bin_edges)
# Running total of the per-bin fractions.
cdf= np.cumsum(pdf)
# Both curves are plotted against the right-hand edge of each bin; the labels
# and legend make them distinguishable.
plt.plot(bin_edges[1:], pdf, label= 'PDF')
plt.plot(bin_edges[1:], cdf, label= 'CDF')
plt.xlabel('positive_lymph_nodes')
plt.legend()
plt.show()

# Pair plot:(Multivariate analysis)

In [None]:
# Pairwise scatter plot (pair plot): one panel for every pair of features,
# coloured by survival status.
# Disadvantages:
## Becomes impractical when the number of features is high.
## Cannot visualize higher-dimensional patterns in 3-D and 4-D.
# Only 2-D patterns can be viewed.
sns.pairplot(heberman, hue= 'survival_status_after_5_years', height= 3)
plt.show()

**Observation:**
- We can see better separation between the two classes using operation_year and positive_lymph_nodes.

# Mean, Variance and Std-dev:

In [None]:
# Central tendency and spread of the positive-lymph-node counts.
# NOTE(review): np.std / np.var are called with their defaults - presumably
# population (ddof=0) statistics; confirm if sample statistics were intended.
node_counts = heberman['positive_lymph_nodes']
summary_statistics = [
    ("Means:", np.mean),
    ("\nStd dev.:", np.std),
    ("\nVariance:", np.var),
]
for heading, statistic in summary_statistics:
    print(heading)
    print(statistic(node_counts))

# Median, Percentile, Quantile, IQR, MAD

In [None]:
from statsmodels import robust

# Every statistic in this cell describes the positive-lymph-node counts.
nodes = heberman['positive_lymph_nodes']

print("Median:")
print(np.median(nodes))

# 0th, 25th, 50th, 75th and 100th percentiles
print('\nQuantile:')
print(np.percentile(nodes, np.arange(0, 101, 25)))

# 80th percentile
print("\nPercentile:")
print(np.percentile(nodes, 80))

print("\nMedian absolute deviation:")
print(robust.mad(nodes))

**Observations:**
- We can see that the positive lymph node counts are heavily right-skewed: most of their range lies in the 4th quartile (above the 75th percentile).

# Box plot

In [None]:
# Box-plot anatomy:
#   Q1 = 25th percentile
#   Q2 = 50th percentile (median)
#   Q3 = 75th percentile
#   Q4 = 100th percentile
#   Inter-quartile range (IQR) = Q3 - Q1
#   Whisker length = 1.5 * IQR
sns.boxplot(data=heberman, x='survival_status_after_5_years', y='positive_lymph_nodes')

In [None]:
# Box plot of the operation year for each survival class.
sns.boxplot(data=heberman, x='survival_status_after_5_years', y='operation_year')

In [None]:
# Box plot of patient age for each survival class.
sns.boxplot(data=heberman, x='survival_status_after_5_years', y='age')

**Observation:**
- Patients who have fewer than 5 positive_lymph_nodes tend to survive.
- Almost 80% of the patients have 7 or fewer positive_lymph_nodes.

# Violin plots:

In [None]:
# A violin plot combines the benefits of the previous two plots
# and simplifies them.

# Denser regions of the data are fatter, and sparser ones thinner
# in a violin plot.

# `violinplot` takes no `size` argument, so none is passed; call
# plt.figure(figsize=...) beforehand if a different figure size is wanted.
sns.violinplot(x="survival_status_after_5_years", y="positive_lymph_nodes", data=heberman)
plt.show()

In [None]:
# Violin plot of patient age for each survival class.
# `violinplot` takes no `size` argument, so none is passed; call
# plt.figure(figsize=...) beforehand if a different figure size is wanted.
sns.violinplot(x="survival_status_after_5_years", y="age", data=heberman)
plt.show()

In [None]:
# Violin plot of the operation year for each survival class.
# `violinplot` takes no `size` argument, so none is passed; call
# plt.figure(figsize=...) beforehand if a different figure size is wanted.
sns.violinplot(x="survival_status_after_5_years", y="operation_year", data=heberman)
plt.show()

**Observation:**
- Patients treated after 1966 have a slightly higher chance of surviving than the rest. Patients treated before 1958 have a slightly lower chance of surviving than the rest.