In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation and pandas indexing
# warnings raised by later cells; consider a narrower filter.
warnings.filterwarnings("ignore")

In [1]:
# Load the Haberman survival CSV into a DataFrame; later cells access its
# columns age, year, nodes and status.
df=pd.read_csv('../input/haberman.csv/haberman.csv')

In [1]:
# Preview the first rows of the loaded frame.
df.head()

### **High Level Statistics**

In [1]:
# (rows, columns) of the loaded frame.
df.shape

* There are 305 data points and 4 columns (age, year, nodes, status) for each data point; `status` is the class label and the other three are features.

In [1]:
# Number of rows per value of the status column.
df['status'].value_counts()

* There are 2 classes 

    - 1 = Survived (Alive after 5 Years)
    
    - 2 = Not Survived (Died within 5 Years)
    
* This is an imbalanced dataset, since more data points belong to class 1 than to class 2.

In [1]:
# Summary statistics (count, mean, std, min, quartiles, max) of the age column.
df['age'].describe()

* You can see that the minimum age of the patients is 30, the maximum age is 78, and the average age is around 52.

In [1]:
# Number of rows per value of the year column.
df['year'].value_counts()

* There were 35 tested patients in the year 1958 and only 11 in the year 1969.

In [1]:
# Summary statistics (count, mean, std, min, quartiles, max) of the nodes column.
df['nodes'].describe()

* You can see that the minimum number of positive nodes is 0 and the maximum number of positive nodes is 52.

* You can also see that 25 percent of the patients have 0 positive nodes.

* 50 percent of the patients have $\le 1$ positive nodes.

* The average number of positive nodes per patient is about 4.

### Univariate Analysis (PDF, CDF, Box Plot, Violin Plot)

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

## Univariate analysis on Age

In [1]:
# Univariate view of patient age: empirical PDF/CDF from a 10-bin histogram,
# followed by the per-class (status) distribution.

# Set the style before any figure is created so that both plots use it even on
# a fresh kernel (styles only affect axes created after the call).
sns.set_style('whitegrid')

counts, bin_edges = np.histogram(df.age, bins=10, density=False)
print(bin_edges)
pdf = counts / counts.sum()  # fraction of rows falling in each bin
print(pdf)
cdf = np.cumsum(pdf)
print(cdf)

# Plot against the right bin edges so each CDF point reads as
# "fraction of patients with age <= edge".
fig, ax = plt.subplots()
ax.plot(bin_edges[1:], pdf, label='pdf')
ax.plot(bin_edges[1:], cdf, label='cdf')
ax.set(xlabel='age', ylabel='fraction of patients', title='PDF and CDF of age')
ax.legend()

# Per-class distribution of age, one colour per status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here for compatibility with the seaborn version this notebook was written for.
sns.FacetGrid(df, hue='status', height=5).map(sns.distplot, 'age').add_legend()
plt.show()

* You can clearly see that all the patients with age $\le 35$ survived.

* And the majority of the patients with age $\le 40$ survived.

* Patients with $40 \le \text{age} \le 60$ are at high risk and have roughly equal chances of survival and non-survival.

## Univariate analysis on Year

In [1]:
# Univariate view of the year column: empirical PDF/CDF from a 10-bin
# histogram, followed by the per-class (status) distribution.

# Set the style before any figure is created so that both plots use it even on
# a fresh kernel (styles only affect axes created after the call).
sns.set_style('whitegrid')

counts, bin_edges = np.histogram(df.year, bins=10, density=False)
print(bin_edges)
pdf = counts / counts.sum()  # fraction of rows falling in each bin
print(pdf)
cdf = np.cumsum(pdf)
print(cdf)

# Plot against the right bin edges so each CDF point reads as
# "fraction of patients with year <= edge".
fig, ax = plt.subplots()
ax.plot(bin_edges[1:], pdf, label='pdf')
ax.plot(bin_edges[1:], cdf, label='cdf')
ax.set(xlabel='year', ylabel='fraction of patients', title='PDF and CDF of year')
ax.legend()

# Per-class distribution of year, one colour per status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here for compatibility with the seaborn version this notebook was written for.
sns.FacetGrid(df, hue='status', height=5).map(sns.distplot, 'year').add_legend()
plt.show()

* You can see that more tests were performed between the years 58 and 60, and the majority of the patients tested in this period did not survive.

* During the years 60 to 62 majority of the tested people survived.

## Univariate analysis on Nodes

In [1]:
# Univariate view of the number of positive nodes: empirical PDF/CDF from a
# 10-bin histogram, the per-class (status) distribution, and box/violin plots
# of nodes grouped by status.

# Set the style before any figure is created so that every plot uses it even
# on a fresh kernel (styles only affect axes created after the call).
sns.set_style('whitegrid')

counts, bin_edges = np.histogram(df.nodes, bins=10, density=False)
print(bin_edges)
pdf = counts / counts.sum()  # fraction of rows falling in each bin
print(pdf)
cdf = np.cumsum(pdf)
print(cdf)

# Plot against the right bin edges so each CDF point reads as
# "fraction of patients with nodes <= edge".
fig, ax = plt.subplots()
ax.plot(bin_edges[1:], pdf, label='pdf')
ax.plot(bin_edges[1:], cdf, label='cdf')
ax.set(xlabel='nodes', ylabel='fraction of patients', title='PDF and CDF of nodes')
ax.legend()

# Per-class distribution of nodes, one colour per status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here for compatibility with the seaborn version this notebook was written for.
sns.FacetGrid(df, hue='status', height=5).map(sns.distplot, 'nodes').add_legend()
plt.show()

# Use explicit figures/axes rather than hard-coded figure numbers, so each
# plot is guaranteed a fresh canvas regardless of which figures already exist.
fig_box, ax_box = plt.subplots()
sns.boxplot(data=df, y='nodes', x='status', ax=ax_box)
ax_box.set_title('Positive nodes by status (box plot)')

# The former `size=5` argument was dropped: it is not a violinplot parameter
# (older seaborn ignores it, newer seaborn forwards unknown kwargs to
# matplotlib, which rejects it).
fig_violin, ax_violin = plt.subplots()
sns.violinplot(data=df, y='nodes', x='status', ax=ax_violin)
ax_violin.set_title('Positive nodes by status (violin plot)')
plt.show()

* From the box plot you can see that most of the patients who survived have 0 or 1 positive nodes (nodes <= 1).

* And if the number of nodes is greater than 4, the patient is less likely to survive than not.

In [1]:
# Count the rows with status == 1 among patients with at most one positive node.
# Both conditions are combined into a single boolean mask: chaining
# df[mask_a][mask_b] applies a full-length mask to an already-filtered frame,
# which forces pandas to reindex the mask and emits a UserWarning.
nodes_1 = len(df[(df['status'] == 1) & (df.nodes <= 1)])

# Denominator: all patients with at most one positive node.
total = len(df[df.nodes <= 1])

def survival_percentage(nodes_n, total):
    """Express ``nodes_n`` as a percentage of ``total``.

    Args:
        nodes_n: numerator count (callers pass the number of status == 1 rows
            in a group).
        total: denominator count (callers pass the size of that group).

    Returns:
        ``(nodes_n / total) * 100``.
    """
    fraction = nodes_n / total
    return fraction * 100
# Report the percentage of status == 1 rows among patients with nodes <= 1.
pct_nodes_le_1 = survival_percentage(nodes_1, total)
print(f'for nodes<=1 survival_percentage is {pct_nodes_le_1}')

In [1]:
# Percentage of status == 1 rows among patients with no positive nodes.
# Both conditions are combined into a single boolean mask: chaining
# df[mask_a][mask_b] applies a full-length mask to an already-filtered frame,
# which forces pandas to reindex the mask and emits a UserWarning.
nodes_0 = len(df[(df['status'] == 1) & (df.nodes == 0)])
total = len(df[df.nodes == 0])  # all patients with no positive nodes
print('for nodes=0 survival_percentage is {}'.format(survival_percentage(nodes_0, total)))

In [1]:
# Percentage of status == 1 rows among patients with more than four positive nodes.
# Both conditions are combined into a single boolean mask: chaining
# df[mask_a][mask_b] applies a full-length mask to an already-filtered frame,
# which forces pandas to reindex the mask and emits a UserWarning.
nodes_4 = len(df[(df['status'] == 1) & (df.nodes > 4)])
total = len(df[df.nodes > 4])  # all patients with more than four positive nodes
print('for nodes>4 survival_percentage is {}'.format(survival_percentage(nodes_4, total)))

# Pair Plots

In [1]:
# Convert the status column to strings; presumably so the pair plot in the
# next cell treats it as a categorical hue rather than a numeric variable.
# NOTE(review): this overwrites df.status in place, so earlier cells that
# compare status == 1 will no longer match if they are re-run after this cell.
df.status=df.status.astype('str')

In [1]:
# Pairwise scatter plots of the columns of df, coloured by status.
sns.set_style('whitegrid')
sns.pairplot(df,hue='status')
plt.show()

* Pair plots (pairwise combinations of the features) do not give any additional information beyond the univariate plots seen earlier.