In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Capture the raw `ls` output for the input directory, then print it as text.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# **Haberman's Survival Data Set**

Relevant Information: The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

## **About the Dataset**

Title: Haberman's Survival Data

Attribute Information:

Age of patient at time of operation (numerical)

Patient's year of operation (year - 1900, numerical)

Number of positive axillary nodes detected (numerical)

Survival status (class attribute) 
    1 = the patient survived 5 years or longer 
    2 = the patient died within 5 years

Missing Attribute Values: None

Data may be found in :
                                    http://mlr.cs.umass.edu/ml/machine-learning-databases/haberman/haberman.data
                                                                            or
                                    https://www.kaggle.com/gilsousa/habermans-survival-data-set

# **1 Loading the data from the haberman.csv file**

In [None]:
# Location of the Haberman CSV inside the Kaggle input directory.
url = "../input/haberman.csv"
# The raw file has no header row, so the column names are supplied explicitly.
# header=None makes it explicit that the first line of the file is data.
names = ['Age', 'Year operation', 'Axillary nodes detected', 'Survival status']
dataset = pd.read_csv(url, header=None, names=names)

# **2 Data preparation**

**Printing the shape of the dataset**

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
dataset.shape

**Printing the column names of the dataset**

In [None]:
# Column labels of the DataFrame (the `names` list passed to read_csv).
dataset.columns

**Printing the top 10 rows of the dataset**

In [None]:
# Preview the first ten records.
dataset.head(n=10)

### **Describing the dataset in terms of count, mean, standard deviation, min value, max value, 25% value, 50% value and 75% value for each feature (column) of the dataset**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
dataset.describe()

In [None]:
# Count how many patients fall into each survival class.
survival_status = dataset['Survival status']
survival_status.value_counts()

## **Observations:**
* Number of Instances: 306
* Number of Attributes: 4 (including the class attribute)
* The year operation gives the last two digits of the year for each patient.
* The dataset is classified into two classes (Survived-1 and not survived-2).
* 225 patients belong to class 1 (the patient survived 5 years or longer).
* 81 patients belong to class 2 (the patient died within 5 years).

# **3 Data Exploration** 

Data exploration is concerned with building a deeper understanding of your data. You try to understand how variables interact with each other, the distribution of the data, and whether there are outliers. To achieve this you mainly use descriptive statistics, visual techniques, plots and simple modeling.

In [None]:
import seaborn as sns
import matplotlib.pyplot as pl

# **Univariate Analysis**

In [None]:
# Line plot of every column against the row index (the default plot kind, spelled out).
dataset.plot(kind="line")

In [None]:
# One histogram per numeric column of the DataFrame.
dataset.hist()

# **Bivariate Analysis**

In [None]:
# Switch seaborn to the dark-grid theme, then scatter age against the
# number of positive axillary nodes using the pandas plot accessor.
sns.set_style("darkgrid")
dataset.plot.scatter(x="Age", y="Axillary nodes detected")

In [None]:
# Scatter of age vs. positive axillary nodes, coloured by survival status.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9
# (the old spelling is deprecated). Parentheses replace the backslash
# continuation; the trailing `;` suppresses the grid's repr in the output.
(
    sns.FacetGrid(dataset, hue="Survival status", height=5)
    .map(pl.scatter, "Age", "Axillary nodes detected")
    .add_legend()
);

## **Pair-Plots**

Pair plots help us to see the correlations between the attributes in the data.

In [None]:
# Pair plot of the three numeric features, coloured by the 'Age' attribute.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
sns.pairplot(
    dataset,
    hue="Age",
    height=3,
    diag_kind="kde",
    vars=['Age', 'Year operation', 'Axillary nodes detected'],
)

In [None]:
# Pair plot of the three numeric features, coloured by the 'Year operation' attribute.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
sns.pairplot(
    dataset,
    hue="Year operation",
    height=3,
    diag_kind="kde",
    vars=['Age', 'Year operation', 'Axillary nodes detected'],
)

In [None]:
# Pair plot of the three numeric features, coloured by the 'Axillary nodes detected' attribute.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
sns.pairplot(
    dataset,
    hue="Axillary nodes detected",
    height=3,
    diag_kind="kde",
    vars=['Age', 'Year operation', 'Axillary nodes detected'],
)

In [None]:
# Pair plot of the three numeric features, coloured by the 'Survival status' class.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
sns.pairplot(
    dataset,
    hue="Survival status",
    height=3,
    diag_kind="kde",
    vars=['Age', 'Year operation', 'Axillary nodes detected'],
)

## **Observations**

 From the above pair plots:
* In the first three pair plots, where the hue is **Age**, **Year operation** and **Axillary nodes detected** respectively, we find no useful insights.
* But in the fourth pair plot, coloured by **Survival status**, the distribution of **Age** looks approximately normal, so we look at that distribution more closely below.

In [None]:
# Age distribution for each survival class, overlaid on one set of axes.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# migrating to histplot/kdeplot once the installed version is confirmed.
sns.FacetGrid(dataset, hue="Survival status", height=6).map(sns.distplot, "Age").add_legend()

In [None]:
# Box plot of positive axillary nodes, one box per survival class.
sns.boxplot(
    data=dataset,
    x='Survival status',
    y='Axillary nodes detected',
)

In [None]:
# Violin plot of positive axillary nodes, one violin per survival class.
sns.violinplot(
    data=dataset,
    x='Survival status',
    y='Axillary nodes detected',
)

Since the distributions look approximately normal for both *survived* and *not survived*, the mean is an appropriate summary measure of the data.

In [None]:
# Select the ages of each survival class (1 = survived, 2 = not survived)
# and report the mean age rounded to the nearest whole year.
survived_ages = dataset.loc[dataset['Survival status'] == 1, 'Age']
not_survived_ages = dataset.loc[dataset['Survival status'] == 2, 'Age']
print("Mean age of patients survived:", round(survived_ages.mean()))
print("Mean age of patients not survived:", round(not_survived_ages.mean()))

In [None]:
# Rows for patients in class 1 (survived), followed by their summary statistics.
survived_mask = dataset['Survival status'] == 1
sur = dataset[survived_mask]
sur.describe()

In [None]:
# Rows for patients in class 2 (not survived), followed by their summary statistics.
not_survived_mask = dataset['Survival status'] == 2
not_sur = dataset[not_survived_mask]
not_sur.describe()

## **Observations**

* Patients who did **not survive** tend to have a higher average number of **Axillary nodes detected**, and their distribution is more spread out than that of patients who **survived**.
* ***Axillary nodes detected*** is a useful feature for identifying the survival status, since the two distributions differ markedly from each other.
* The mean age of the patients who did **not survive** is **54** years, and of those who **survived** is **52** years.
* In the not-survived class there are more patients with a year of operation around **1965** than around **1958**, which suggests a bimodal distribution.

# **Multivariate Analysis**


## **Parallel coordinates**

In [None]:
from pandas.plotting import parallel_coordinates
# Parallel-coordinates plot of every feature, one line per patient,
# grouped by the survival class column and drawn in green / yellow.
parallel_coordinates(
    dataset,
    class_column="Survival status",
    color=['g', 'y'],
)