# Data Preparation

The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

Attribute Information
    
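    Age = Age of the patient at the time of the operation
    Year_of_Operation = Year in which the operation was performed
    Auxil_Nodes_det = Number of positive axillary nodes detected
    Survival_period = Survival status class label (1 = the patient survived 5 years or longer, 2 = the patient died within 5 years)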

In [None]:
import os
# Show the current working directory; relative paths such as "haberman.csv"
# are resolved against it.
os.getcwd()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Read the dataset into a DataFrame. The CSV is read with header=None, so the
# column names are supplied explicitly.
column_names = ['Age', 'Year_of_Operation', 'Auxil_Nodes_det', 'Survival_period']
cancer_ds = pd.read_csv("haberman.csv", header=None, names=column_names)


In [None]:
# Dataset dimensions, printed as a (rows, columns) tuple.
n_rows, n_cols = cancer_ds.shape
print((n_rows, n_cols))

# How often each distinct node count occurs (displayed as the cell output).
cancer_ds["Auxil_Nodes_det"].value_counts()

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
print(cancer_ds.head())

In [None]:
# List the column labels assigned when the CSV was loaded.
print(cancer_ds.columns)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
print(cancer_ds.describe())

# Univariate Analysis

# 1. PDF

In [None]:
# Distribution of each of the first three columns (the features), one figure
# per feature, with a separate colour for each Survival_period class.
plt.close()
for feature in cancer_ds.columns[:3]:
    # `height` replaces the `size` keyword, which seaborn deprecated in 0.9
    # and removed in later releases (passing `size` raises TypeError there).
    # NOTE(review): sns.distplot is itself deprecated in newer seaborn
    # releases; it is kept so the rendered plots stay the same - consider
    # migrating to histplot/kdeplot.
    grid = sns.FacetGrid(cancer_ds, hue="Survival_period", height=4)
    grid.map(sns.distplot, feature).add_legend()
    plt.show()

- Patients younger than 20 years have a longer survival period.
- Patients between 45 and 60 years of age survived less than 5 years.

# 2.CDF

In [None]:
# Empirical PDF and CDF of patient age over 10 bins.
count, bin_edges = np.histogram(cancer_ds['Age'], bins=10, density=True)
# Dividing by the total turns the per-bin values into fractions that sum to 1,
# so their running sum is the CDF.
pdf = count / count.sum()
cdf = np.cumsum(pdf)
print("PDF of Age: ", pdf)
print(bin_edges)
# Plot both curves against the right edge of each bin; label them so the
# legend tells the PDF and the CDF apart.
plt.plot(bin_edges[1:], pdf, label="PDF")
plt.plot(bin_edges[1:], cdf, label="CDF")
plt.xlabel("Age")
plt.ylabel("Probability")
plt.legend()
# Render explicitly so the cell shows the figure instead of a Text repr.
plt.show()

# 2-D Scatter Plot

In [None]:
# Age against year of operation for every row, without class colouring.
cancer_ds.plot.scatter(x='Age', y='Year_of_Operation')
plt.show()

In [None]:
# Multi-variate Analysis
# Age against year of operation, coloured by Survival_period class.
sns.set_style("whitegrid")
# `height` replaces the `size` keyword, which seaborn deprecated in 0.9 and
# removed in later releases (passing `size` raises TypeError there).
grid = sns.FacetGrid(cancer_ds, hue="Survival_period", height=4)
grid.map(plt.scatter, "Age", "Year_of_Operation").add_legend()
plt.show()

80% of the people who underwent the operation are between 35 and 70 years of age.

# Pair Plots

In [None]:
# Pairwise scatter plots of the three features, coloured by Survival_period
# class.
plt.close()
sns.set_style("whitegrid")
# `height` (size of each facet) replaces the `size` keyword, which seaborn
# deprecated in 0.9 and removed in later releases.
sns.pairplot(
    cancer_ds,
    hue="Survival_period",
    vars=['Age', 'Year_of_Operation', 'Auxil_Nodes_det'],
    height=5,
)
plt.show()

# Box Plot

In [None]:
# Box plot of each feature grouped by Survival_period class, side by side.
# A 1x3 grid on a wide figure gives every plot enough room; placing them in
# alternate slots of a 1x6 grid on a default-size figure makes them too
# narrow to read.
box_features = ['Age', 'Year_of_Operation', 'Auxil_Nodes_det']
fig, axes = plt.subplots(1, len(box_features), figsize=(15, 5))
for ax, feature in zip(axes, box_features):
    sns.boxplot(x="Survival_period", y=feature, data=cancer_ds, ax=ax)
# Keep axis labels of neighbouring plots from overlapping.
fig.tight_layout()
plt.show()

# Violin Plot

In [None]:
# Violin plot of each feature grouped by Survival_period class, side by side:
# patient age, year of operation, and number of positive axillary nodes.
sns.set_style('darkgrid')
# A 1x3 grid on a wide figure gives every plot enough room; placing them in
# alternate slots of a 1x6 grid on a default-size figure makes them too
# narrow to read.
violin_features = ['Age', 'Year_of_Operation', 'Auxil_Nodes_det']
fig, axes = plt.subplots(1, len(violin_features), figsize=(15, 5))
for ax, feature in zip(axes, violin_features):
    sns.violinplot(x='Survival_period', y=feature, data=cancer_ds, ax=ax)
# Keep axis labels of neighbouring plots from overlapping.
fig.tight_layout()
plt.show()

Patients with more positive lymph nodes have a much lower chance of survival.

# Multivariate Probability density, Contour Plot

In [None]:
# Joint kernel-density (contour) plot of node count against age.
joint_grid = sns.jointplot(data=cancer_ds, x="Auxil_Nodes_det", y="Age", kind="kde")
plt.show()

Patients between 45 and 65 years of age typically have 0 to 2 lymph nodes.

# Final Conclusion

The given dataset is not linearly separable, and it is imbalanced: the classes do not have an equal number of data points.

There is high overlap between the data points of the classes, and hence it is very difficult to classify them.
