# Basic Terminology

# 1.Plotting for Exploratory data analysis (EDA)

# haberman dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load haberman.csv into a pandas DataFrame.
#
# The CSV has no header row. If pandas is left to infer one, the first patient
# record (30, 64, 1, 1) is consumed as column labels ("30", "64", "1", "1.1")
# and silently disappears from the data. Reading with header=None and explicit
# names keeps every record and gives the columns descriptive labels up front.
haberman = pd.read_csv(
    r'../input/haberman.csv',
    header=None,
    names=["Age", "Op_Year", "axil_nodes", "Surv_status"],
)
haberman.head()

In [None]:
# Reference: Rename columns - https://www.kaggle.com/premvardhan/exploratory-data-analysis-haberman-s-survival
# Give the columns descriptive names.
# The keys below are the labels pandas infers when the header-less file is read
# with default settings; any key that is not present is simply left untouched
# by rename().
#
# An equivalent alternative is to assign the complete list of names at once:
#   haberman.columns = ["Age", "Op_Year", "axil_nodes", "Surv_status"]
column_labels = {
    "30": "Age",
    "64": "Op_Year",
    "1": "axil_nodes",
    "1.1": "Surv_status",
}
haberman = haberman.rename(columns=column_labels)

In [None]:
# (Q) How many data points (rows) and features (columns) are there?
dataset_shape = haberman.shape
print(dataset_shape)

In [None]:
# (Q) What are the column names in our dataset?
column_index = haberman.columns
print(column_index)

In [None]:
# Number of records per class label - shows how balanced the target is.
haberman.loc[:, "Surv_status"].value_counts()

In [None]:
# observation: https://www.kaggle.com/gokulkarthik/haberman-s-survival-exploratory-data-analysis
# Print a concise summary of the frame (columns, non-null counts, dtypes);
# the non-null counts are what the missing-value observation below relies on.

haberman.info()


Observations:
There are no missing values in this dataset, so no data imputation is needed.
The datatype of the 'Surv_status' column is integer. It has to be converted to a categorical datatype.
The values of the 'Surv_status' column are not meaningful as they stand. Hence they are mapped to 'yes' (survived after 5 years) and 'no' (did not survive 5 years).

In [None]:
# Print the unique values of the target column.
target_labels = list(haberman['Surv_status'].unique())
print(target_labels)

In [None]:
# Reference: https://www.kaggle.com/gokulkarthik/haberman-s-survival-exploratory-data-analysis
# Make the target column meaningful and categorical in one chained step:
# 1 -> "yes", 2 -> "no", then cast the resulting strings to the category dtype.
status_names = {1: "yes", 2: "no"}
haberman['Surv_status'] = (
    haberman['Surv_status']
    .map(status_names)
    .astype('category')
)
print(haberman.head())

In [None]:
# Re-check the dtypes after converting the target column.
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
haberman.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
haberman.describe()

# 1-D Scatter Plot

In [None]:
#reference: http://zerosnones.net/histogram-read-understand-build/

# 1-D scatter plot: every patient's Age is drawn on a single horizontal line
# (y = 0), one colour per survival class.
for status in ("yes", "no"):
    ages = haberman.loc[haberman["Surv_status"] == status, "Age"]
    plt.plot(ages, np.zeros_like(ages), 'o', label=status)
plt.title("1-D scatter plot for age")
plt.xlabel("Age")
# Use a proper legend title instead of gluing "Surv_status\n" onto the first
# series label, which made the heading part of the "yes" entry.
plt.legend(title="Surv_status")
plt.show()

Observation: In a 1-D scatter plot we can see the range (endpoints) of a feature, but it does not tell us how many data points fall in each interval, so it gives us incomplete information. However, we can recognise that many of the people who died were between 41 and 70 years old.

# 2-D Scatter Plot

In [None]:
# 2-D scatter plot of Age against operation year (no class colouring yet).
haberman.plot.scatter(x='Age', y='Op_Year')
plt.title("2-D scatter plot for Age vs Op_Year")
plt.show()

# age vs axils plots

In [None]:
# 2-D scatter plot with colour-coding by survival status.
sns.set_style("whitegrid")
nodes_grid = sns.FacetGrid(haberman, hue="Surv_status", height=4)
nodes_grid.map(plt.scatter, "Age", "axil_nodes")
nodes_grid.add_legend()
plt.title("2-D scatter plot for Age vs axil_nodes")
plt.show()

Observation: In the above 2-D scatter plot the class labels (i.e. whether a person died or survived) are not linearly separable.
People with 0-5 axillary nodes both survived and died, but among them the share who died is smaller than the share who survived.

# age vs operation year plots

In [None]:
# 2-D scatter plot with colour-coding by survival status.
# 'sns' is the seaborn module.
sns.set_style("whitegrid")
year_grid = sns.FacetGrid(haberman, hue="Surv_status", height=4)
year_grid.map(plt.scatter, "Age", "Op_Year")
year_grid.add_legend()
plt.title("2-D scatter plot for Age vs Op_Year")
plt.show()

Observation: Here the "yes" and "no" points overlap heavily when plotted on Age and Op_Year, so the two classes cannot be separated using these features.

# Pair-plot

# age vs operation year vs axil nodes plots

In [None]:

# Pair plot: every pairwise scatter of the three numeric features,
# coloured by survival status.
sns.set_style("whitegrid")
numeric_features = ["Age", "Op_Year", "axil_nodes"]
sns.pairplot(haberman, hue="Surv_status", vars=numeric_features, height=4)
plt.show()

# NOTE: the diagonal elements are PDFs for each feature. PDFs are explained below.

Observations:
By scattering the data points between Op_Year and axil_nodes, we can see a better separation between the two classes than in the other scatter plots.

# Histogram

In [None]:
# Histogram (with density estimate) of Age, one overlay per survival class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the suggested replacements once the seaborn version is confirmed.
age_grid = sns.FacetGrid(haberman, hue='Surv_status', height=4)
age_grid.map(sns.distplot, 'Age')
age_grid.add_legend()
plt.title("Histogram of age")
# End with plt.show() like the other plotting cells so the cell does not
# emit the Text(...) repr of the title as its output.
plt.show()

In [None]:
# Histogram (with density estimate) of Op_Year, one overlay per survival class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the suggested replacements once the seaborn version is confirmed.
year_hist_grid = sns.FacetGrid(haberman, hue='Surv_status', height=4)
year_hist_grid.map(sns.distplot, 'Op_Year')
year_hist_grid.add_legend()
plt.title("Histogram of operation_year")
# End with plt.show() like the other plotting cells so the cell does not
# emit the Text(...) repr of the title as its output.
plt.show()

In [None]:
# Histogram (with density estimate) of axil_nodes, one overlay per survival class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the suggested replacements once the seaborn version is confirmed.
nodes_hist_grid = sns.FacetGrid(haberman, hue='Surv_status', height=4)
nodes_hist_grid.map(sns.distplot, 'axil_nodes')
nodes_hist_grid.add_legend()
plt.title("Histogram of axillary_lymph_node")
# End with plt.show() like the other plotting cells so the cell does not
# emit the Text(...) repr of the title as its output.
plt.show()

observations

In all the plots the two classes overlap each other massively. Still, we can roughly say that
probably 58% of people who had 0-5 axillary lymph nodes survived, and 12% died as well.

# PDF, CDF

In [None]:
## PDF and CDF of axil_nodes for the patients who survived
haberman_status = haberman.loc[haberman["Surv_status"] == "yes"]

# Histogram over 10 equal-width bins; dividing by the total turns the bin
# heights into per-bin probabilities that sum to 1, so cumsum gives the CDF.
counts, bin_edges = np.histogram(haberman_status['axil_nodes'], bins=10, density=True)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)

# Plot against the right edge of each bin.
plt.plot(bin_edges[1:], pdf, label='Survived_PDF')
plt.plot(bin_edges[1:], cdf, label='Survived_CDF')
plt.legend()
# The plotted feature is axil_nodes, not Age - label it accordingly.
plt.xlabel("axil_nodes of survived")
plt.title("pdf and cdf for axil_nodes of survived")
plt.show()

Observation:
From the above graph we can say that 100% of the "yes" group had fewer than 40 detected axillary nodes and 4% had fewer than 10 detected axillary nodes.

In [None]:

## PDF and CDF of axil_nodes for the patients who died
haberman_dead = haberman.loc[haberman['Surv_status'] == "no"]

# Histogram over 10 equal-width bins; dividing by the total turns the bin
# heights into per-bin probabilities that sum to 1, so cumsum gives the CDF.
counts, bin_edges = np.histogram(haberman_dead['axil_nodes'], bins=10, density=True)
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)

# Plot against the right edge of each bin. The curves describe the patients
# who died, so the legend must say so (it used to say "Survived").
plt.plot(bin_edges[1:], pdf, label='Died_PDF')
plt.plot(bin_edges[1:], cdf, label='Died_CDF')
plt.legend()
# The plotted feature is axil_nodes, not Age - label it accordingly.
plt.xlabel("axil_nodes of dead")
plt.title("pdf and cdf for axil_nodes of people who died")
plt.show()

In [None]:
#Reference: https://www.kaggle.com/vj1998/haberman-s-survival-exploratory-data-analysis
# PDF and CDF of axil_nodes for both classes on one figure.
# The curves are drawn in the order survived -> died and labelled as they are
# plotted, so the legend always matches the data. Previously the died group
# and the whole dataset were plotted under "Survived"/"Died" labels.
for status, group_name in (("yes", "Survived"), ("no", "Died")):
    nodes = haberman.loc[haberman["Surv_status"] == status, "axil_nodes"]
    counts, bin_edges = np.histogram(nodes, bins=10, density=True)
    pdf = counts / counts.sum()   # per-bin probabilities, summing to 1
    cdf = np.cumsum(pdf)
    # Plot against the right edge of each bin.
    plt.plot(bin_edges[1:], pdf, label=group_name + "_PDF")
    plt.plot(bin_edges[1:], cdf, label=group_name + "_CDF")

plt.legend(title="Surv_status")
# The plotted feature is axil_nodes, not Age - label it accordingly.
plt.xlabel("axil_nodes")
plt.title("pdf and cdf for axil_nodes of survived and died")
plt.show()

Observation:
The probability distribution function (PDF) of the people who survived differs from that of those who died in the case of axillary nodes. Hence, axillary nodes is the most important feature for predicting the survival status after 5 years.
The survival rate is extremely high for patients having fewer than 3 detected axillary nodes.

# Box plot and Whiskers

In [None]:
# Box plot of Age for each survival class.
age_box = sns.boxplot(data=haberman, x="Surv_status", y="Age", hue="Surv_status")
age_box.set_title("Box plot for survival_status and age")
plt.show()

In [None]:
# Box plot of Op_Year for each survival class.
# The title used to be a copy of the Age plot's title; it now names the
# feature that is actually on the y-axis.
year_box = sns.boxplot(x="Surv_status", y="Op_Year", hue="Surv_status", data=haberman)
year_box.set_title("Box plot for survival_status and operation_year")
plt.show()

In [None]:
# Box plot of axil_nodes for each survival class.
# The title used to be a copy of the Age plot's title; it now names the
# feature that is actually on the y-axis.
nodes_box = sns.boxplot(x="Surv_status", y="axil_nodes", hue="Surv_status", data=haberman)
nodes_box.set_title("Box plot for survival_status and axillary_lymph_node")
plt.show()

Observation:
For the "yes" group, the 25th and 50th percentiles of detected axillary nodes overlap, so only the 75th percentile shows up as a distinct edge of the box.
For the "no" group, the 25th percentile is visibly above the bottom of the axis, the 50th percentile of detected axillary nodes is 3 and the 75th percentile is 11.

# Violin plots

In [None]:
# A violin plot combines the benefits of the previous two plots
# (histogram/PDF and box plot) and simplifies them.
#
# Denser regions of the data are fatter, and sparser ones thinner,
# in a violin plot.
#
# `size` is not a violinplot parameter (it belongs to figure-level helpers),
# so it has been dropped: older seaborn ignored it and newer seaborn forwards
# unknown keywords to matplotlib, where it is rejected.
sns.violinplot(x="Surv_status", y="Age", data=haberman)
plt.title("Violin plot for survival_status and age")
plt.show()

#  Summarizing plots in english
* Explain your findings/conclusions in plain English
* Never forget your objective (the problem you are solving). Perform all of your EDA aligned with your objectives.

# Univariate, bivariate and multivariate analysis.

In [None]:
# 5.1 Distribution plots
#
# * Distribution plots are used to visually assess how the data points are
#   distributed with respect to their frequency.
# * Usually the data points are grouped into bins, and the height of the bar
#   representing each group increases with the number of data points that lie
#   within that group (histogram).
# * The Probability Density Function (PDF) describes how likely the variable is
#   to take a value near x (a smoothed version of the histogram).
# * Kernel Density Estimation (KDE) is a way to estimate the PDF. The area
#   under the KDE curve is 1.
# * Here the height of a bar reflects the share of data points falling in the
#   corresponding group.
#
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the suggested replacements once the seaborn version is confirmed.

# Every column except the target is a feature; drop the target by name rather
# than by position so the loop does not depend on column order.
feature_columns = haberman.columns.drop('Surv_status')
for feature in feature_columns:
    fg = sns.FacetGrid(haberman, hue='Surv_status', height=5)
    fg.map(sns.distplot, feature).add_legend()
    plt.show()


observations

In all the plots the two classes overlap each other massively. Still, we can roughly say that probably 58% of people who had 0-5 axillary lymph nodes survived, and 12% died as well.

# Multivariate Analysis

In [None]:
# Pair plot of the three numeric features, coloured by survival status.
pairplot_features = ["Age", "Op_Year", "axil_nodes"]
sns.pairplot(haberman, vars=pairplot_features, hue="Surv_status", height=4)
plt.show()


Observations: By scattering the data points between Op_Year and axil_nodes, we can see a better separation between the two classes than in the other scatter plots.

# Contour plot.


In [None]:
# 2-D density (contour) plot of Age against Op_Year, with the marginal
# densities on the sides of the joint plot.
sns.jointplot(data=haberman, x="Age", y="Op_Year", kind="kde")
plt.show()


# Conclusion: 
By plotting all the PDFs, CDFs, box plots, pair plots, scatter plots etc. we arrive at only one conclusion:
if the number of axillary nodes is low, then the survival of patients is higher.
We need more features to reach a really good conclusion.

# References:
1. https://www.kaggle.com/gokulkarthik/haberman-s-survival-exploratory-data-analysis#6.-Multivariate-Analysis
2. https://www.kaggle.com/vj1998/haberman-s-survival-exploratory-data-analysis
3. https://www.kaggle.com/premvardhan/exploratory-data-analysis-haberman-s-survival#notebook-container
4. https://www.kaggle.com/gokulkarthik/haberman-s-survival-exploratory-data-analysis
5. Assignment-1: Data Visualization with Haberman Dataset - AppliedAICourse
