In [None]:


import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)



In [None]:
# Load the Haberman survival data.
# The file appears to ship without a header row (the column labels come out
# numeric when it is read naively), so header=None is required: otherwise
# pandas consumes the first patient record as column names and the frame ends
# up one row short. The UCI description of this dataset lists 306 instances.
# NOTE(review): confirm the CSV is header-less; summary statistics quoted in
# the notebook's prose were computed on the shorter frame and may shift slightly.
dataset=pd.read_csv(
    "/kaggle/input/habermans-survival-data-set/haberman.csv",
    header=None,
)
dataset.head()

**Since the column names are numerical, we rename them to descriptive strings, which are easier to work with.**

In [None]:
# Give the columns descriptive labels; the names are assigned positionally,
# so the list must stay in the same order as the columns in the file.
column_labels = ["Age", "Oper_Year", "Axil_node", "Surviv_stat"]
dataset.columns = column_labels
dataset.head()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
dataset.shape

In [None]:
# Recode the target column: label 1 stays 1 and label 2 becomes 0.
# (Per the UCI dataset description, 1 = survived 5 years or longer and
# 2 = died within 5 years — confirm against the data source.)
#
# Series.replace is vectorised and leaves values that are not keys of the
# mapping untouched, so already-recoded 0s survive a second execution of this
# cell. The previous per-row dict lookup raised KeyError on re-run.
survival_status={1:1,2:0}
dataset['Surviv_stat']=dataset['Surviv_stat'].replace(survival_status)

# Keep the loud failure the dict lookup used to give for unknown labels:
# after recoding, only 0 and 1 may remain.
unexpected_labels = set(dataset['Surviv_stat'].unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(
        "Unexpected Surviv_stat labels: {}".format(sorted(unexpected_labels))
    )

In [None]:
# Number of rows for each distinct value of the target column.
dataset['Surviv_stat'].value_counts()

**Checking whether there are any missing values.**

In [None]:
# Visual check for gaps: heatmap of the boolean missing-value mask,
# with the per-row tick labels switched off.
sns.heatmap(dataset.isnull(),yticklabels=False)

In [None]:
# Count of missing values in each column.
dataset.isnull().sum()

In [None]:
# Concise summary of the frame: column dtypes, non-null counts and memory usage.
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

**Counting the discrete values in each column (keeping values that occur more than 10 times)**

In [None]:
# Collect, for every column, each distinct value that occurs more than
# 10 times, stored as [column_name, [value, occurrence_count]].
discrete = []
for feature in dataset.columns:
    distinct_values, occurrences = np.unique(dataset[feature], return_counts=True)
    discrete.extend(
        [feature, [value, occurrence]]
        for value, occurrence in zip(distinct_values, occurrences)
        if occurrence > 10
    )

discrete

**UNIVARIATE ANALYSIS**

In [None]:
# Per-class distribution of each predictor, one figure per column.
#
# * sns.distplot is deprecated; histplot with kde=True and stat="density"
#   draws the same density-normalised histogram plus KDE curve
#   (requires seaborn >= 0.11).
# * The target column is skipped: plotting Surviv_stat split by Surviv_stat
#   gives one constant-valued series per class, which carries no information
#   and has zero variance for the KDE.
# * The style only needs to be set once, so it is hoisted out of the loop.
sns.set_style("whitegrid")
for features in dataset.columns.drop("Surviv_stat"):
    grid = sns.FacetGrid(dataset, hue="Surviv_stat", height=6)
    grid.map(sns.histplot, features, kde=True, stat="density")
    grid.add_legend()

    plt.title(features, fontsize=17)
    plt.show()

* Non-survival counts are highest for ages 40 to 58, while survival counts are highest for ages 40 to 60.
* Non-survival counts are highest for operation years 57.5 to 66, while survival counts are highest for operation years 57.5 to 67.5.
* Patients with an Axil node value of 0 have a higher survival count.

In [None]:
# Box plot of each predictor, grouped by survival class (one figure per column).
# The target column is excluded: a box plot of Surviv_stat against itself
# collapses to a flat line per class and adds nothing to the analysis.
for features in dataset.columns.drop("Surviv_stat"):
    sns.boxplot(x="Surviv_stat",y=features,data=dataset)
    plt.title("Box-plot for survival stat and "+features)
    plt.show()

* The interquartile range of age lies between 48 and 62 for non-survivors and between 42 and 60 for survivors.
* The interquartile range of operation year lies between 59 and 65 for non-survivors and between 60 and 66 for survivors.
* The Axil node feature contains many outliers.

**A violin plot combines a box plot and a distribution plot.**

In [None]:
# Violin plot of Age for each value of Surviv_stat.
violin_axes = sns.violinplot(data=dataset, x="Surviv_stat", y="Age")
plt.show()

**BIVARIATE ANALYSIS**

In [None]:
# Scatter of Age against Axil_node, one colour per Surviv_stat value.
sns.set_style("whitegrid")
scatter_grid = sns.FacetGrid(dataset, hue="Surviv_stat", height=5)
scatter_grid.map(plt.scatter, "Age", "Axil_node")
scatter_grid.add_legend()
plt.show()

In [None]:

# Grid of pairwise plots for the frame, coloured by Surviv_stat.
sns.set_style("whitegrid")
pair_grid = sns.pairplot(data=dataset, hue="Surviv_stat", height=3)
plt.show()

* The classes are not linearly separable on these features.

**Quantiles**

In [None]:
# For every column, print the 0th/25th/50th/75th percentiles and the
# difference between the 75th and 25th (the interquartile range).
print("\nQuantiles:")
percent_points = np.arange(0, 100, 25)  # 0, 25, 50, 75
for feature in dataset.columns:
    quartile_values = np.percentile(dataset[feature], percent_points)
    print("Quantiles of {} is {}".format(feature, quartile_values))
    lower_quartile, upper_quartile = quartile_values[1], quartile_values[3]
    print("Inter quantile range of {} is {}".format(feature, upper_quartile - lower_quartile))
    print("\n")

**Contour Plot**

In [None]:
# Joint kernel-density (contour) plot of Age against Axil_node.
joint_grid = sns.jointplot(data=dataset, x="Age", y="Axil_node", kind="kde")
plt.show()

# OBSERVATIONS:

* The given dataset poses a binary classification problem: the task is to determine whether the person described by the features survived 5 years after the operation or not.
* First, the dataset was preprocessed by renaming the numerical column names to appropriate strings.
* The dataset doesn't have any missing values.
* The numerical features contain discrete values that show a dependency on the target values.
* Non-survival counts are highest for ages 40 to 58, while survival counts are highest for ages 40 to 60.
* Non-survival counts are highest for operation years 57.5 to 66, while survival counts are highest for operation years 57.5 to 67.5.
* Patients with an Axil node value of 0 have a higher survival count.
* The Axil node feature contains more outliers than the other features.
* The interquartile range of Age is 17.0
* The interquartile range of Operation year is 6.0
* The interquartile range of Axil node is 4.0
* The given dataset is not linearly separable
