In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# haberman.csv has no header row. Read with the default header inference, the
# first patient record (30, 64, 1, 1) is consumed as column labels and dropped
# from the data, so pass header=None and name the columns explicitly instead.
df = pd.read_csv(
    '../input/habermans-survival-data-set/haberman.csv',
    header=None,
    names=['Age', 'OP_Year', 'axil_nodes', 'Surv_Status'],
)

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels currently attached to the frame.
df.columns

In [None]:
# Per-column flag: True if the column holds at least one missing value.
df.isnull().any()

In [None]:
# Count of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Show the repeated rows themselves (the first occurrence of each is not flagged).
df[df.duplicated()]


In [None]:
# Discard exact repeat rows (the first occurrence is kept), then display the result.
df = df.drop_duplicates()
df

In [None]:
# Summary statistics for each numeric column.
df.describe()

In [None]:
# Give the columns descriptive names; labels missing from the frame are ignored by rename.
column_names = {
    '30': 'Age',
    '64': 'OP_Year',
    '1': 'axil_nodes',
    '1.1': 'Surv_Status',
}
df = df.rename(columns=column_names)

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Display the current frame to confirm the column names.
df

In [None]:
# Number of patients in each Surv_Status class.
df['Surv_Status'].value_counts()

In [None]:
# Share of the dataset that falls in each Surv_Status class.
status_counts = df['Surv_Status'].value_counts()
status_counts / len(df)

In [None]:
# From the proportions above, we can see this is a very imbalanced dataset: class 1 dominates the target variable.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise plots of the features, coloured by Surv_Status.
sns.pairplot(df, hue='Surv_Status')

In [None]:
# The pairplot shows that no single feature can be used to easily separate the target classes.

In [None]:
# Overlaid Age histograms (with KDE curves), one colour per Surv_Status value.
# FacetGrid's `size` argument was renamed to `height` in seaborn 0.9 and later
# removed; `height` also matches the other FacetGrid cells in this notebook.
sns.FacetGrid(df, hue="Surv_Status", height = 5) \
   .map(sns.histplot, "Age", kde = True) \
   .add_legend();
plt.show();

In [None]:
# From this graph, it is hard to separate survival status based on Age.

In [None]:
# Distribution of Age across all patients.
sns.displot(df['Age'])

In [None]:
# From the graph above, the minimum and maximum ages are 30 and 80, and the median is around 52.


In [None]:
# Age distribution restricted to rows with Surv_Status == 1.
sns.displot(df[df['Surv_Status'] == 1]['Age'])

In [None]:
# Age distribution restricted to rows with Surv_Status == 2, drawn in orange.
sns.displot(df[df['Surv_Status'] == 2]['Age'], color='orange')

In [None]:
# Operation-year histograms with KDE overlays, one colour per Surv_Status value.
year_grid = sns.FacetGrid(df, hue="Surv_Status", height=5)
year_grid.map(sns.histplot, "OP_Year", kde=True)
year_grid.add_legend()
plt.show()

In [None]:
# Histogram of OP_Year across all patients.
sns.histplot(df['OP_Year'])

In [None]:
# The minimum OP_Year is 58, the maximum is 68, and the median is about 63.
# OP_Year does not appear to provide much information either.

In [None]:
# Node-count histograms with KDE overlays, one colour per Surv_Status value.
nodes_grid = sns.FacetGrid(df, hue="Surv_Status", height=5)
nodes_grid.map(sns.histplot, "axil_nodes", kde=True)
nodes_grid.add_legend()
plt.show()

In [None]:
# Histogram of axil_nodes across all patients.
sns.histplot(df['axil_nodes'])

In [None]:
# The minimum is 0 and the maximum is 50.
# The fewer axillary nodes detected, the higher the chance that the patient survives more than 5 years.

In [None]:
# Sanity check: list any rows with a negative node count (such values would be invalid).
df[df['axil_nodes'] < 0]

In [None]:
# How many patients have each axil_nodes value, most frequent first.
df['axil_nodes'].value_counts()

In [None]:
# Number of patients for each (OP_Year, Surv_Status) pair.
df.groupby(['OP_Year', 'Surv_Status']).agg({'Surv_Status':'count'})

In [None]:
# Patient count for each (OP_Year, Surv_Status) pair as a fraction of all rows.
# NOTE(review): the divisor is the total row count, so these are shares of the
# whole dataset, not survival rates within each operation year.
df.groupby(['OP_Year', 'Surv_Status']).agg({'Surv_Status':'count'})/len(df)

In [None]:
# Patients who had the operation in 1958, 1959, 1960, 1961, 1963 or 1964 appear to have a higher chance of survival.

In [None]:
# Split the patients by Surv_Status so each group can be analysed separately.
survived_mask = df["Surv_Status"] == 1
unsurvived_mask = df["Surv_Status"] == 2
df_surv = df.loc[survived_mask]
df_unsurv = df.loc[unsurvived_mask]

In [None]:
# Empirical PDF and CDF of Age for the survived group (df_surv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Both curves are drawn at
# each bin's right edge. The title previously contained a typo and did not say
# which group was plotted; axis labels are added so the figure stands alone.
counts, bin_edges = np.histogram(df_surv['Age'], bins=8, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.title("PDF and CDF of age (survived)")
plt.xlabel("Age")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:],pdf,label="PDF survived")
plt.plot(bin_edges[1:], cdf,label="CDF survived")
plt.legend()
plt.show()



In [None]:
# Empirical PDF and CDF of Age for the non-survived group (df_unsurv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Both curves are drawn at
# each bin's right edge. The title previously contained a typo and did not say
# which group was plotted; axis labels are added so the figure stands alone.
counts, bin_edges = np.histogram(df_unsurv['Age'], bins=8, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.title("PDF and CDF of age (did not survive)")
plt.xlabel("Age")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:],pdf,label="PDF unsurvived")
plt.plot(bin_edges[1:], cdf,label="CDF unsurvived")
plt.legend()
plt.show()

In [None]:
# From the graphs above, it looks like people in the 50-60 age range have a higher survival rate, while people around age 46 have the highest chance of not surviving.

In [None]:
# Empirical PDF and CDF of OP_Year for the survived group (df_surv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Curves are drawn at each
# bin's right edge and labelled so the two lines can be told apart.
counts, bin_edges = np.histogram(df_surv['OP_Year'], bins = 12, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.title("PDF and CDF of operation year (survived)")
plt.xlabel("OP_Year")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:], pdf, label="PDF survived")
plt.plot(bin_edges[1:], cdf, label="CDF survived")
plt.legend()
plt.show()

In [None]:
# Empirical PDF and CDF of OP_Year for the non-survived group (df_unsurv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Curves are drawn at each
# bin's right edge and labelled so the two lines can be told apart.
counts, bin_edges = np.histogram(df_unsurv['OP_Year'], bins = 12, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)

cdf = np.cumsum(pdf)
plt.title("PDF and CDF of operation year (did not survive)")
plt.xlabel("OP_Year")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:], pdf, label="PDF unsurvived")
plt.plot(bin_edges[1:], cdf, label="CDF unsurvived")
plt.legend()
plt.show()

In [None]:
# It looks like both the survival and non-survival rates are high for people operated on from 1958 to 1960. The non-survival rate is also high for people operated on around 1965.

In [None]:
# Empirical PDF and CDF of axil_nodes for the survived group (df_surv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Curves are drawn at each
# bin's right edge and labelled so the two lines can be told apart.
counts, bin_edges = np.histogram(df_surv['axil_nodes'], bins = 8, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.title("PDF and CDF of axillary nodes (survived)")
plt.xlabel("axil_nodes")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:], pdf, label="PDF survived")
plt.plot(bin_edges[1:], cdf, label="CDF survived")
plt.legend()
plt.show()

In [None]:
# From the CDF above, we can observe that the survival rate is about 82% if a patient has fewer than 5 axillary nodes. As the number of nodes increases, the survival rate decreases.

In [None]:
# Empirical PDF and CDF of axil_nodes for the non-survived group (df_unsurv).
# Dividing the bin values by their sum gives the fraction of patients per bin;
# the cumulative sum of those fractions is the CDF. Curves are drawn at each
# bin's right edge and labelled so the two lines can be told apart.
counts, bin_edges = np.histogram(df_unsurv['axil_nodes'], bins = 10, density = True)
pdf = counts/(sum(counts))
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.title("PDF and CDF of axillary nodes (did not survive)")
plt.xlabel("axil_nodes")
plt.ylabel("Fraction of patients")
plt.plot(bin_edges[1:], pdf, label="PDF unsurvived")
plt.plot(bin_edges[1:], cdf, label="CDF unsurvived")
plt.legend()
plt.show()

In [None]:
# The non-survival rate is also highest when the number of nodes is smallest. This means the number of nodes alone cannot be used to determine the survival rate.

In [None]:
# Median Age and median axil_nodes, printed for df_surv first and df_unsurv second.
for heading, column in (
    ("\nMedians Age for surivial and unsurvival:", "Age"),
    ("\nMedians number of nodes for surivial and unsurvival:", "axil_nodes"),
):
    print(heading)
    print(np.median(df_surv[column]))
    print(np.median(df_unsurv[column]))


In [None]:
# 0th, 25th, 50th and 75th percentiles (the 100th is not included) of Age and
# axil_nodes, printed for df_surv first and df_unsurv second.
quartile_points = np.arange(0, 100, 25)

print("\nQuantiles Age for surivial and unsurvival:")
for group in (df_surv, df_unsurv):
    print(np.percentile(group["Age"], quartile_points))

print("\nQuantiles for number of nodes for surivial and unsurvival:")
for group in (df_surv, df_unsurv):
    print(np.percentile(group["axil_nodes"], quartile_points))

In [None]:
# Age spread for each Surv_Status class.
sns.boxplot(data=df, x='Surv_Status', y='Age')
plt.show()

In [None]:
# Not only are the patients who did not survive slightly older than those who survived, but the majority of people who did not survive more than 5 years after the operation are older as well.

In [None]:
# Operation-year spread for each Surv_Status class.
sns.boxplot(data=df, x='Surv_Status', y='OP_Year')
plt.show()

In [None]:
# The IQR for survivors is 60 to 66, while the IQR for non-survivors is 59 to 65.

In [None]:
# Node-count spread for each Surv_Status class.
sns.boxplot(data=df, x='Surv_Status', y='axil_nodes')
plt.show()

# The plot above shows that axil_nodes is strongly positively skewed, especially for the patients who survived.

In [None]:
# Violin plot of Age for each Surv_Status class.
sns.violinplot(data=df, x='Surv_Status', y='Age')

In [None]:
# Violin plot of OP_Year for each Surv_Status class.
sns.violinplot(data=df, x='Surv_Status', y='OP_Year')

In [None]:
# Violin plot of axil_nodes for each Surv_Status class.
sns.violinplot(data=df, x='Surv_Status', y='axil_nodes')

In [None]:
# Joint KDE of Age against OP_Year, restricted to df_surv.
sns.jointplot(data=df_surv, x="Age", y="OP_Year", kind="kde")
plt.show()

In [None]:
# Joint KDE of Age against axil_nodes, restricted to df_surv.
sns.jointplot(data=df_surv, x="Age", y="axil_nodes", kind="kde")
plt.show()

In [None]:
# Joint KDE of OP_Year against axil_nodes, restricted to df_surv.
sns.jointplot(data=df_surv, x="OP_Year", y="axil_nodes", kind="kde")
plt.show()