In [1]:
# Select matplotlib's interactive "notebook" backend for every figure drawn below.
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy as scp
# Global seaborn styling for all figures in this notebook: "white" style, with
# color_codes enabled for the single-letter matplotlib colour shorthands.
sns.set(style="white", color_codes=True)

### Loading the data

In [3]:
# Load the raw hepatitis table; the path is relative, so the CSV must sit in the
# notebook's working directory. Missing values appear as '?' in the file (see
# the preview below), which is why most columns load as object dtype.
hepatitis_data = pd.read_csv("dataset_55_hepatitis.csv")

We can check that the shape of our DataFrame matches the specification provided for the dataset: 155 patients (rows) and 19 features + 1 class (columns).

In [4]:
# Expected: (155, 20) -- one row per patient, 19 features plus the Class column.
hepatitis_data.shape

(155, 20)

### Exploratory Analysis

An important part of making predictions with Machine Learning techniques is to perform Exploratory Data Analysis (EDA). This is useful for getting to know your data, looking at it from different perspectives, and describing and summarizing it without making any assumptions, in order to detect potential problems.


In [5]:
# Preview the first rows to see how the raw values are encoded
# (yes/no and male/female strings, '?' for missing entries).
hepatitis_data.head()

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class
0,30,male,no,no,no,no,no,no,no,no,no,no,no,1.0,85,18,4.0,?,no,LIVE
1,50,female,no,no,yes,no,no,no,no,no,no,no,no,0.9,135,42,3.5,?,no,LIVE
2,78,female,yes,no,yes,no,no,yes,no,no,no,no,no,0.7,96,32,4.0,?,no,LIVE
3,31,female,?,yes,no,no,no,yes,no,no,no,no,no,0.7,46,52,4.0,80,no,LIVE
4,34,female,yes,no,no,no,no,yes,no,no,no,no,no,1.0,?,200,4.0,?,no,LIVE


In [6]:
# Inspect the column dtypes: only AGE is parsed as a number, every other column
# is object -- presumably because of the text codes and the '?' markers.
hepatitis_data.dtypes

AGE                 int64
SEX                object
STEROID            object
ANTIVIRALS         object
FATIGUE            object
MALAISE            object
ANOREXIA           object
LIVER_BIG          object
LIVER_FIRM         object
SPLEEN_PALPABLE    object
SPIDERS            object
ASCITES            object
VARICES            object
BILIRUBIN          object
ALK_PHOSPHATE      object
SGOT               object
ALBUMIN            object
PROTIME            object
HISTOLOGY          object
Class              object
dtype: object

Because machine learning algorithms require numerical data, we will convert the categorical values 'no' and 'yes' to 0 and 1, respectively. Likewise, the binary survival variable `Class`, currently encoded as 'DIE' and 'LIVE', is converted to the numerical categories 0 and 1, respectively. In the same step, `SEX` is encoded as 'female' = 0 and 'male' = 1, and the missing-value marker '?' is replaced with NaN.

In [7]:
# Map every textual code found in the raw table to a number so that the frame
# can be cast to float afterwards. '?' is the dataset's missing-value marker
# and becomes NaN.
replacements = {
    'no': 0,
    'yes': 1,
    'DIE': 0,
    'LIVE': 1,
    '?': np.nan,
    'female': 0,
    'male': 1,
}

# Rebind the result instead of mutating with inplace=True: the replaced frame
# is identical, but the cell no longer edits the loaded object behind the
# reader's back and stays compatible with method chaining.
hepatitis_data = hepatitis_data.replace(replacements)

In [8]:
# Cast the whole frame to float: values still stored as numeric strings are
# parsed, and NaN-bearing columns need a float dtype anyway.
hepatitis_data = hepatitis_data.astype(float)

In [9]:
# Class balance of the outcome variable.
# Note: despite their names, total_of_live_patients / total_of_dead_patients
# hold percentages of the cohort, not absolute counts.
total_of_patients = len(hepatitis_data.index)
survived_mask = hepatitis_data['Class'] == 1
died_mask = hepatitis_data['Class'] == 0
total_of_live_patients = (survived_mask.sum() / total_of_patients) * 100
total_of_dead_patients = (died_mask.sum() / total_of_patients) * 100

for label, share in (("Living patients:", total_of_live_patients),
                     ("Dead patients:", total_of_dead_patients)):
    print(label, round(share, 2), "%")

Living patients: 79.35 %
Dead patients: 20.65 %


In [10]:
# Continuous measurements; summarise each one (count, mean, spread, quartiles).
# The count row shows how many non-missing values each column has.
numerical_variables = [
    'AGE',
    'BILIRUBIN',
    'PROTIME',
    'ALBUMIN',
    'ALK_PHOSPHATE',
    'SGOT',
]
hepatitis_data.loc[:, numerical_variables].describe()

Unnamed: 0,AGE,BILIRUBIN,PROTIME,ALBUMIN,ALK_PHOSPHATE,SGOT
count,155.0,149.0,88.0,139.0,126.0,151.0
mean,41.2,1.427517,61.852273,3.817266,105.325397,85.89404
std,12.565878,1.212149,22.875244,0.651523,51.508109,89.65089
min,7.0,0.3,0.0,2.1,26.0,14.0
25%,32.0,0.7,46.0,3.4,74.25,31.5
50%,39.0,1.0,61.0,4.0,85.0,58.0
75%,50.0,1.5,76.25,4.2,132.25,100.5
max,78.0,8.0,100.0,6.4,295.0,648.0


In [11]:
# Binary (0/1) columns; tabulate how often each value occurs per column.
# value_counts skips NaN, so a column's counts may sum to fewer than 155.
categorical_variables = [
    'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA',
    'LIVER_BIG', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'SPIDERS', 'ASCITES',
    'VARICES', 'HISTOLOGY',
]
hepatitis_data[categorical_variables].apply(lambda column: column.value_counts())

Unnamed: 0,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,HISTOLOGY
0.0,139,76,131,54,93,122,25,84,120,99,130,132,85
1.0,16,78,24,100,61,32,120,60,30,51,20,18,70


In [12]:
# Build the frame used for the plots below.
# Rows are dropped if ANY column of the full table is missing (not only the
# selected ones); the column subset is taken afterwards.
interesting_values_x = [
    'ALBUMIN',
    'ANOREXIA',
    'ALK_PHOSPHATE',
    'ASCITES',
    'BILIRUBIN',
    'PROTIME',
    'SGOT',
    'Class',
]
hepatitis_analysis = hepatitis_data.dropna().loc[:, interesting_values_x]
hepatitis_analysis.head()

Unnamed: 0,ALBUMIN,ANOREXIA,ALK_PHOSPHATE,ASCITES,BILIRUBIN,PROTIME,SGOT,Class
5,4.0,0.0,95.0,0.0,0.9,75.0,28.0,1.0
10,4.4,0.0,78.0,0.0,1.3,85.0,30.0,1.0
11,3.7,0.0,59.0,0.0,1.0,54.0,249.0,1.0
12,3.9,0.0,81.0,0.0,0.9,52.0,60.0,1.0
13,4.9,0.0,57.0,0.0,2.2,78.0,144.0,1.0


In [13]:
# Side-by-side distributions (histogram + KDE) of SGOT and ALK_PHOSPHATE.
plt.figure(figsize=(6, 3.5))
panels = [
    # (column, KDE line colour, histogram colour, histogram alpha)
    ('SGOT', "blue", "green", 0.3),
    ('ALK_PHOSPHATE', "red", "pink", 0.6),
]
for position, (column, kde_color, hist_color, hist_alpha) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.distplot(hepatitis_analysis[column],
                 kde_kws={"color": kde_color, "lw": 1.5, "alpha": 0.8},
                 hist_kws={"color": hist_color, "alpha": hist_alpha})
# Strip the top/right spines of the figure's axes in one go.
sns.despine();

<IPython.core.display.Javascript object>

In [14]:
# Side-by-side distributions (histogram + KDE) of BILIRUBIN and ALBUMIN.
plt.figure(figsize=(7, 3.5))
panels = [
    # (column, KDE line colour, histogram colour, histogram alpha)
    ('BILIRUBIN', "green", "lightblue", 0.8),
    ('ALBUMIN', "red", "orange", 0.3),
]
for position, (column, kde_color, hist_color, hist_alpha) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.distplot(hepatitis_analysis[column],
                 kde_kws={"color": kde_color, "lw": 1.5, "alpha": 0.8},
                 hist_kws={"color": hist_color, "alpha": hist_alpha})
    # Remove the top/right spines after each panel is drawn.
    sns.despine()

<IPython.core.display.Javascript object>

In [15]:
# Distribution (histogram + KDE) of PROTIME.
# Only the left cell of the 1x2 grid is used, so the panel has the same width
# as the panels in the two-plot figures above.
plt.figure(figsize=(6, 3.5))
plt.subplot(1, 2, 1)
protime_kde_style = {"color": "orange", "lw": 1.5, "alpha": 0.8}
protime_hist_style = {"color": "yellow", "alpha": 0.3}
sns.distplot(hepatitis_analysis['PROTIME'],
             kde_kws=protime_kde_style,
             hist_kws=protime_hist_style)
sns.despine();

<IPython.core.display.Javascript object>

In [16]:
# Replace the selected lab measurements with their natural logarithm
# (PROTIME and the binary columns are left on their original scale).
#
# np.log is applied to the whole sub-frame at once instead of going through
# applymap: the result is the same, but it is vectorised and avoids applymap,
# which recent pandas releases deprecate.
#
# NOTE: this cell overwrites hepatitis_analysis in place, so it is not
# idempotent -- run it exactly once after hepatitis_analysis is built,
# otherwise the logarithm is applied twice.
log_scaled_columns = ['ALBUMIN', 'ALK_PHOSPHATE', 'BILIRUBIN', 'SGOT']
hepatitis_analysis[log_scaled_columns] = np.log(hepatitis_analysis[log_scaled_columns])
hepatitis_analysis.head()

Unnamed: 0,ALBUMIN,ANOREXIA,ALK_PHOSPHATE,ASCITES,BILIRUBIN,PROTIME,SGOT,Class
5,1.386294,0.0,4.553877,0.0,-0.105361,75.0,3.332205,1.0
10,1.481605,0.0,4.356709,0.0,0.262364,85.0,3.401197,1.0
11,1.308333,0.0,4.077537,0.0,0.0,54.0,5.517453,1.0
12,1.360977,0.0,4.394449,0.0,-0.105361,52.0,4.094345,1.0
13,1.589235,0.0,4.043051,0.0,0.788457,78.0,4.969813,1.0


In [17]:
# Pairwise scatter plots of the lab measurements, coloured by outcome (Class).
lab_measurements = ['BILIRUBIN', 'PROTIME', 'ALBUMIN', 'ALK_PHOSPHATE', 'SGOT']
g = sns.pairplot(
    hepatitis_analysis,
    x_vars=lab_measurements,
    y_vars=lab_measurements,
    hue='Class',
    kind='scatter',
    palette='husl',
    # NOTE(review): newer seaborn releases name this parameter `height` --
    # confirm against the installed version before renaming.
    size=2,
    plot_kws={"s": 35, "alpha": 0.8},
)
# Reposition the figure's last child artist (presumably the legend -- this
# relies on child ordering, so verify if the plot layout changes).
pairplot_legend = g.fig.get_children()[-1]
pairplot_legend.set_bbox_to_anchor((0.05, 0.9, 0.18, 0.1));


<IPython.core.display.Javascript object>

In [18]:
# Swarm plots of each lab measurement split by ANOREXIA / ASCITES and coloured
# by Class. This uses the full table (hepatitis_data), not the dropna subset.
swarm_x_vars = ["ANOREXIA", "ASCITES"]
swarm_y_vars = ['BILIRUBIN', 'PROTIME', 'ALBUMIN', 'ALK_PHOSPHATE', 'SGOT']
graph = sns.PairGrid(
    hepatitis_data,
    x_vars=swarm_x_vars,
    y_vars=swarm_y_vars,
    hue='Class',
)
graph.map(sns.swarmplot, s=6)
graph.add_legend(frameon=True, bbox_to_anchor=(0.33, 0.96));

<IPython.core.display.Javascript object>

In [19]:
# Re-encode the categorical columns as numeric category codes for the
# correlation analysis below.
#
# A plain pd.factorize(x)[0] numbers categories in order of first appearance
# and marks missing values with -1. Used as-is, that flips the 0/1 meaning of
# any column whose first row is 1 (SEX starts with 'male' = 1), and it turns
# NaN into a real value (-1) that dropna() no longer removes and that then
# distorts the Pearson coefficients. sort=True keeps the codes in value order
# (0 -> 0, 1 -> 1), and the -1 sentinel is mapped back to NaN so missing
# entries stay missing. This also makes the cell safe to re-run.
def _factorize_keep_nan(column):
    """Return value-ordered category codes for `column`, preserving NaN."""
    codes = pd.factorize(column, sort=True)[0]
    encoded = pd.Series(codes, index=column.index, dtype=float)
    # factorize marks missing values with -1; restore them as NaN.
    return encoded.where(encoded >= 0)


hepatitis_data[categorical_variables] = hepatitis_data[categorical_variables].apply(_factorize_keep_nan)

In [20]:
# Pearson correlation between all columns, computed only on rows that contain
# no NaN at all.
corr = hepatitis_data.dropna().corr(method = 'pearson')

In [21]:
# Heatmap of the correlation matrix with a diverging palette.
plt.figure(figsize=(7, 7))
cmap = sns.diverging_palette(240, 10, n=9, center='light')
heatmap_options = {
    "linewidths": .2,
    "cmap": cmap,
    "cbar_kws": {"shrink": .8},
    "square": True,
    # Label every row/column rather than letting seaborn thin the ticks out.
    "xticklabels": True,
    "yticklabels": True,
}
sns.heatmap(corr, **heatmap_options)
plt.yticks(size=8, rotation=0)
plt.xticks(size=8, rotation=90);

<IPython.core.display.Javascript object>

We can see that a few pairs of variables reach a coefficient of about 0.6 or -0.4, but most pairs display a very low correlation coefficient. We can therefore conclude that there is no strong linear correlation between the variables.