In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Reading the Dataset

In [None]:
# Location of the hepatitis CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/hepatitis-data/hepatitis_csv.csv"

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Listing the columns provided in the dataset

In [None]:
# Display the column labels of the loaded dataframe.
df.columns

Getting the shape => (rows, cols) of the dataset

In [None]:
# Display the (rows, columns) tuple of the dataframe.
df.shape

Getting some information for each of the columns in the dataset

In [None]:
# Print per-column dtype and non-null count information.
df.info()

Describing each of the columns of the dataset by values such as count, mean, standard deviation, minimum value,maximum value, Quarter Percentile, Half Percentile and 75% Percentile

In [None]:
# Summary statistics for the columns that describe() covers by default.
df.describe()

Counting how many null (NA) values there are in each of the columns of the dataset

In [None]:
# Count the missing values in each column (isna is the canonical name for isnull).
df.isna().sum()

# Seeing the Correlation(statistical relationship) among the various features of the dataset. 

A positive correlation means a variable increases / decreases as the other variable increases / decreases respectively. A negative correlation means a variable increases / decreases as the other variable decreases / increases respectively.

In [None]:
# Pairwise correlation of the numeric/boolean columns only.
# The frame also holds string/object columns; since pandas 2.0 `DataFrame.corr()`
# no longer drops those silently and raises instead, so select the usable
# columns explicitly (this works on older pandas versions too).
df.select_dtypes(include=["number", "bool"]).corr()

Dropping the 'protime' column in the dataframe as it contains a lot of null values and is of no use in the EDA

In [None]:
# Drop the 'protime' column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns="protime", errors="ignore")
df.shape

Creating a dataframe which only contains numeric data of the main dataset

In [None]:
# Keep only the numeric and boolean columns of the main dataframe.
# Uses the public `select_dtypes` API rather than the private
# `_get_numeric_data()` helper; `.copy()` makes the result an independent frame
# so later modifications of `numeric_data` cannot affect (or warn about) `df`.
numeric_data = df.select_dtypes(include=["number", "bool"]).copy()
numeric_data.head()

Dropping the columns 'antivirals' and 'histology' as they are boolean/categorical flags rather than continuous numeric measurements

In [None]:
# Remove the two flag columns in a single call. errors='ignore' plus rebinding
# keeps the cell idempotent, so re-running it does not raise a KeyError once the
# columns have already been removed.
numeric_data = numeric_data.drop(columns=['antivirals', 'histology'], errors='ignore')

In [None]:
# Print dtype and non-null count information for the numeric-only frame.
numeric_data.info()

# Creating a Correlation Plot to understand the correlation among various features in a better way visually

Using Seaborn Library

In [None]:
# Annotated correlation heatmap of the numeric/boolean features.
sns.set(style="ticks", context="talk")
plt.figure(figsize=(20, 12))
# Select the numeric/bool columns explicitly: with pandas >= 2.0 a bare df.corr()
# raises on the string/object columns instead of skipping them.
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    cmap='viridis',
)

The correlation plot here shows us that the features 'bilirubin' and 'alk_phosphate' both have a higher positive correlation with 'histology', and that both have a higher negative correlation with 'albumin'

Using the plotly.graph_objects library

In [None]:
# Interactive correlation heatmap built with plotly.graph_objects.
# The correlation matrix is computed once (the original recomputed it three
# times) and only over numeric/bool columns, because a bare df.corr() raises on
# string/object columns with pandas >= 2.0.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
x = list(corr_matrix.columns)
y = list(corr_matrix.index)
values = corr_matrix.to_numpy()
fig = go.Figure(
    data=go.Heatmap(
        x=x,
        y=y,
        z=values,
        hoverongaps=False,
    )
)
fig.show()

Creating a displot using seaborn library

distplot (now deprecated in favour of 'displot') => combined the matplotlib hist function with the seaborn kdeplot() and rugplot() functions

In [None]:
# Histogram of patient age.
# `sns.displot` is a figure-level function that always creates its own figure,
# so a preceding plt.figure(figsize=...) only produced an empty extra figure.
# The intended 20x10 inch size is requested through height/aspect instead
# (width = height * aspect).
sns.displot(df["age"], bins=40, height=10, aspect=2)

This shows that most of the people in the dataset have age between 25-50(approx.) with maximum people having age = 36-38 and 50-55. Only a few people with age<=30 and age>=62 suffer from Hepatitis

Creating a KDE Plot using seaborn library

KDE plot - Kernel Density Estimation Plot - visualizes the distribution of observations in a dataset

In [None]:
# Kernel density estimate of the age distribution.
age_values = df["age"]
sns.kdeplot(x=age_values)

Similar to the displot, the KDE plot shows a somewhat clearer distribution of age, and we can clearly see that the most common age of people suffering from Hepatitis lies in the range 35-40

In [None]:
# Interactive histogram of age (bracket access avoids clashes with DataFrame attributes).
px.histogram(df["age"])

Various Plots for Checking the Bilirubin distribution for various integer values

In [None]:
# Histogram of bilirubin values.
# `sns.displot` is figure-level and creates its own figure, so the former
# plt.figure(figsize=...) call only left an empty figure behind; the 20x10 inch
# size is requested via height/aspect instead (width = height * aspect).
sns.displot(df["bilirubin"], bins=40, height=10, aspect=2)

In [None]:
# Kernel density estimate of the bilirubin distribution.
bilirubin_values = df["bilirubin"]
sns.kdeplot(x=bilirubin_values)

In [None]:
# Interactive histogram of bilirubin values.
px.histogram(df["bilirubin"])

All these plots indicate that most of the Hepatitis patients in the dataset have lower bilirubin values (between 0-2)

In [None]:
# Histogram of SGOT values.
# `sns.displot` is figure-level and creates its own figure, so the former
# plt.figure(figsize=...) call only left an empty figure behind; the 20x10 inch
# size is requested via height/aspect instead (width = height * aspect).
sns.displot(df["sgot"], bins=40, height=10, aspect=2)

In [None]:
# Kernel density estimate of the SGOT distribution.
sgot_values = df["sgot"]
sns.kdeplot(x=sgot_values)

In [None]:
# Interactive histogram of SGOT values.
px.histogram(df["sgot"])

In [None]:
# Histogram of alkaline phosphate values.
# `sns.displot` is figure-level and creates its own figure, so the former
# plt.figure(figsize=...) call only left an empty figure behind; the 20x10 inch
# size is requested via height/aspect instead (width = height * aspect).
sns.displot(df["alk_phosphate"], bins=40, height=10, aspect=2)

In [None]:
# Kernel density estimate of the alkaline phosphate distribution.
alk_phosphate_values = df["alk_phosphate"]
sns.kdeplot(x=alk_phosphate_values)

In [None]:
# Interactive histogram of alkaline phosphate values.
px.histogram(df["alk_phosphate"])

This shows that most people have an alkaline phosphate value of 80-100

# Comparing the Gender Distribution in the given Dataset

Bar Chart for the Gender Distribution using matplotlib

In [None]:
# Bar chart of how many records fall into each 'sex' category.
plt.figure(figsize=(20, 10))
sex_counts = df['sex'].value_counts()
sex_counts.plot.bar(color='blue', title='Gender Distribution')

Piechart for the gender Distribution

In [None]:
# Pie chart of the gender distribution.
# Count each category once and hand Plotly a tidy two-column frame. The original
# passed the full-length 'sex' Series as data_frame (unused), recomputed
# value_counts() three times, and gave `labels` an Index although that argument
# expects a dict mapping column names to display labels.
sex_counts = (
    df['sex']
    .value_counts()
    .rename_axis('sex')          # name the category column explicitly
    .reset_index(name='count')   # stable column names across pandas versions
)
px.pie(
    sex_counts,
    names='sex',
    values='count',
    title='Gender Distribution in the Data',
)

Histogram plot for the gender distribution using plotly.express

In [None]:
# Interactive histogram (category counts) of the 'sex' column.
px.histogram(df["sex"])

This shows that most of the people in the given dataset are female, which suggests (within this sample) that females are more prone to Hepatitis than males

In [None]:
# Count how many records fall into each value of the 'class' column.
df['class'].value_counts()

In [None]:
# Interactive histogram (category counts) of the 'class' column.
px.histogram(df['class'])

In [None]:
# Pie chart of the 'class' distribution.
# Count each category once and hand Plotly a tidy two-column frame. The original
# passed the full-length 'class' Series as data_frame (unused), recomputed
# value_counts() three times, and gave `labels` an Index although that argument
# expects a dict mapping column names to display labels.
class_counts = (
    df['class']
    .value_counts()
    .rename_axis('class')        # name the category column explicitly
    .reset_index(name='count')   # stable column names across pandas versions
)
px.pie(
    class_counts,
    names='class',
    values='count',
    title='Distribution of Deaths vs Alive in the Data',
)

# Making Histograms for each of the features in the dataset

In [None]:
from plotly.subplots import make_subplots

# Draw one histogram per feature on a grid of subplots.
features = ['age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia',
       'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites',
       'varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin',
       'histology', 'class']
cols = 5
# Ceiling division: derive the row count from the number of features so every
# feature gets a subplot. The previous fixed 3x5 grid only had room for 15 of
# the 19 features and silently dropped the last four.
rows = -(-len(features) // cols)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=features)
for position, feature in enumerate(features):
    # Fill the grid row by row; plotly uses 1-based row/col indices.
    row_idx, col_idx = divmod(position, cols)
    fig.add_trace(
        go.Histogram(x=df[feature].values),
        row=row_idx + 1,
        col=col_idx + 1,
    )

fig.update_layout(height=900, width=900, title_text='Feature Distribution', showlegend=False)
fig.show()

# Making Boxplots for each of the features in the dataset

In [None]:
# Draw one boxplot per feature on a grid of subplots.
box_cols = ['age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia',
       'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites',
       'varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin',
       'histology', 'class']
cols = 5
# Ceiling division: derive the row count from the number of columns so every
# feature gets a subplot. The previous fixed 3x5 grid only had room for 15 of
# the 19 features and silently dropped the last four.
rows = -(-len(box_cols) // cols)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=box_cols)
# No bare `except: break` here: iterating over the columns themselves cannot run
# past the end of the list, and any real plotting error should surface instead
# of being swallowed.
for position, column in enumerate(box_cols):
    # Fill the grid row by row; plotly uses 1-based row/col indices.
    row_idx, col_idx = divmod(position, cols)
    fig.add_trace(
        go.Box(x=df[column].values, name=''),
        row=row_idx + 1,
        col=col_idx + 1,
    )

fig.update_layout(height=900, width=900, title_text='Boxplot Distribution', showlegend=False)
fig.show()