In [28]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator,TransformerMixin
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# load data 

In [2]:
if not os.path.isfile("../data/indian_liver_patient_dataset.csv"):
    df = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_dataset.csv")
    df.to_csv("../data/indian_liver_patient_dataset.csv")
else:
    df = pd.read_csv("../data/indian_liver_patient_dataset.csv")

# EDA

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  500 non-null    int64  
 1   Age                         500 non-null    int64  
 2   Gender                      500 non-null    object 
 3   Total_Bilirubin             500 non-null    float64
 4   Direct_Bilirubin            500 non-null    float64
 5   Alkaline_Phosphotase        500 non-null    int64  
 6   Alamine_Aminotransferase    500 non-null    int64  
 7   Aspartate_Aminotransferase  500 non-null    int64  
 8   Total_Protiens              500 non-null    float64
 9   Albumin                     500 non-null    float64
 10  Albumin_and_Globulin_Ratio  496 non-null    float64
 11  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 47.0+ KB


### we have missing values [?]

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,496.0,500.0
mean,249.5,44.586,2.6038,1.1172,296.372,82.736,108.82,6.4448,3.1688,0.960907,1.3
std,144.481833,16.5334,5.120238,2.066709,257.461676,194.366775,307.093557,1.08902,0.799741,0.294289,0.458717
min,0.0,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,124.75,32.75,0.8,0.2,175.0,23.0,24.75,5.7,2.6,0.795,1.0
50%,249.5,45.0,0.9,0.3,205.5,33.5,40.0,6.5,3.1,1.0,1.0
75%,374.25,58.0,2.2,1.0,298.0,59.25,79.25,7.2,3.8,1.1,2.0
max,499.0,85.0,75.0,14.2,2110.0,2000.0,4929.0,9.6,5.5,1.9,2.0


In [6]:
df.shape

(500, 12)

In [7]:
df['Liver_Problem'].value_counts()

Liver_Problem
1    350
2    150
Name: count, dtype: int64

In [8]:
df["Albumin_and_Globulin_Ratio"].isna().any()

True

In [11]:
df = df.drop("Unnamed: 0", axis=1)

# build pipeline

In [15]:
X = df.drop("Liver_Problem", axis=1)
y = df["Liver_Problem"]

In [18]:
ct = ColumnTransformer(
    [
        ("impute_missing_values_on_Albumin_and_Globulin_Ratio", SimpleImputer(missing_values=np.nan, strategy="mean"), ["Albumin_and_Globulin_Ratio"]),
        ("LabelEncoder_for_gender", OneHotEncoder(), ["Gender"])
    ], remainder="passthrough"
                    )

In [19]:
X = ct.fit_transform(X)

# Split data

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [21]:
model = LogisticRegression()

In [22]:
model.fit(X_train, y_train)

In [23]:
pred = model.predict(X_test)

# check model accuracy

In [24]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("True Positive", tp)
print("True Negative", tn)
print("False Positive", fp)
print("False Negative", fn)

True Positive 7
True Negative 60
False Positive 6
False Negative 27


In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.67

# apply no never-seen data

In [48]:
df_test = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_new_testdataset.csv')

In [49]:
df_test.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,36,Male,2.8,1.5,305,28,76,5.9,2.5,0.7
1,42,Male,0.8,0.2,127,29,30,4.9,2.7,1.2
2,53,Male,19.8,10.4,238,39,221,8.1,2.5,0.4
3,32,Male,30.5,17.1,218,39,79,5.5,2.7,0.9
4,32,Male,32.6,14.1,219,95,235,5.8,3.1,1.1


In [52]:
df_test_ran = ct.fit_transform(df_test)

In [53]:
test_pred = model.predict(df_test_ran)

In [54]:
test_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2])