In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be read by path in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Read the liver dataset CSV from the mounted Drive folder into a DataFrame.
file_path = '/content/drive/My Drive/liver_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.1,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.0,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.0,1
582,38,Male,1.0,0.3,216,21,24,7.3,4.4,1.5,2


In [None]:
# List the column labels of the loaded frame.
print(df.columns)

Index(['age', 'gender', 'tot_bilirubin', 'direct_bilirubin', 'tot_proteins',
       'albumin', 'ag_ratio', 'sgpt', 'sgot', 'alkphos', 'is_patient'],
      dtype='object')


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(583, 11)

In [None]:
# Per-column dtype and non-null count; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               583 non-null    int64  
 1   gender            583 non-null    object 
 2   tot_bilirubin     583 non-null    float64
 3   direct_bilirubin  583 non-null    float64
 4   tot_proteins      583 non-null    int64  
 5   albumin           583 non-null    int64  
 6   ag_ratio          583 non-null    int64  
 7   sgpt              583 non-null    float64
 8   sgot              583 non-null    float64
 9   alkphos           579 non-null    float64
 10  is_patient        583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [None]:
# Count missing values in each column (sum of the boolean null mask down the rows).
df.isnull().sum(axis=0)

Unnamed: 0,0
age,0
gender,0
tot_bilirubin,0
direct_bilirubin,0
tot_proteins,0
albumin,0
ag_ratio,0
sgpt,0
sgot,0
alkphos,4


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [None]:
# Class balance of the target column: number of rows per `is_patient` label.
target_counts = df['is_patient'].value_counts()
target_counts

Unnamed: 0_level_0,count
is_patient,Unnamed: 1_level_1
1,416
2,167


In [None]:
# Separate the label column (`is_patient`) from the predictor columns.
y = df['is_patient']
X = df.drop(columns=['is_patient'])

In [None]:
# Preview the feature frame. Showing only the first rows (rich display) instead of
# printing the whole frame keeps the output short and readable.
X.head()

     age  gender  tot_bilirubin  direct_bilirubin  tot_proteins  albumin  \
0     65  Female            0.7               0.1           187       16   
1     62    Male           10.9               5.5           699       64   
2     62    Male            7.3               4.1           490       60   
3     58    Male            1.0               0.4           182       14   
4     72    Male            3.9               2.0           195       27   
..   ...     ...            ...               ...           ...      ...   
578   60    Male            0.5               0.1           500       20   
579   40    Male            0.6               0.1            98       35   
580   52    Male            0.8               0.2           245       48   
581   31    Male            1.3               0.5           184       29   
582   38    Male            1.0               0.3           216       21   

     ag_ratio  sgpt  sgot  alkphos  
0          18   6.8   3.3     0.90  
1         100

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so the class ratio of the held-out set
# can differ from the full data — consider `stratify=y` if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Sanity-check the split: shapes of the full, training and test feature sets.
split_shapes = [frame.shape for frame in (X, X_train, X_test)]
print(*split_shapes)

(583, 10) (466, 10) (117, 10)


In [None]:
# Class balance of the held-out labels.
test_class_counts = y_test.value_counts()
test_class_counts

Unnamed: 0_level_0,count
is_patient,Unnamed: 1_level_1
1,78
2,39


In [None]:
# Columns handled by each preprocessing branch.
numeric_columns = ['age', 'tot_bilirubin', 'direct_bilirubin', 'alkphos', 'sgpt',
                   'sgot', 'tot_proteins', 'albumin', 'ag_ratio']
categorical_columns = ['gender']

# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Combine the numeric branch with one-hot encoding of `gender`; any column not
# listed above is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_columns),
        ('encoder', OneHotEncoder(), categorical_columns),
    ],
    remainder='passthrough',
)

# Fit on the training rows only, then apply the fitted transform to the test rows.
# NOTE(review): X_train / X_test are rebound to the transformed output here, so
# re-run the split cell before re-running this one.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [None]:
# Fit a logistic-regression classifier (default settings) on the preprocessed
# training data; `fit` returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)
print(y_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1
 2 1 1 1 1 1]


In [None]:
# accuracy on training data
# accuracy_score expects (y_true, y_pred): the true labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7424892703862661


In [None]:
# accuracy on test data
# accuracy_score expects (y_true, y_pred): the true labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7008547008547008


In [None]:
# Score one hand-entered patient record with the fitted preprocessing and model.
# NOTE(review): the values are assumed to follow the column order declared in the
# ColumnTransformer (nine numeric features, then gender) — confirm before relying on it.
input_data = (45,2.1,3.3,4.1,2.1,3.2,345,45,12,'Female')
input_columns = ['age', 'tot_bilirubin', 'direct_bilirubin', 'alkphos', 'sgpt', 'sgot',
                 'tot_proteins', 'albumin', 'ag_ratio', 'gender']

# Build a one-row DataFrame with named columns: `ct` selects its inputs by column
# name, and a DataFrame keeps the numeric values numeric (converting the mixed tuple
# to a single NumPy array would coerce every value to a string).
input_df = pd.DataFrame([input_data], columns=input_columns)
input_data_features = ct.transform(input_df)

# Meaning of the `is_patient` labels.
# NOTE(review): per the UCI ILPD description, 1 marks a liver patient (416 records)
# and 2 a non-patient (167 records), matching the class counts of this data — confirm
# for this CSV.
LIVER_DISEASE_LABEL = 1
NO_LIVER_DISEASE_LABEL = 2

# predict class probabilities; the columns of predict_proba follow model.classes_,
# so look each probability up by label rather than by position
prediction_probability = model.predict_proba(input_data_features)
probability_by_label = dict(zip(model.classes_, prediction_probability[0]))
print(f"Probability of no liver disease: {probability_by_label[NO_LIVER_DISEASE_LABEL]}")
print(f"Probability of liver disease: {probability_by_label[LIVER_DISEASE_LABEL]}")

# making prediction
prediction = model.predict(input_data_features)
if prediction[0] == LIVER_DISEASE_LABEL:
  print('At risk of liver disease')
else:
  print('no risk of liver disease')

Probability of no liver disease: 0.7773007407587973
Probability of liver disease: 0.22269925924120273
no risk of liver disease
