In [1]:
import sklearn
import pandas as pd

df = pd.read_csv("liver.csv")
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,2221,C,Placebo,18499,F,N,Y,N,N,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1
1,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2
2,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2
3,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2
4,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1


In [2]:
from sklearn.preprocessing import OrdinalEncoder, normalize
from sklearn.model_selection import train_test_split

pd.set_option('future.no_silent_downcasting', True)

# Split the data into features (X) and target (y)
X = df.drop(['Stage'], axis=1)
enc_columns = ['Status', 'Sex', 'Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
enc_categories = [
    ['C', 'CL', 'D'],
    ['M', 'F'],
    ['Placebo', 'D-penicillamine'],
    ['N', 'Y'],
    ['N', 'Y'],
    ['N', 'Y'],
    ['N', 'S', 'Y'],
]
encoder = OrdinalEncoder(categories=enc_categories)
X[enc_columns] = encoder.fit_transform(X[enc_columns])
X

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,2221,0.0,0.0,18499,1.0,0.0,1.0,0.0,0.0,0.5,149.000000,4.04,227.0,598.0,52.70,57.000000,256.0,9.9
1,1230,0.0,0.0,19724,0.0,1.0,0.0,1.0,0.0,0.5,219.000000,3.93,22.0,663.0,45.00,75.000000,220.0,10.8
2,4184,0.0,0.0,11839,1.0,0.0,0.0,0.0,0.0,0.5,320.000000,3.54,51.0,1243.0,122.45,80.000000,225.0,10.0
3,2090,2.0,0.0,16467,1.0,0.0,0.0,0.0,0.0,0.7,255.000000,3.74,23.0,1024.0,77.50,58.000000,151.0,10.2
4,2105,2.0,0.0,21699,1.0,0.0,1.0,0.0,0.0,1.9,486.000000,3.54,74.0,1052.0,108.50,109.000000,151.0,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,3584,2.0,1.0,23612,1.0,0.0,0.0,0.0,0.0,0.8,231.000000,3.87,173.0,9009.8,127.71,96.000000,295.0,11.0
24996,3584,2.0,1.0,23612,1.0,0.0,0.0,0.0,0.0,0.8,231.000000,3.87,173.0,9009.8,127.71,96.000000,295.0,11.0
24997,971,2.0,1.0,16736,1.0,0.0,1.0,1.0,2.0,5.1,369.510563,3.23,18.0,790.0,179.80,124.702128,104.0,13.0
24998,3707,0.0,1.0,16990,1.0,0.0,1.0,0.0,0.0,0.8,315.000000,4.24,13.0,1637.0,170.50,70.000000,426.0,10.9


In [3]:
X = normalize(X)
y = df['Stage']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Step 1: Initialise DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=2)

# Step 2: Fit model
clf.fit(X_train, y_train)


In [5]:
from sklearn.metrics import accuracy_score

# Step 3: Predict results
predictions = clf.predict(X_test)

# Step 4: Evaluate model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy", accuracy)

Accuracy 0.8888


In [6]:
from sklearn.metrics import f1_score

# Step 4: Evaluate model
accuracy = f1_score(y_test, predictions, average="macro")
print("Macro F1", accuracy)

Macro F1 0.8889162921368445


In [7]:
from sklearn.metrics import f1_score

# Step 4: Evaluate model
accuracy = f1_score(y_test, predictions, average="micro")
print("Micro F1", accuracy)

Micro F1 0.8888
