## Python / Models / Hepatitis dataset

### Importing libraries

In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

### Importing and cleaning dataset

In [47]:
# Reading dataset into a pandas dataframe

col = ['Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY']
df = pd.read_csv('hepatitis.csv')
df.columns = col
df

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.90,135,42,3.5,?,1
1,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.70,96,32,4.0,?,1
2,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.70,46,52,4.0,80,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,200,4.0,?,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.90,95,28,4.0,75,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,1,46,1,2,2,1,1,1,2,2,2,1,1,1,7.60,?,242,3.3,50,2
150,2,44,1,2,2,1,2,2,2,1,2,2,2,2,0.90,126,142,4.3,?,2
151,2,61,1,1,2,1,1,2,1,1,2,1,2,2,0.80,75,20,4.1,?,2
152,2,53,2,1,2,1,2,2,2,2,1,1,2,1,1.50,81,19,4.1,48,2


In [48]:
# Checking for missing values

df.isna().sum()

Class              0
AGE                0
SEX                0
STEROID            0
ANTIVIRALS         0
FATIGUE            0
MALAISE            0
ANOREXIA           0
LIVER BIG          0
LIVER FIRM         0
SPLEEN PALPABLE    0
SPIDERS            0
ASCITES            0
VARICES            0
BILIRUBIN          0
ALK PHOSPHATE      0
SGOT               0
ALBUMIN            0
PROTIME            0
HISTOLOGY          0
dtype: int64

In [49]:
# Replacing '?' with NaN values and dropping the rows with NaN values

df.replace('?', np.nan, inplace = True)
df.dropna(inplace = True)

In [50]:
# Converting columns to numeric

df.apply(pd.to_numeric, errors = 'ignore')
df

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.90,95,28,4.0,75,1
9,2,39,1,1,1,2,2,2,1,1,2,2,2,2,1.30,78,30,4.4,85,1
10,2,32,1,2,1,1,2,2,2,1,2,1,2,2,1.00,59,249,3.7,54,1
11,2,41,1,2,1,1,2,2,2,1,2,2,2,2,0.90,81,60,3.9,52,1
12,2,30,1,2,2,1,2,2,2,1,2,2,2,2,2.20,57,144,4.9,78,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,2,45,1,2,1,2,2,2,2,2,2,2,2,2,1.30,85,44,4.2,85,2
142,1,49,1,1,2,1,1,2,2,2,1,1,2,2,1.40,85,70,3.5,35,2
144,2,31,1,1,2,1,2,2,2,2,2,2,2,2,1.20,75,173,4.2,54,2
152,2,53,2,1,2,1,2,2,2,2,1,1,2,1,1.50,81,19,4.1,48,2


In [51]:
# Data transformation

categorical_cols = ['Class', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'HISTOLOGY']
encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])

### Data analysis

In [52]:
# Creating two variables for analysis

x = df.drop('Class', axis = 1)
y = df['Class']

In [53]:
# Scaling data

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[-0.59437209, -0.3992747 ,  0.95118973, ...,  0.27284058,
         0.53638413, -0.83793058],
       [-0.14831424, -0.3992747 , -1.05131497, ...,  0.97131247,
         0.96592097, -0.83793058],
       [-0.77279523, -0.3992747 ,  0.95118973, ..., -0.25101333,
        -0.36564324, -0.83793058],
       ...,
       [-0.8620068 , -0.3992747 , -1.05131497, ...,  0.62207653,
        -0.36564324,  1.19341628],
       [ 1.10064775,  2.50454133, -1.05131497, ...,  0.44745855,
        -0.62336534,  1.19341628],
       [ 0.20853205, -0.3992747 ,  0.95118973, ..., -1.29872117,
        -0.88108745,  1.19341628]])

In [54]:
# Creating train and test sets

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 42)

In [55]:
# Logistic Regression

model1 = LogisticRegression()
model1.fit(x_train, y_train)
model1.predict(x_test)
print(accuracy_score(y_test, model1.predict(x_test)))

0.875


In [56]:
# Naive Bayes

model2 = GaussianNB()
model2.fit(x_train, y_train)
model2.predict(x_test)
print(accuracy_score(y_test, model2.predict(x_test)))

0.8125
