## Hepatitis data set

## Decision Tree Classifier

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.impute import SimpleImputer
import random
import numpy as np
np.random.seed(0)

dataset_files = ['hepatitis2_outlier_iqr_median.csv',
                 'hepatitis4_outlier_stdmean_median.csv',
                 'hepatitis5_outlier_percentile_median.csv',
                 'hepatitis6_outlier_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)
    X = df.drop('class', axis=1)  
    y = df['class']

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    #print(f"\nDecision Tree Classifier:")
    print(f"{dataset_file}")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1-Score: {f1:.2f}\n')


hepatitis2_outlier_iqr_median.csv
[[ 3  6]
 [ 7 31]]
              precision    recall  f1-score   support

           1       0.30      0.33      0.32         9
           2       0.84      0.82      0.83        38

    accuracy                           0.72        47
   macro avg       0.57      0.57      0.57        47
weighted avg       0.73      0.72      0.73        47

Accuracy: 0.72
F1-Score: 0.32

hepatitis4_outlier_stdmean_median.csv
[[ 1  8]
 [ 6 32]]
              precision    recall  f1-score   support

           1       0.14      0.11      0.12         9
           2       0.80      0.84      0.82        38

    accuracy                           0.70        47
   macro avg       0.47      0.48      0.47        47
weighted avg       0.67      0.70      0.69        47

Accuracy: 0.70
F1-Score: 0.12

hepatitis5_outlier_percentile_median.csv
[[ 1  8]
 [ 6 32]]
              precision    recall  f1-score   support

           1       0.14      0.11      0.12         9
     

## Random Forest Classifier

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.impute import SimpleImputer
import random
import numpy as np
np.random.seed(0)

dataset_files = ['hepatitis2_outlier_iqr_median.csv',
                 'hepatitis4_outlier_stdmean_median.csv',
                 'hepatitis5_outlier_percentile_median.csv',
                 'hepatitis6_outlier_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('class', axis=1)  
    y = df['class']

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    model = RandomForestClassifier()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nRandom Forest Classifier:")
    print(f"{dataset_file}")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1-Score: {f1:.2f}\n')



Random Forest Classifier:
hepatitis2_outlier_iqr_median.csv
[[ 2  7]
 [ 5 33]]
              precision    recall  f1-score   support

           1       0.29      0.22      0.25         9
           2       0.82      0.87      0.85        38

    accuracy                           0.74        47
   macro avg       0.56      0.55      0.55        47
weighted avg       0.72      0.74      0.73        47

Accuracy: 0.74
F1-Score: 0.25


Random Forest Classifier:
hepatitis4_outlier_stdmean_median.csv
[[ 1  8]
 [ 2 36]]
              precision    recall  f1-score   support

           1       0.33      0.11      0.17         9
           2       0.82      0.95      0.88        38

    accuracy                           0.79        47
   macro avg       0.58      0.53      0.52        47
weighted avg       0.73      0.79      0.74        47

Accuracy: 0.79
F1-Score: 0.17


Random Forest Classifier:
hepatitis5_outlier_percentile_median.csv
[[ 1  8]
 [ 3 35]]
              precision    recall

## Logistic Regression

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import random
import numpy as np
np.random.seed(0)
import warnings
warnings.filterwarnings("ignore")

dataset_files = ['hepatitis2_outlier_iqr_median.csv',
                 'hepatitis4_outlier_stdmean_median.csv',
                 'hepatitis5_outlier_percentile_median.csv',
                 'hepatitis6_outlier_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('class', axis=1)  
    y = df['class']

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
     
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    model = LogisticRegression()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nLogistic Regression:")
    print(f"{dataset_file}")
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1-Score: {f1:.2f}\n')
    


Logistic Regression:
hepatitis2_outlier_iqr_median.csv
[[ 4  5]
 [ 4 34]]
              precision    recall  f1-score   support

           1       0.50      0.44      0.47         9
           2       0.87      0.89      0.88        38

    accuracy                           0.81        47
   macro avg       0.69      0.67      0.68        47
weighted avg       0.80      0.81      0.80        47

Accuracy: 0.81
F1-Score: 0.47


Logistic Regression:
hepatitis4_outlier_stdmean_median.csv
[[ 2  7]
 [ 3 35]]
              precision    recall  f1-score   support

           1       0.40      0.22      0.29         9
           2       0.83      0.92      0.88        38

    accuracy                           0.79        47
   macro avg       0.62      0.57      0.58        47
weighted avg       0.75      0.79      0.76        47

Accuracy: 0.79
F1-Score: 0.29


Logistic Regression:
hepatitis5_outlier_percentile_median.csv
[[ 2  7]
 [ 3 35]]
              precision    recall  f1-score   su