**Support Vector Machine (SVM)**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

import os
for dirname, _, filenames in os.walk('/content/healthcare_dataset.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Some utility functions

def load_data(path):
    print("Current Directory: ", os.getcwd())
    try:
        df = pd.read_csv(path)
        return df
    except FileNotFoundError:
        return "file not found"
    except FileExistsError:
        return "file does not exist"

def drop_cols(df, cols):
    '''
    Calls pd.drop() but checks if columns exist first so that we don't have to go back and reload the dataset everytime we edit this cell and rerun.

    Note: There is probably an argument you can pass in to df.drop() to achieve the same goal but I just wanted to do this for some reason
    '''
    if (len(list(set(df.columns).difference(cols))) != len(df.columns) - len(cols)):
        # reload data ourselves
        df = load_data('/content/healthcare_dataset.csv"')
    else:
        df = df.drop(cols, axis=1)
    return df

In [None]:
df = pd.read_csv("/content/healthcare_dataset.csv")
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal


In [None]:
# split into features and target
X = df.loc[:, 'Name':'Medication']
y = df['Test Results']
X.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol


In [None]:
# lets get rid of things that probably don't have any relationship with the target data
X = drop_cols(df, ['Name', 'Doctor', 'Hospital', 'Discharge Date', 'Date of Admission', 'Insurance Provider'])

# one hot encoding on the remaining categorial features
print("Original features:\n", list(X.columns),"\n")
X_encoded = pd.get_dummies(X.loc[:, 'Age':'Medication'])
print("one hot encoded features:\n", list(X_encoded.columns), "\n")

Original features:
 ['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Billing Amount', 'Room Number', 'Admission Type', 'Medication', 'Test Results'] 

one hot encoded features:
 ['Age', 'Billing Amount', 'Room Number', 'Gender_Female', 'Gender_Male', 'Blood Type_A+', 'Blood Type_A-', 'Blood Type_AB+', 'Blood Type_AB-', 'Blood Type_B+', 'Blood Type_B-', 'Blood Type_O+', 'Blood Type_O-', 'Medical Condition_Arthritis', 'Medical Condition_Asthma', 'Medical Condition_Cancer', 'Medical Condition_Diabetes', 'Medical Condition_Hypertension', 'Medical Condition_Obesity', 'Admission Type_Elective', 'Admission Type_Emergency', 'Admission Type_Urgent', 'Medication_Aspirin', 'Medication_Ibuprofen', 'Medication_Lipitor', 'Medication_Paracetamol', 'Medication_Penicillin'] 



In [None]:
# display encoded dataframe
X_encoded.head()

Unnamed: 0,Age,Billing Amount,Room Number,Gender_Female,Gender_Male,Blood Type_A+,Blood Type_A-,Blood Type_AB+,Blood Type_AB-,Blood Type_B+,...,Medical Condition_Hypertension,Medical Condition_Obesity,Admission Type_Elective,Admission Type_Emergency,Admission Type_Urgent,Medication_Aspirin,Medication_Ibuprofen,Medication_Lipitor,Medication_Paracetamol,Medication_Penicillin
0,81,37490.983364,146,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,35,47304.064845,404,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,61,36874.896997,292,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,49,23303.322092,480,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,51,18086.344184,477,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [None]:
# label encode the target data
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)
y_encoded

array([1, 2, 2, ..., 2, 2, 0])

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y_encoded, test_size=0.2, stratify=y)

In [None]:
# Support Vector Machine
'''
An SVM maps training examples to points in space to maximize gap between two categories. New examples are then mapped into the same space and predicted to belong to a category based on which side of the gap they fall
'''
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
# show accuracy
svm_accuracy = cross_val_score(svm_classifier, X_test, y_test, cv=8).mean()
print("cross validation score: ", svm_accuracy)

cross validation score:  0.3455


Fine Tuning

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Define grid parameters
parameter_grid = {'C': [0.5,1,10],
     'gamma': ['scale','auto'],
     'kernel': ['rbf']}

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_normalized = scaler.fit_transform(X_train)
x_test_normalized = scaler.transform(X_test)

In [None]:
# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=svm_classifier,param_grid=parameter_grid)

# Using normalized training data
grid_search.fit(x_train_normalized, y_train)

In [None]:
# Make predictions on test data
predictions = grid_search.predict(x_test_normalized)

# Print classification report by resolving zero_division
print("Classification Report:")
print(classification_report(y_test, predictions, zero_division=1))  # Mengatur zero_division menjadi 1

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.45      0.39       691
           1       0.36      0.31      0.33       656
           2       0.33      0.28      0.30       653

    accuracy                           0.35      2000
   macro avg       0.34      0.34      0.34      2000
weighted avg       0.35      0.35      0.34      2000

Confusion Matrix:
[[309 187 195]
 [281 203 172]
 [292 181 180]]


**Random Forest**

In [None]:
df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,James Hood,83,Male,A+,Obesity,2022-07-29,Samuel Moody,"Wood, Martin and Simmons",UnitedHealthcare,39606.840083,110,Elective,2022-08-02,Ibuprofen,Abnormal
9996,Stephanie Evans,47,Female,AB+,Arthritis,2022-01-06,Christopher Yates,Nash-Krueger,Blue Cross,5995.717488,244,Emergency,2022-01-29,Ibuprofen,Normal
9997,Christopher Martinez,54,Male,B-,Arthritis,2022-07-01,Robert Nicholson,Larson and Sons,Blue Cross,49559.202905,312,Elective,2022-07-15,Ibuprofen,Normal
9998,Amanda Duke,84,Male,A+,Arthritis,2020-02-06,Jamie Lewis,Wilson-Lyons,UnitedHealthcare,25236.344761,420,Urgent,2020-02-26,Penicillin,Normal


In [None]:
X=df.drop(['Test Results'],axis=1)
y=df['Test Results']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 26)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()
for col in df.columns:
    if col!='Age':
        df[col]=lc.fit_transform(df[col])
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,8837,81,0,7,3,1468,7167,7960,3,7439,45,0,1476,0,1
1,7736,35,1,6,1,1664,2597,978,4,9444,303,1,1672,2,2
2,1508,61,1,5,5,71,7180,7996,3,7329,191,1,96,2,2
3,721,49,1,5,1,548,1169,2482,3,4589,379,2,544,4,0
4,6782,51,1,7,0,980,2775,3908,4,3469,376,2,996,3,2


In [None]:
X,y=df.drop(['Test Results'],axis=1), df['Test Results']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
X_train.shape, y_test.shape

((8000, 14), (2000,))

In [None]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)

0.3195

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier()
random.fit(X_train,y_train)


# Tentukan parameter grid
parameter_grid = {
    'max_depth': [2, 4, 5, 8, 10, 15, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_normalized = scaler.fit_transform(X_train)
x_test_normalized = scaler.transform(X_test)

In [None]:
# Membuat objek GridSearchCV
grid_search = GridSearchCV(estimator=random,param_grid=parameter_grid)

# Menggunakan data latih yang sudah dinormalisasi
grid_search.fit(x_train_normalized, y_train)

In [None]:
# Lakukan prediksi pada data uji
predictions = grid_search.predict(x_test_normalized)

# Cetak classification report dengan mengatasi zero_division
print("Classification Report:")
print(classification_report(y_test, predictions, zero_division=1))  # Mengatur zero_division menjadi 1

# Cetak confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.95      0.49       661
           1       0.14      0.00      0.01       664
           2       0.34      0.03      0.06       675

    accuracy                           0.33      2000
   macro avg       0.27      0.33      0.19      2000
weighted avg       0.27      0.33      0.18      2000

Confusion Matrix:
[[631   5  25]
 [642   2  20]
 [645   7  23]]
