In [1]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer, pipeline
from datasets import load_dataset



In [2]:
bio_asq = load_dataset("nehal69/bioAsq_Extractive_QA", field ="data", split="train")
bio_asq = bio_asq.train_test_split(test_size=0.2)
#squad = load_dataset("squad", split="train[:5000]")
#squad = squad.train_test_split(test_size=0.5)

Downloading and preparing dataset json/nehal69--bioAsq_Extractive_QA to /root/.cache/huggingface/datasets/json/nehal69--bioAsq_Extractive_QA-b396b9bd63f6af94/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.16M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/nehal69--bioAsq_Extractive_QA-b396b9bd63f6af94/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


In [3]:
bio_asq

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answers', 'context'],
        num_rows: 2612
    })
    test: Dataset({
        features: ['id', 'question', 'answers', 'context'],
        num_rows: 654
    })
})

In [19]:
import pandas as pd 
# Convert the dataset to a dictionary
data_dict = bio_asq["train"].to_dict()
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(data_dict)

In [22]:
df['question'][0]

'What is the characteristic feature of the Dyke-Davidoff-Masson syndrome.'

In [23]:
df['context'][0]

'Spontaneous resolution of invasive cerebral aspergillosis following partial resection in a medically untreated infant. Invasive craniocerebral aspergillosis, often encountered in an immunocompromised setting, is almost uniformly fatal despite radical surgical and medical management, and is frequently a necropsy finding. The authors report a unique, self-resolving clinical course of this aggressive infection in a 10-month-old infant. The infant was brought to the emergency services in altered sensorium with a 1-week history of left-sided hemiparesis, excessive irritability, and vomiting. An MRI study of the brain revealed multiple, heterogeneously enhancing lesions in the right cerebral hemisphere with mass effect. The largest lesion in the frontotemporal cortical and subcortical regions was decompressed on an emergent basis. Histopathological findings were suggestive of invasive aspergillosis, although there was no evidence of the infection in the lungs or paranasal sinuses. Computed 

In [21]:
df['answers'][0]

[{'answer_start': 1367, 'text': 'cerebral hemiatrophy'}]

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

questions = [q.strip() for q in df["question"]]
context = [q.strip() for q in df["context"]]
inputs = tokenizer(
        questions,
        context,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

offset_mapping = inputs.pop("offset_mapping")

start_positions = []
end_positions = []
answers = df['answers']
for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer[0]["answer_start"]
    end_char = answer[0]["answer_start"] + len(answer[0]["text"])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1
    
    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

df["start_positions"] = start_positions
df["end_positions"] = end_positions

import pandas as pd
from datasets import Dataset
data = {'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'start_positions':start_positions,
        'end_positions': end_positions,
       }
df = pd.DataFrame(data)
df.to_csv('encoding_train.csv',index=False)
train = Dataset.from_pandas(df)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
import pandas as pd 
# Convert the dataset to a dictionary
data_dict = bio_asq["test"].to_dict()
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(data_dict)

questions = [q.strip() for q in df["question"]]
context = [q.strip() for q in df["context"]]
inputs = tokenizer(
        questions,
        context,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

offset_mapping = inputs.pop("offset_mapping")

start_positions = []
end_positions = []
answers = df['answers']
for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer[0]["answer_start"]
    end_char = answer[0]["answer_start"] + len(answer[0]["text"])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

df["start_positions"] = start_positions
df["end_positions"] = end_positions

data = {'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'start_positions':start_positions,
        'end_positions': end_positions,
       }
df = pd.DataFrame(data)
df.to_csv('encoding_test.csv',index=False)
test = Dataset.from_pandas(df)

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator
from datasets import load_metric
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



#model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")

data_collator = DefaultDataCollator()


# Your existing code to load model, data collator, etc.

training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids

    # Convert tuples to numpy arrays
    predictions = np.array(predictions)
    labels = np.array(labels)

    # Assuming your model outputs logits and you want to get predictions
    predictions = np.argmax(predictions, axis=2)
    labels = labels

    # Flatten the predictions and labels
    predictions = predictions.flatten()
    labels = labels.flatten()

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,3.315479,0.256881,0.188789,0.209095,0.177468
2,No log,2.447847,0.366972,0.263327,0.27933,0.24567
3,No log,2.116205,0.406728,0.305853,0.316384,0.281938
4,3.370500,1.916429,0.42737,0.321737,0.33512,0.296868
5,3.370500,1.826372,0.462538,0.373815,0.359657,0.335913
6,3.370500,1.786774,0.455657,0.357752,0.35668,0.325194
7,1.733500,1.741898,0.468654,0.385554,0.381498,0.352122
8,1.733500,1.760166,0.461009,0.367954,0.361999,0.337448
9,1.733500,1.71619,0.477064,0.403387,0.37891,0.358175
10,1.306900,1.729812,0.472477,0.381199,0.375074,0.346747


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=3280, training_loss=1.52304488856618, metrics={'train_runtime': 2238.0314, 'train_samples_per_second': 23.342, 'train_steps_per_second': 1.466, 'total_flos': 1.023760692891648e+16, 'train_loss': 1.52304488856618, 'epoch': 20.0})

In [10]:
results = trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
results

{'eval_loss': 1.919829249382019,
 'eval_accuracy': 0.45489296636085624,
 'eval_precision': 0.369345620469896,
 'eval_recall': 0.35578363286316134,
 'eval_f1': 0.33308964409539144,
 'eval_runtime': 8.7223,
 'eval_samples_per_second': 74.98,
 'eval_steps_per_second': 4.701,
 'epoch': 20.0}

In [12]:
# Save the trained model
model.save_pretrained("./qa_model")

# Save the tokenizer
tokenizer.save_pretrained("./qa_model")

('./qa_model/tokenizer_config.json',
 './qa_model/special_tokens_map.json',
 './qa_model/vocab.txt',
 './qa_model/added_tokens.json',
 './qa_model/tokenizer.json')

In [27]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Path to the directory where your trained RoBERTa model and tokenizer are saved
model_ = AutoModelForQuestionAnswering.from_pretrained("./qa_model")
tokenizer_ = AutoTokenizer.from_pretrained("./qa_model")

# Example usage for inference
question = "What is the characteristic feature of the Dyke-Davidoff-Masson syndrome."
context = "Spontaneous resolution of invasive cerebral aspergillosis following partial resection in a medically untreated infant. Invasive craniocerebral aspergillosis, often encountered in an immunocompromised setting, is almost uniformly fatal despite radical surgical and medical management, and is frequently a necropsy finding. The authors report a unique, self-resolving clinical course of this aggressive infection in a 10-month-old infant. The infant was brought to the emergency services in altered sensorium with a 1-week history of left-sided hemiparesis, excessive irritability, and vomiting. An MRI study of the brain revealed multiple, heterogeneously enhancing lesions in the right cerebral hemisphere with mass effect. The largest lesion in the frontotemporal cortical and subcortical regions was decompressed on an emergent basis. Histopathological findings were suggestive of invasive aspergillosis, although there was no evidence of the infection in the lungs or paranasal sinuses. Computed tomography-guided aspiration of the remaining lesions and follow-up antifungal therapy were recommended. The parents, however, requested discharge without further treatment. The child was seen at a follow-up visit 3 years later without having received any antifungal treatment. Imaging showed resolution of the infection and features of Dyke-Davidoff-Masson syndrome (cerebral hemiatrophy). This report of invasive cerebral aspergillosis resolving without medical therapy is the first of its kind. Its clinicoradiological aspects are discussed in light of previously reported cases."

# Tokenize the input
inputs = tokenizer(question, context, return_tensors="pt")

# Print tokenizer outputs
print("Input IDs:", inputs["input_ids"])
print("Token Type IDs:", inputs["token_type_ids"])
print("Attention Mask:", inputs["attention_mask"])

# Get model predictions
if "token_type_ids" in inputs:
    inputs.pop("token_type_ids")

# Pass inputs to the model
with torch.no_grad():
    outputs = model_(**inputs)

# Extract the answer
answer_start = torch.argmax(outputs.start_logits)  # Index of the start token
answer_end = torch.argmax(outputs.end_logits) + 1  # Index of the end token
answer_tokens = inputs["input_ids"][0][answer_start:answer_end]  # Extract token IDs
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)  # Convert to string

print(f"Question: {question}")
print(f"Answer: {answer}")

Input IDs: tensor([[  101,  2054,  2003,  1996,  8281,  3444,  1997,  1996, 22212,  1011,
          2585,  7245,  1011,  3742,  2239,  8715,  1012,   102, 17630,  5813,
          1997, 17503, 18439,  2004,  4842, 19791, 12650,  2206,  7704, 24501,
         18491,  1999,  1037,  2966,  2135,  4895,  7913,  4383, 10527,  1012,
         17503, 13675,  7088, 10085,  7869, 10024,  2140,  2004,  4842, 19791,
         12650,  1010,  2411,  8567,  1999,  2019, 10047, 23041, 24163,  8737,
         21716,  5084,  4292,  1010,  2003,  2471, 27423, 10611,  2750,  7490,
         11707,  1998,  2966,  2968,  1010,  1998,  2003,  4703,  1037, 26785,
         18981,  6508,  4531,  1012,  1996,  6048,  3189,  1037,  4310,  1010,
          2969,  1011, 29304,  6612,  2607,  1997,  2023,  9376,  8985,  1999,
          1037,  2184,  1011,  3204,  1011,  2214, 10527,  1012,  1996, 10527,
          2001,  2716,  2000,  1996,  5057,  2578,  1999,  8776, 13617,  5007,
          2007,  1037,  1015,  1011,  273

In [29]:
!zip -r medical_chatbot.zip /kaggle/working/qa_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/qa_model/ (stored 0%)
  adding: kaggle/working/qa_model/checkpoint-1500/ (stored 0%)
  adding: kaggle/working/qa_model/checkpoint-1500/training_args.bin (deflated 49%)
  adding: kaggle/working/qa_model/checkpoint-1500/tokenizer.json (deflated 71%)
  adding: kaggle/working/qa_model/checkpoint-1500/rng_state.pth (deflated 28%)
  adding: kaggle/working/qa_model/checkpoint-1500/vocab.txt (deflated 53%)
  adding: kaggle/working/qa_model/checkpoint-1500/scheduler.pt (deflated 49%)
  adding: kaggle/working/qa_model/checkpoint-1500/trainer_state.json (deflated 73%)
  adding: kaggle/working/qa_model/checkpoint-1500/tokenizer_config.json (deflated 43%)
  adding: kaggle/working/qa_model/checkpo

In [30]:
from IPython.display import FileLink 
FileLink(r'medical_chatbot.zip')

In [31]:
data = pd.read_csv("/kaggle/input/symptoms-dataset/Training (1).csv")

In [78]:
np.unique(data.columns)

array(['abdominal_pain', 'abnormal_menstruation', 'acidity',
       'acute_liver_failure', 'altered_sensorium', 'anxiety', 'back_pain',
       'belly_pain', 'blackheads', 'bladder_discomfort', 'blister',
       'blood_in_sputum', 'bloody_stool', 'blurred_and_distorted_vision',
       'breathlessness', 'brittle_nails', 'bruising',
       'burning_micturition', 'chest_pain', 'chills',
       'cold_hands_and_feets', 'coma', 'congestion', 'constipation',
       'continuous_feel_of_urine', 'continuous_sneezing', 'cough',
       'cramps', 'dark_urine', 'dehydration', 'depression', 'diarrhoea',
       'dischromic _patches', 'distention_of_abdomen', 'dizziness',
       'drying_and_tingling_lips', 'enlarged_thyroid', 'excessive_hunger',
       'extra_marital_contacts', 'family_history', 'fast_heart_rate',
       'fatigue', 'fluid_overload', 'fluid_overload.1',
       'foul_smell_of urine', 'headache', 'high_fever', 'hip_joint_pain',
       'history_of_alcohol_consumption', 'increased_appetite',

In [33]:
df = pd.DataFrame(data)

In [34]:


cols = df.columns


In [37]:

x = df[cols]
y = df['prognosis']

In [39]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


In [57]:
# Clean the labels in the training data
df['prognosis'] = df['prognosis'].str.strip()

# Clean the labels in the test data
test_data['prognosis'] = test_data['prognosis'].str.strip()

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

# Assuming df is your DataFrame
cols = df.columns[:-1]  # All columns except the last one (prognosis)
x = df[cols]
y = df['prognosis']

# Convert categorical labels to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [56]:
np.unique(y)

array(['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
       'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma',
       'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis',
       'Common Cold', 'Dengue', 'Diabetes ',
       'Dimorphic hemmorhoids(piles)', 'Drug Reaction',
       'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia',
       'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine',
       'Osteoarthristis', 'Paralysis (brain hemorrhage)',
       'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
       'Typhoid', 'Urinary tract infection', 'Varicose veins',
       'hepatitis A'], dtype=object)

In [45]:

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.33, random_state=42)

In [62]:

mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

In [63]:
accuracy = mnb.score(x_test, y_test)
print(f"Accuracy: {accuracy}")

# Cross-validation
print("Cross result========")
scores = model_selection.cross_val_score(mnb, x_test, y_test, cv=3)
print(scores)
print(scores.mean())



Accuracy: 1.0
[1. 1. 1.]
1.0


In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib  # For saving models

In [69]:
from sklearn.model_selection import cross_val_score

In [70]:
# Perform cross-validation for Decision Tree
dt = DecisionTreeClassifier(min_samples_split=20, random_state=42)
dt_scores = cross_val_score(dt, x, y_encoded, cv=5, scoring='accuracy')  # 5-fold cross-validation

# Print cross-validation results
print("Decision Tree Cross-Validation Scores:", dt_scores)
print("Decision Tree Mean Accuracy:", dt_scores.mean())

Decision Tree Cross-Validation Scores: [0.97865854 0.97865854 0.99186992 1.         1.        ]
Decision Tree Mean Accuracy: 0.9898373983739838


In [71]:
# Perform cross-validation for SVM
svm = SVC(kernel='linear', random_state=42)  # You can change the kernel to 'rbf' or 'poly'
svm_scores = cross_val_score(svm, x, y_encoded, cv=5, scoring='accuracy')  # 5-fold cross-validation

# Print cross-validation results
print("SVM Cross-Validation Scores:", svm_scores)
print("SVM Mean Accuracy:", svm_scores.mean())

SVM Cross-Validation Scores: [1. 1. 1. 1. 1.]
SVM Mean Accuracy: 1.0


In [72]:
# Perform cross-validation for Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = cross_val_score(rf, x, y_encoded, cv=5, scoring='accuracy')  # 5-fold cross-validation

# Print cross-validation results
print("Random Forest Cross-Validation Scores:", rf_scores)
print("Random Forest Mean Accuracy:", rf_scores.mean())

Random Forest Cross-Validation Scores: [1. 1. 1. 1. 1.]
Random Forest Mean Accuracy: 1.0


In [73]:
# Train and save Decision Tree
dt.fit(x, y_encoded)
joblib.dump(dt, 'decision_tree_model.pkl')

# Train and save SVM
svm.fit(x, y_encoded)
joblib.dump(svm, 'svm_model.pkl')

# Train and save Random Forest
rf.fit(x, y_encoded)
joblib.dump(rf, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [74]:
# Load the models
dt_loaded = joblib.load('decision_tree_model.pkl')
svm_loaded = joblib.load('svm_model.pkl')
rf_loaded = joblib.load('random_forest_model.pkl')

# Make predictions
y_pred_dt = dt_loaded.predict(x_test)
y_pred_svm = svm_loaded.predict(x_test)
y_pred_rf = rf_loaded.predict(x_test)

# Evaluate loaded models
print("Decision Tree Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print("SVM Test Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))

Decision Tree Test Accuracy: 0.9932266009852216
SVM Test Accuracy: 1.0
Random Forest Test Accuracy: 1.0


In [75]:
# Load the models
dt_loaded = joblib.load('decision_tree_model.pkl')
svm_loaded = joblib.load('svm_model.pkl')
rf_loaded = joblib.load('random_forest_model.pkl')

# Make predictions
y_pred_dt = dt_loaded.predict(x_test)
y_pred_svm = svm_loaded.predict(x_test)
y_pred_rf = rf_loaded.predict(x_test)

# Evaluate loaded models
print("Decision Tree Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print("SVM Test Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))

Decision Tree Test Accuracy: 0.9932266009852216
SVM Test Accuracy: 1.0
Random Forest Test Accuracy: 1.0


In [76]:
import joblib

# Save the LabelEncoder
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']