In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diseasetreatmentdata/data.json


In [3]:
import json

# Load JSON file
with open(os.path.join(dirname, filename), 'r') as f:
    data = json.load(f)

In [16]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Initialize lists to hold our sentences (intents), labels (diseases), and responses
sentences = []
labels = []
responses = []

# Fill lists with data from JSON
for intent in data["intents"]:
    for pattern in intent["symptoms"]:
        sentences.append(pattern)
        labels.append(intent["name"])
        responses.append(intent["response"])

# Load the ClinicalBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=len(set(labels)))
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenize our sentences
inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Encode our labels
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

# Split the data into training and validation datasets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(inputs.input_ids, labels, test_size=0.3)

# Create TensorDatasets for the training and validation sets
train_dataset = TensorDataset(train_inputs, inputs.attention_mask[:len(train_inputs)], torch.tensor(train_labels))
validation_dataset = TensorDataset(validation_inputs, inputs.attention_mask[len(train_inputs):], torch.tensor(validation_labels))

# Create DataLoaders for the training and validation sets
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
validation_dataloader = DataLoader(validation_dataset, sampler=SequentialSampler(validation_dataset), batch_size=32)

# Define our optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 200
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Training loop
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
model.eval()
total_eval_accuracy = 0
for batch in validation_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)
    with torch.no_grad():        
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    correct_predictions = predictions == labels
    total_eval_accuracy += correct_predictions.sum().item()

model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [17]:
import os
import pickle

# Save the encoder
with open(os.path.join(os.getcwd(), "label_encoder.pkl"), "wb") as f:
    pickle.dump(encoder, f)


In [24]:
import torch
from transformers import pipeline
import random

with open(os.path.join(os.getcwd(), "label_encoder.pkl"), "rb") as f:
    encoder = pickle.load(f)
# Confidence threshold
confidence_threshold = 0.062

# User's symptoms
user_symptoms = ""

# Create a pipeline for classification
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

while True:
    # User enters a symptom
    user_symptom = input("Please describe your symptom: ")

    # Add symptom to user's symptoms
    user_symptoms += " " + user_symptom

    # Predict the class of the symptoms
    result = classifier(user_symptoms)

    # Calculate the confidence of the prediction
    max_proba = result[0]['score']

    print(f"Current confidence level is: {max_proba}")

    if max_proba >= confidence_threshold:
        # Decode the class to get the original label (intent)
        class_index = int(result[0]['label'].split("_")[-1])

        # Get the disease name from the class index
        predicted_disease = encoder.inverse_transform([class_index])[0]

        # Output the response
        print(f"You may be suffering from: {predicted_disease}")
        break
    else:
        common_symptoms_response = ["Your symptoms are quite common, could you please provide more details or any other symptom?", 
                                    "That is a very common symptom, provide me with another symptom you are facing",
                                    "That's a quite common symptom for many diseases, provide me with any another symptom you are having?"]
        print("Chatbot: ", random.choice(common_symptoms_response))


Please describe your symptom:  Muscle weakness or spasms",         "Blurred or double vision


Current confidence level is: 0.04914657026529312
Chatbot:  That is a very common symptom, provide me with another symptom you are facing


Please describe your symptom:  "Fatigue",         "Numbness or tingling in the limbs


Current confidence level is: 0.09269026666879654
You may be suffering from: Sciatica


In [None]:
pip install --upgrade transformers

In [None]:
pip install --upgrade accelerate