In [2]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [4]:
# Import necessary libraries
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load data
data_path = "/content/preprocessed_data.csv"
data = pd.read_csv(data_path)

# Step 1: Data quality checks
data = data.dropna(subset=['text', 'symptoms'])
data = data[(data['text'].str.strip() != '') & (data['symptoms'].str.strip() != '')]
data['text'] = data['text'].astype(str)
data['symptoms'] = data['symptoms'].astype(str)
data.reset_index(drop=True, inplace=True)

# Step 2: Tokenizer and label preparation
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Function to tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['text'], padding=True, truncation=True, max_length=128, return_tensors="pt")
    labels = []

    for i, label in enumerate(examples['symptoms']):
        label_list = label.split()  # Split symptoms into tokens
        label_ids = [1 if word in label_list else 0 for word in tokenizer.tokenize(examples['text'][i])]
        label_ids = [0] + label_ids[:len(tokenized_inputs['input_ids'][i]) - 2] + [0]  # Align to token length
        labels.append(label_ids + [0] * (len(tokenized_inputs['input_ids'][i]) - len(label_ids)))  # Pad labels

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization
dataset = Dataset.from_pandas(data)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Step 3: Define the model and training arguments
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Skips evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Step 4: Train the model
trainer.train()

# Step 5: Test prediction on a few examples
def extract_symptoms(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Collect symptoms based on predictions
    extracted_symptoms = [tokens[i] for i, pred in enumerate(predictions[0]) if pred == 1]
    return extracted_symptoms

# Test cases
test_sentences = [
    "I have a fever and sore throat.",
    "My asthma is causing me breathing problems and wheezing.",
    "I feel weak and have severe headaches."
]

for sentence in test_sentences:
    print(f"Input Sentence: {sentence}")
    symptoms = extract_symptoms(sentence)
    print(f"Extracted Symptoms: {symptoms}")




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1432 > 512). Running this sequence through the model will result in indexing errors
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Input Sentence: I have a fever and sore throat.
Extracted Symptoms: []
Input Sentence: My asthma is causing me breathing problems and wheezing.
Extracted Symptoms: ['causing', 'breathing', 'problems']
Input Sentence: I feel weak and have severe headaches.
Extracted Symptoms: ['weak', 'severe']
