In [1]:
pip install transformers torch pandas sklearn datasets


Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-le

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import torch
import os
import joblib
from sklearn.metrics import accuracy_score

In [12]:
df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')
len(df)

161297

In [3]:
# Load and prepare data
df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t', nrows = 150000)
df = df[df['condition'].isin(['Birth Control', 'Depression', 'High Blood Pressure', 'Diabetes, Type 2'])]
df = df.dropna(subset=['review'])  # Ensuring no null reviews

# Preprocess text data
def preprocess_text(text):
    return text.str.replace('<[^<]+?>', '')  # Remove HTML tags

df['cleaned_text'] = preprocess_text(df['review'])

# Prepare labels
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['condition'])
labels = df['encoded_labels']

# Split data first
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], labels, test_size=0.2, random_state=42)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=256)

# Create torch datasets
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Model setup
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)
# Prepare to capture training and validation loss
training_loss_set = []
validation_loss_set = []

# Callback to Hugging Face Trainer to capture training loss after each logging step
class LossLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            training_loss_set.append(logs['loss'])
        if 'eval_loss' in logs:
            validation_loss_set.append(logs['eval_loss'])
# Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

# Initialize Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(f"Test Loss: {results['eval_loss']}, Test Accuracy: {results['eval_accuracy']}")


  return text.str.replace('<[^<]+?>', '')  # Remove HTML tags
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bia

  0%|          | 0/9950 [00:00<?, ?it/s]

{'loss': 1.4026, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 1.3789, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 1.321, 'learning_rate': 3e-06, 'epoch': 0.02}
{'loss': 1.2319, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 1.1078, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 0.963, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 0.9155, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.8322, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.8445, 'learning_rate': 9e-06, 'epoch': 0.05}
{'loss': 0.7102, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 0.6545, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.06}
{'loss': 0.5757, 'learning_rate': 1.2e-05, 'epoch': 0.06}
{'loss': 0.5467, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.07}
{'loss': 0.4704, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.07}
{'loss': 0.5061, 'learning_rate': 1.5e-05, 'epoch': 0.08}
{'loss': 0.4095, 'learnin

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.09325584024190903, 'eval_accuracy': 0.9766272932897713, 'eval_runtime': 2160.0997, 'eval_samples_per_second': 3.684, 'eval_steps_per_second': 0.058, 'epoch': 1.0}
{'loss': 0.0322, 'learning_rate': 4.2063492063492065e-05, 'epoch': 1.01}
{'loss': 0.0113, 'learning_rate': 4.2010582010582014e-05, 'epoch': 1.01}
{'loss': 0.0946, 'learning_rate': 4.1957671957671964e-05, 'epoch': 1.02}
{'loss': 0.142, 'learning_rate': 4.190476190476191e-05, 'epoch': 1.02}
{'loss': 0.0734, 'learning_rate': 4.185185185185185e-05, 'epoch': 1.03}
{'loss': 0.0378, 'learning_rate': 4.17989417989418e-05, 'epoch': 1.03}
{'loss': 0.1043, 'learning_rate': 4.174603174603175e-05, 'epoch': 1.04}
{'loss': 0.0868, 'learning_rate': 4.1693121693121694e-05, 'epoch': 1.04}
{'loss': 0.0935, 'learning_rate': 4.1640211640211644e-05, 'epoch': 1.05}
{'loss': 0.0926, 'learning_rate': 4.1587301587301594e-05, 'epoch': 1.05}
{'loss': 0.003, 'learning_rate': 4.153439153439154e-05, 'epoch': 1.06}
{'loss': 0.0649, 'learning

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.07694826275110245, 'eval_accuracy': 0.9832872581050516, 'eval_runtime': 382.1057, 'eval_samples_per_second': 20.827, 'eval_steps_per_second': 0.327, 'epoch': 2.0}
{'loss': 0.0232, 'learning_rate': 3.153439153439154e-05, 'epoch': 2.01}
{'loss': 0.0114, 'learning_rate': 3.148148148148148e-05, 'epoch': 2.01}
{'loss': 0.0415, 'learning_rate': 3.142857142857143e-05, 'epoch': 2.02}
{'loss': 0.0063, 'learning_rate': 3.1375661375661374e-05, 'epoch': 2.02}
{'loss': 0.0004, 'learning_rate': 3.1322751322751324e-05, 'epoch': 2.03}
{'loss': 0.0029, 'learning_rate': 3.1269841269841274e-05, 'epoch': 2.03}
{'loss': 0.0359, 'learning_rate': 3.121693121693122e-05, 'epoch': 2.04}
{'loss': 0.0258, 'learning_rate': 3.116402116402117e-05, 'epoch': 2.04}
{'loss': 0.016, 'learning_rate': 3.111111111111111e-05, 'epoch': 2.05}
{'loss': 0.0205, 'learning_rate': 3.105820105820106e-05, 'epoch': 2.05}
{'loss': 0.1145, 'learning_rate': 3.100529100529101e-05, 'epoch': 2.06}
{'loss': 0.0207, 'learning_

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.08927801251411438, 'eval_accuracy': 0.9846695149535059, 'eval_runtime': 358.5592, 'eval_samples_per_second': 22.194, 'eval_steps_per_second': 0.349, 'epoch': 3.0}
{'loss': 0.0005, 'learning_rate': 2.1005291005291007e-05, 'epoch': 3.01}
{'loss': 0.0062, 'learning_rate': 2.0952380952380954e-05, 'epoch': 3.01}
{'loss': 0.0001, 'learning_rate': 2.08994708994709e-05, 'epoch': 3.02}
{'loss': 0.0005, 'learning_rate': 2.0846560846560847e-05, 'epoch': 3.02}
{'loss': 0.0172, 'learning_rate': 2.0793650793650797e-05, 'epoch': 3.03}
{'loss': 0.0001, 'learning_rate': 2.074074074074074e-05, 'epoch': 3.03}
{'loss': 0.0002, 'learning_rate': 2.068783068783069e-05, 'epoch': 3.04}
{'loss': 0.0002, 'learning_rate': 2.0634920634920636e-05, 'epoch': 3.04}
{'loss': 0.0002, 'learning_rate': 2.0582010582010583e-05, 'epoch': 3.05}
{'loss': 0.0381, 'learning_rate': 2.052910052910053e-05, 'epoch': 3.05}
{'loss': 0.0004, 'learning_rate': 2.0476190476190476e-05, 'epoch': 3.06}
{'loss': 0.0169, 'learn

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.07114830613136292, 'eval_accuracy': 0.9876853480774064, 'eval_runtime': 346.1211, 'eval_samples_per_second': 22.992, 'eval_steps_per_second': 0.361, 'epoch': 4.0}
{'loss': 0.0423, 'learning_rate': 1.0476190476190477e-05, 'epoch': 4.01}
{'loss': 0.0001, 'learning_rate': 1.0423280423280423e-05, 'epoch': 4.01}
{'loss': 0.0005, 'learning_rate': 1.037037037037037e-05, 'epoch': 4.02}
{'loss': 0.0002, 'learning_rate': 1.0317460317460318e-05, 'epoch': 4.02}
{'loss': 0.0123, 'learning_rate': 1.0264550264550265e-05, 'epoch': 4.03}
{'loss': 0.0002, 'learning_rate': 1.0211640211640213e-05, 'epoch': 4.03}
{'loss': 0.0002, 'learning_rate': 1.015873015873016e-05, 'epoch': 4.04}
{'loss': 0.0001, 'learning_rate': 1.0105820105820106e-05, 'epoch': 4.04}
{'loss': 0.001, 'learning_rate': 1.0052910052910053e-05, 'epoch': 4.05}
{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 4.05}
{'loss': 0.0002, 'learning_rate': 9.947089947089947e-06, 'epoch': 4.06}
{'loss': 0.0001, 'learning_rate': 9.894

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.08163642883300781, 'eval_accuracy': 0.9876853480774064, 'eval_runtime': 347.1038, 'eval_samples_per_second': 22.927, 'eval_steps_per_second': 0.36, 'epoch': 5.0}
{'train_runtime': 29882.1646, 'train_samples_per_second': 5.326, 'train_steps_per_second': 0.333, 'train_loss': 0.05499702816711208, 'epoch': 5.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Test Loss: 0.08163642883300781, Test Accuracy: 0.9876853480774064


In [11]:
len(df)

39788

In [4]:
model.save_pretrained('./distilbert-drug-review-model')
tokenizer.save_pretrained('./distilbert-drug-review-tokenizer')


('./distilbert-drug-review-tokenizer/tokenizer_config.json',
 './distilbert-drug-review-tokenizer/special_tokens_map.json',
 './distilbert-drug-review-tokenizer/vocab.txt',
 './distilbert-drug-review-tokenizer/added_tokens.json')

In [5]:

print("Model directory contents:", os.listdir('./distilbert-drug-review-model'))
print("Tokenizer directory contents:", os.listdir('./distilbert-drug-review-tokenizer'))


Model directory contents: ['config.json', 'pytorch_model.bin']
Tokenizer directory contents: ['tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt']


In [6]:

# Load trained model and tokenizer
model_path = './distilbert-drug-review-model'
tokenizer_path = './distilbert-drug-review-tokenizer'

model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)


In [7]:
def predict_condition(texts):
    encoded_texts = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
    outputs = model(**encoded_texts)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return [label_encoder.classes_[pred] for pred in predictions.tolist()]  # Adjusted for direct use of classes

# Example usage
texts = [
    "I've been on birth control for two years with no side effects.",
    "This medication made my depression worse.",
    "Excellent control of blood pressure with this medication."
]
predicted_conditions = predict_condition(texts)
for text, pred in zip(texts, predicted_conditions):
    print(f"Text: {text}\nPredicted Condition: {pred}\n")


Text: I've been on birth control for two years with no side effects.
Predicted Condition: Birth Control

Text: This medication made my depression worse.
Predicted Condition: Depression

Text: Excellent control of blood pressure with this medication.
Predicted Condition: High Blood Pressure



In [8]:
joblib.dump(label_encoder, 'label_encoder.pkl')  # Saving
label_encoder = joblib.load('label_encoder.pkl')  # Loading


[]