## Here we test our clustering model on the UCI Drug Review dataset, just as we did with our sentiment analysis model.

Read more about the data [here](https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018)

In [None]:
import pandas as pd
# Read the sentiment-labelled reviews and discard every row that has a missing value.
df = pd.read_csv("/content/drive/MyDrive/NLP_DrugReviews/UCI_drug_review_sentiment_labeled.csv").dropna()

In [None]:
# Inspect the loaded reviews after rows with missing values were dropped.
df

Unnamed: 0,drugName,condition,review,predicted_sentiment,sentiment_Label
0,Mirtazapine,Depression,"""I've tried a few antidepressants over the yea...",0,Negative
1,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn's disease and has done very ...",2,Positive
2,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",1,Neutral
3,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",2,Positive
4,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",2,Positive
...,...,...,...,...,...
53466,Tamoxifen,"Breast Cancer, Prevention","""I have taken Tamoxifen for 5 years. Side effe...",0,Negative
53467,Escitalopram,Anxiety,"""I've been taking Lexapro (escitaploprgram) si...",2,Positive
53468,Levonorgestrel,Birth Control,"""I'm married, 34 years old and I have no kids....",0,Negative
53469,Tapentadol,Pain,"""I was prescribed Nucynta for severe neck/shou...",0,Negative


In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel

In [None]:
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by a dropout + linear classification head.

    The head reads the encoder's hidden state at sequence position 0 (the
    [CLS] token) and projects it to one logit per class.

    Args:
        num_classes: Number of output logits. Defaults to 3.
        dropout: Dropout probability applied before the linear head.
            Defaults to 0.1.
        pretrained_name: Checkpoint name handed to
            ``DistilBertModel.from_pretrained``. Defaults to
            ``'distilbert-base-uncased'``.
    """

    def __init__(self, num_classes=3, dropout=0.1, pretrained_name='distilbert-base-uncased'):
        super().__init__()
        # The attribute names below become the state_dict key prefixes, so they
        # must stay as-is for previously saved weights to load.
        self.distilbert = DistilBertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config instead of a literal
        # (768 for 'distilbert-base-uncased').
        self.classifier = nn.Linear(self.distilbert.config.dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenized sequences.

        Args:
            input_ids: Token ids forwarded to the encoder.
                Assumes shape (batch, seq_len) - TODO confirm with caller.
            attention_mask: Mask forwarded to the encoder alongside
                ``input_ids``.

        Returns:
            The output of the linear head applied to the position-0 hidden
            state (one row of ``num_classes`` logits per input sequence).
        """
        output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output[0]
        pooled_output = hidden_state[:, 0]  # Take the [CLS] token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

model = DistilBERTClassifier()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Load the fine-tuned weights onto the CPU. `weights_only=True` restricts
# unpickling to tensors and plain containers, so a tampered checkpoint file
# cannot execute arbitrary code while being loaded.
state_dict = torch.load(
    '/content/drive/MyDrive/NLP_DrugReviews/cluster_distilbert_model.pth',
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Switch the model to evaluation mode so its Dropout layers are inactive during prediction.
model.eval()

DistilBERTClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin

In [None]:
# Define tokenizer
# NOTE(review): AdamW is not used in any cell visible here, and transformers has
# deprecated its AdamW in favour of torch.optim.AdamW - confirm it can be dropped.
from transformers import DistilBertTokenizer, AdamW

# Same checkpoint name as the encoder loaded in DistilBERTClassifier.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import pandas as pd

# Encode every review; sequences are truncated where needed and padded to a common length
review_texts = df['review'].tolist()
encodings = tokenizer(review_texts, padding=True, truncation=True)

# Wrap the token ids and the attention masks as PyTorch tensors
input_ids, attention_masks = (
    torch.tensor(encodings[key]) for key in ('input_ids', 'attention_mask')
)

In [None]:
# Define DataLoader for new data
from torch.utils.data import DataLoader, TensorDataset

# Pair each row of token ids with its attention mask, then serve them in
# fixed-size batches without shuffling so the original row order is kept.
new_data = TensorDataset(input_ids, attention_masks)
new_dataloader = DataLoader(dataset=new_data, shuffle=False, batch_size=8)

In [None]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Keep this cell self-contained: make sure dropout is disabled even if the
# separate model.eval() cell was not run in this session.
model.eval()

# Predict clusters
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in new_dataloader:
        # Batch-local names: unpacking into `input_ids` would overwrite the
        # full-dataset tensor built in the tokenization cell, so re-running the
        # TensorDataset cell afterwards would silently use only the last batch.
        batch_input_ids, batch_attention_mask = (t.to(device) for t in batch)
        logits = model(batch_input_ids, batch_attention_mask)
        # Highest-scoring class index per row
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

# Add predictions to the DataFrame (the dataloader is not shuffled, so the
# predictions are in the same order as the rows of `df`)
df['predicted_cluster'] = predictions

In [None]:
# Inspect the frame with the newly added `predicted_cluster` column.
df

Unnamed: 0,drugName,condition,review,predicted_sentiment,sentiment_Label,predicted_cluster
0,Mirtazapine,Depression,"""I've tried a few antidepressants over the yea...",0,Negative,0
1,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn's disease and has done very ...",2,Positive,0
2,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",1,Neutral,0
3,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",2,Positive,0
4,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",2,Positive,0
...,...,...,...,...,...,...
53466,Tamoxifen,"Breast Cancer, Prevention","""I have taken Tamoxifen for 5 years. Side effe...",0,Negative,0
53467,Escitalopram,Anxiety,"""I've been taking Lexapro (escitaploprgram) si...",2,Positive,0
53468,Levonorgestrel,Birth Control,"""I'm married, 34 years old and I have no kids....",0,Negative,0
53469,Tapentadol,Pain,"""I was prescribed Nucynta for severe neck/shou...",0,Negative,1


In [None]:
# 0: other
# 1: pain
# 2: HBP

In [None]:
# Mapping labels

# Cluster id -> human-readable label (0: other, 1: pain, 2: HBP).
CLUSTER_LABELS = {0: 'other', 1: 'pain', 2: 'HBP'}


def categorize_cluster(score):
    """Translate a predicted cluster id into its label.

    Args:
        score: Cluster id produced by the model (0, 1 or 2).

    Returns:
        'other' for 0, 'pain' for 1 and 'HBP' for 2.

    Raises:
        ValueError: If ``score`` is not one of the known cluster ids. The
            previous catch-all ``else`` silently labelled such values 'pain'.
    """
    # Compare with == (rather than a dict lookup) so numpy integer scalars
    # coming from the prediction list match exactly as they did before.
    for cluster_id, label in CLUSTER_LABELS.items():
        if score == cluster_id:
            return label
    raise ValueError(f"Unknown cluster id: {score!r}")

# Translate each predicted cluster id into its label, element by element.
df['cluster_label'] = df['predicted_cluster'].map(categorize_cluster)

In [None]:
# Count how many reviews were assigned to each cluster label.
df['cluster_label'].value_counts()

cluster_label
other    44387
pain      5872
HBP       3212
Name: count, dtype: int64

In [None]:
# Persist the frame, now carrying both sentiment and cluster labels, without the index column.
file_path = '/content/drive/MyDrive/NLP_DrugReviews/UCI_drug_review_sent_clust_labeled.csv'
df.to_csv(path_or_buf=file_path, index=False)