## Here we will use an unseen dataset to run and assess our sentiment analysis model.

### Read more about the data  [here](https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018)


In [None]:
!unzip "/content/UCI_drug_review.zip"

Archive:  /content/UCI_drug_review.zip
  inflating: drugsComTest_raw.csv    
  inflating: drugsComTrain_raw.csv   


In [None]:
import pandas as pd
# Read the test-split CSV (one of the two files listed in the unzip output above)
df = pd.read_csv("/content/drugsComTest_raw.csv")
# Bare last expression: the notebook renders a truncated preview of the frame
df

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4
...,...,...,...,...,...,...,...
53761,159999,Tamoxifen,"Breast Cancer, Prevention","""I have taken Tamoxifen for 5 years. Side effe...",10,13-Sep-14,43
53762,140714,Escitalopram,Anxiety,"""I&#039;ve been taking Lexapro (escitaploprgra...",9,8-Oct-16,11
53763,130945,Levonorgestrel,Birth Control,"""I&#039;m married, 34 years old and I have no ...",8,15-Nov-10,7
53764,47656,Tapentadol,Pain,"""I was prescribed Nucynta for severe neck/shou...",1,28-Nov-11,20


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull)
df.isna().sum()

uniqueID         0
drugName         0
condition      295
review           0
rating           0
date             0
usefulCount      0
dtype: int64

In [None]:
# Remove, in place, every row that has at least one missing value.
# Row labels are kept as they were, so the index is no longer contiguous afterwards.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Texts contain HTML character references (e.g. &#039; for an apostrophe),
# so we convert them back to the original characters as shown below.
import html

# Take the first remaining row by position: after dropna the row labelled 0 is not
# guaranteed to exist, and a label lookup (df['review'][0]) would raise KeyError.
text = df['review'].iloc[0]

clean_text = html.unescape(text)
print(clean_text)


"I've tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia & anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I've actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me."


In [None]:
import html

def decode_html(text):
    """Return ``text`` with HTML character references turned back into characters.

    Thin wrapper around ``html.unescape`` so it can be passed to pandas
    element-wise helpers.

    Args:
        text: The string to decode.

    Returns:
        The decoded string (e.g. ``"I&#039;ve"`` becomes ``"I've"``).
    """
    decoded = html.unescape(text)
    return decoded

# Replace every review with its decoded form (element-wise over the 'review' column)
df['review'] = df['review'].map(decode_html)

In [None]:
# Number of distinct drug names (dropna=False counts exactly what unique() would return)
df['drugName'].nunique(dropna=False)

2635

In [None]:
# Number of distinct conditions (dropna=False counts exactly what unique() would return)
df['condition'].nunique(dropna=False)

708

In [None]:
# Drop the columns that are not fed to the model or kept in the labeled output.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are already
# gone and a plain drop would raise KeyError.
# NOTE(review): 'rating' is removed here, so the predictions made later in this notebook
# cannot be compared against it — keep a copy if the model is to be evaluated.
df.drop(columns=['uniqueID', 'rating', 'usefulCount', 'date'], inplace=True, errors='ignore')

In [None]:
# Preprocess the review texts

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

# Load tokenizer
# Uses the stock 'distilbert-base-uncased' tokenizer files; the progress bars below show them being downloaded.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Reviews to score
X = df["review"]

# Upper bound on tokens kept per review; anything past this many tokens is cut off
max_length = 128

# NOTE(review): tokenize() is used on its own here, so no [CLS]/[SEP] markers are added,
# while the classifier treats position 0 as the "[CLS]" token — confirm this matches the
# preprocessing used at training time.
tokenized_texts = [
    tokenizer.tokenize(text)[:max_length]
    for text in X
]

In [None]:
# Map each review's token strings to their vocabulary ids (one list of ints per review)
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Convert tokenized input sequences to PyTorch tensors.
# dtype is pinned to long: torch.tensor([]) would otherwise infer a float tensor for a
# review that produced no tokens, and ids must stay integers for the embedding lookup.
input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]

# Pad every sequence with id 0 up to the length of the longest one, giving a single
# (num_reviews, longest_length) tensor.
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)

In [None]:
# Attention mask: 1.0 where there is a real token, 0.0 where pad_sequence filled in id 0.
# Done as one vectorized comparison on the padded tensor instead of a Python loop over
# every element; the result is the same float tensor of the same shape.
attention_masks = (input_ids != 0).float()

In [None]:
# Size of the classifier's output layer (one logit per sentiment class)
num_classes = 3

# Sentiment Analysis model, redefined exactly as it was for training.
# Attribute names (distilbert / drop / out) must stay as they are: they are the
# prefixes of the keys in the saved state dict.
class SentimentClassifier(nn.Module):
    """Encoder passed in by the caller, followed by dropout and a linear output layer."""

    def __init__(self, pretrained_model):
        """Wrap ``pretrained_model`` and add the classification head.

        Args:
            pretrained_model: Encoder module exposing ``config.hidden_size`` and
                returning its hidden states as the first element of its output.
        """
        super().__init__()
        self.distilbert = pretrained_model
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.distilbert.config.hidden_size
        self.out = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_classes`` logits per input sequence."""
        # First element of the encoder output: the last hidden state
        hidden_states = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Representation at position 0 of every sequence (the [CLS] slot)
        first_token = hidden_states[:, 0]
        return self.out(self.drop(first_token))

# Load pre-trained model
# This is the stock 'distilbert-base-uncased' encoder; the fine-tuned weights are loaded
# over it from the saved state dict in the next cell.
pretrained_distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Instantiate model
# Wraps the encoder with the dropout + linear head defined by SentimentClassifier.
model = SentimentClassifier(pretrained_distilbert_model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Load trained model weights (mapped to CPU so the cell also works without a GPU).
# NOTE(review): the checkpoint lives on Google Drive, so Drive must already be mounted at
# /content/drive — no mount cell is visible in this notebook.
# weights_only=True limits unpickling to tensors and plain containers, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/NLP_DrugReviews/drug_review_sentiment_model.pth',
        map_location=torch.device('cpu'),
        weights_only=True,
    )
)

<All keys matched successfully>

In [None]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_in_batches(input_ids, attention_masks, model, batch_size=16):
    """Run ``model`` over the inputs in mini-batches and return the predicted class indices.

    The model is moved to the module-level ``device`` and switched to evaluation
    mode; gradients are disabled for the whole pass.

    Args:
        input_ids: Tensor of token ids, one row per example.
        attention_masks: Tensor with one row per example, aligned with ``input_ids``.
        model: Module called as ``model(batch_input_ids, batch_attention_masks)`` and
            returning one row of class scores per example.
        batch_size: Number of examples sent through the model at once.

    Returns:
        1-D NumPy array holding the argmax class index of every example, in input
        order. Empty input yields an empty integer array.

    Raises:
        ValueError: If ``input_ids`` and ``attention_masks`` differ in length.
    """
    if len(input_ids) != len(attention_masks):
        raise ValueError(
            f"input_ids has {len(input_ids)} rows but attention_masks has {len(attention_masks)}"
        )

    # Ensure the model is on the correct device and in evaluation mode (dropout off)
    model = model.to(device)
    model.eval()

    predictions = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            batch_input_ids = input_ids[start:stop].to(device)
            batch_attention_masks = attention_masks[start:stop].to(device)

            outputs = model(batch_input_ids, batch_attention_masks)
            batch_predictions = torch.argmax(outputs, dim=1)
            # Move results off the device right away so only one batch lives there at a time
            predictions.append(batch_predictions.cpu())

    # torch.cat rejects an empty list, so return an empty result explicitly
    if not predictions:
        return torch.empty(0, dtype=torch.long).numpy()

    # Concatenate all batch predictions
    return torch.cat(predictions).numpy()

# Run inference over all reviews, using the padded input_ids and attention_masks built in the cells above
predictions = predict_in_batches(input_ids, attention_masks, model, batch_size=16)


# predictions is a NumPy array, so it is assigned row by row in order (not by index label);
# its length has to equal the number of rows in df.
df['predicted_sentiment'] = predictions

In [None]:
# Preview the frame with the new predicted_sentiment column (one class index per review)
df

Unnamed: 0,drugName,condition,review,predicted_sentiment
0,Mirtazapine,Depression,"""I've tried a few antidepressants over the yea...",0
1,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn's disease and has done very ...",2
2,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",1
3,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",2
4,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",2
...,...,...,...,...
53761,Tamoxifen,"Breast Cancer, Prevention","""I have taken Tamoxifen for 5 years. Side effe...",0
53762,Escitalopram,Anxiety,"""I've been taking Lexapro (escitaploprgram) si...",2
53763,Levonorgestrel,Birth Control,"""I'm married, 34 years old and I have no kids....",0
53764,Tapentadol,Pain,"""I was prescribed Nucynta for severe neck/shou...",0


In [None]:
# Mapping labels

def categorize_sentiment(score):
    """Translate a predicted class index into a readable sentiment label.

    Args:
        score: Class index produced by the model.

    Returns:
        'Positive' for 2, 'Negative' for 0, and 'Neutral' for any other value.
    """
    if score == 2:
        return 'Positive'
    return 'Negative' if score == 0 else 'Neutral'

# Add a human-readable label next to each predicted class index
df['sentiment_Label'] = df['predicted_sentiment'].map(categorize_sentiment)

In [None]:
# Number of reviews assigned to each sentiment label
df['sentiment_Label'].value_counts()

sentiment_Label
Positive    26643
Negative    26066
Neutral       762
Name: count, dtype: int64

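## The counts above show how the model distributed the unseen reviews across the three sentiment classes. Note that this notebook does not compare the predictions with any ground truth (the `rating` column was dropped before inference), so these labels are unverified: the model's accuracy on this data should be measured, for example against sentiment derived from `rating`, before it is used to label other drug-related text.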

In [None]:
# Saving the dataset for future use
# NOTE(review): the target is a Google Drive path, so Drive must be mounted at /content/drive — no mount cell is visible in this notebook.
file_path = '/content/drive/MyDrive/NLP_DrugReviews/UCI_drug_review_sentiment_labeled.csv'
# index=False: the (non-contiguous) row labels are not written to the file
df.to_csv(file_path, index=False)