In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
import torch.multiprocessing as mp




In [4]:
df = pd.read_csv('medical_data.csv')

In [5]:
df.isna().sum()


uniqueID                  0
drugName                  0
condition                 0
review                    0
rating                    0
date                      0
usefulCount               0
lengthReview              0
conditionCluster_label    0
drugNameCluster_label     0
dtype: int64

In [6]:
df['condition'] = df['condition'].dropna()

In [7]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8,27-Apr-10,192,712,2,4
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5,14-Dec-09,17,708,9,6
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8,3-Nov-15,10,428,9,4
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9,27-Nov-16,37,669,0,2
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2,28-Nov-15,43,373,0,5


In [None]:


# Parameters
max_len = 128
batch_size = 32
num_classes = 2

# Preprocess text
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
reviews = df['review'].apply(lambda x: tokenizer.encode_plus(x, truncation=True, max_length=max_len, padding='max_length', return_attention_mask=True))

# Extract features
input_ids = torch.tensor([seq['input_ids'] for seq in reviews], dtype=torch.long)
attention_masks = torch.tensor([seq['attention_mask'] for seq in reviews], dtype=torch.long)
ratings = torch.tensor(df['rating'].values, dtype=torch.float).unsqueeze(1)
useful_counts = torch.tensor(df['usefulCount'].values, dtype=torch.float).unsqueeze(1)

# Create dataset
dataset = TensorDataset(input_ids, attention_masks, ratings, useful_counts)

# Create dataloader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Initialize model
bert = AutoModel.from_pretrained('bert-base-uncased')

class SentimentModel(nn.Module):
    def __init__(self, bert, num_classes):
        super(SentimentModel, self).__init__()
        self.bert = bert
        self.fc = nn.Linear(bert.config.hidden_size + 2, num_classes)

    def forward(self, input_ids, attention_mask, ratings, counts):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        combined_features = torch.cat([pooled_output, ratings, counts], dim=1)
        logits = self.fc(combined_features)
        return logits

model = SentimentModel(bert, num_classes)

# Set the model to evaluation mode
model.eval()

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate on GPU
all_preds = []

with torch.no_grad():
    for batch in dataloader:
        inputs, attention_mask, ratings, counts = batch
        inputs, attention_mask, ratings, counts = inputs.to(device), attention_mask.to(device), ratings.to(device), counts.to(device)
        logits = model(inputs, attention_mask, ratings, counts)
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())

# Assign predicted sentiments to the DataFrame
df['predicted_sentiment'] = all_preds

# Print or save the DataFrame with the predicted sentiment column
print(df[['review', 'predicted_sentiment']])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch

sentiment_df = df.head(1000)
# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Define function to predict sentiment
def predict_sentiment(text):
  encoded_input = tokenizer(text, return_tensors='pt')
  output = model(**encoded_input)
  scores = output[0][0].detach().numpy()
  predicted_class_id = scores.argmax()
  return model.config.id2label[predicted_class_id]

# Apply to dataframe column
sentiment_df['predicted_sentiment'] = sentiment_df['review'].apply(predict_sentiment)

# Get counts per sentiment
print(df['predicted_sentiment'].value_counts())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: The expanded size of the tensor (676) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 676].  Tensor sizes: [1, 514]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Set max sequence length
max_len = 676

# Tokenize and pad review text
def prepare_text(text):

  encoded = tokenizer(text, padding='max_length', truncation=True, max_length=max_len)

  encoded['input_ids'] = pad_sequences(encoded['input_ids'],
                                      maxlen=max_len,
                                      dtype='long',
                                      truncating='post',
                                      padding='post')

  input_ids = encoded['input_ids'].reshape(1,max_len)

  return input_ids

# Apply pre-processing function
sentiment_df['input_ids'] = sentiment_df['review'].apply(prepare_text)

# Predict sentiment
sentiment_df['predicted_sentiment'] = model(sentiment_df['input_ids']).argmax(1)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Set up model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Define constants
MAX_LEN = 676

# Preprocess text function
def prepare_text(text):
  encoded = tokenizer.encode_plus(text,
                                add_special_tokens=True,
                                max_length=MAX_LEN,
                                pad_to_max_length=True,
                                return_attention_mask=True,
                                return_tensors='pt')

  return encoded['input_ids'], encoded['attention_mask']

# Sample dataframe
data = pd.DataFrame({'text': ['I really enjoyed that movie!', 'This is the worst thing ever.']})

# Preprocess text
input_ids, attention_mask = [], []

for txt in data['text']:
  ids, mask = prepare_text(txt)
  input_ids.append(ids)
  attention_mask.append(mask)

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_mask = torch.cat(attention_mask, dim=0)

# Feed through model
outputs = model(input_ids, attention_mask=attention_mask)

# Get sentiment predictions
sentiment = torch.argmax(outputs.logits, dim=1)
print(sentiment)

In [None]:
sentiment_df = df.head(500)

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch



# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Parameters
MAX_LEN = 512
BATCH_SIZE = 32



# Lists to store tokenized data
input_ids = []
attention_masks = []

# Iterate through reviews
for review in sentiment_df['review']:

  # Encode review
  encoded = tokenizer(review, max_length=MAX_LEN, padding="max_length", truncation=True)

  # Append to lists
  input_ids.append(encoded['input_ids'])
  attention_masks.append(encoded['attention_mask'])

# Convert lists to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

# Make predictions in batches
predictions = []

for i in range(0, len(input_ids), BATCH_SIZE):
  start = i
  end = i + BATCH_SIZE
  batch = model(input_ids[start:end], attention_mask=attention_masks[start:end])
  batch_preds = torch.argmax(batch.logits, dim=-1)
  predictions.extend(batch_preds)

# Add predictions to dataframe
sentiment_df['predicted_sentiment'] = predictions

print(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

       uniqueID                 drugName                     condition  \
0         95260               Guanfacine                          ADHD   
1         92703                   Lybrel                 Birth Control   
2        138000               Ortho Evra                 Birth Control   
3         35696  Buprenorphine  naloxone             Opiate Dependence   
4        155963                   Cialis  Benign Prostatic Hyperplasia   
...         ...                      ...                           ...   
95906    103458                 Tekturna           High Blood Pressure   
95907    191035                  Campral            Alcohol Dependence   
95908    127085           Metoclopramide                NauseaVomiting   
95909     47128       Thyroid desiccated           Underactive Thyroid   
95910    215220             Lubiprostone          Constipation Chronic   

                                                  review  rating       date  \
0      My son is halfway through

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment_df['predicted_sentiment'] = predictions


In [None]:
sentiment_df[['review','predicted_sentiment']]

Unnamed: 0,review,predicted_sentiment
0,"""I&#039;ve tried a few antidepressants over th...",tensor(0)
1,"""My son has Crohn&#039;s disease and has done ...",tensor(2)
2,"""Quick reduction of symptoms""",tensor(1)
3,"""Contrave combines drugs that were used for al...",tensor(1)
4,"""I have been on this birth control for one cyc...",tensor(2)
...,...,...
495,"""My husband has RA. His doctor sent him to ...",tensor(1)
496,"""Well just have start by saying so far so good...",tensor(2)
497,"""Very effective for fibromyalgia pain. Does no...",tensor(1)
498,"""Great.""",tensor(2)


In [None]:
# Mappings
mappings = {'tensor(0)': 'positive', 'tensor(1)': 'neutral', 'tensor(2)': 'negative'}

# Convert predictions
string_predictions = [mappings[str(pred)] for pred in predictions]

# Add to dataframe
sentiment_df['predicted_sentiment'] = string_predictions

# Print samples
print(sentiment_df['predicted_sentiment'].sample(5))

442     neutral
159    negative
364    negative
377    negative
409    negative
Name: predicted_sentiment, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment_df['predicted_sentiment'] = string_predictions


In [None]:
sentiment_df[['review','predicted_sentiment']]

Unnamed: 0,review,predicted_sentiment
0,My son is halfway through his fourth week of I...,neutral
1,I used to take another oral contraceptive whic...,neutral
2,This is my first time using any form of birth ...,negative
3,Suboxone has completely turned my life around ...,negative
4,2nd day on 5mg started to work with rock hard ...,positive
...,...,...
495,My doc switched me to this due to some issues ...,positive
496,Took this for about 4 years and had throat swe...,positive
497,I was diagnosed with Chronic Myelogenous Leuke...,negative
498,Ativan is a great medicine and glad my psychia...,negative


In [None]:
from google.colab import files

# Save the merged DataFrame to a CSV file
sentiment_df.to_csv("sentiment.csv")

# Download the CSV file to your local machine
files.download("sentiment.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

# Parameters
max_len = 128
batch_size = 32
num_classes = 2
learning_rate = 2e-5
epochs = 5

# Load tokenizer and tokenize reviews
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
reviews = df['review'].apply(lambda x: tokenizer.encode_plus(x, truncation=True, max_length=max_len, padding='max_length', return_attention_mask=True))

input_ids = torch.tensor([seq['input_ids'] for seq in reviews], dtype=torch.long)
attention_masks = torch.tensor([seq['attention_mask'] for seq in reviews], dtype=torch.long)

# Create dataset and dataloader
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load pretrained BERT model
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# Model architecture
class SentimentModel(nn.Module):

    def __init__(self, bert, num_classes):
        super(SentimentModel, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        output = self.dropout(pooled_output)
        return self.classifier(output)

# Initialize the model, loss func, optimizer
model = SentimentModel(bert_model, num_classes=num_classes)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    for batch in dataloader:
        input_ids, attention_mask = tuple(t.to(device) for t in batch)
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluation
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask = tuple(t.to(device) for t in batch)
        logits = model(input_ids, attention_mask)
        # compute metrics