In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split

# Load the data
train_df = pd.read_csv('drugsComTrain_raw.tsv', delimiter='\t')
test_df = pd.read_csv('drugsComTest_raw.tsv', delimiter='\t')

# Define a function to classify sentiment based on review text
def classify_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply the sentiment classification
for df in [train_df, test_df]:
    df['sentiment'] = df['review'].apply(classify_sentiment)
    df['positive'] = df['sentiment'] == 'positive'
    df['negative'] = df['sentiment'] == 'negative'
    df['neutral'] = df['sentiment'] == 'neutral'

# Convert sentiment labels to a single numerical column
def sentiment_to_label(sentiment):
    if sentiment == 'positive':
        return 2
    elif sentiment == 'neutral':
        return 1
    else: # negative
        return 0

for df in [train_df, test_df]:
    df['sentiment_label'] = df['sentiment'].apply(sentiment_to_label)

# check results
train_df.head()
test_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,sentiment,positive,negative,neutral,sentiment_label
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22,neutral,False,False,True,1
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17,positive,True,False,False,2
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3,positive,True,False,False,2
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,positive,True,False,False,2
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4,positive,True,False,False,2


In [2]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the reviews
def tokenize_reviews(data, max_length=256):
    return tokenizer(data['review'].tolist(), max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

train_tokens = tokenize_reviews(train_df)
test_tokens = tokenize_reviews(test_df)


In [3]:
from torch.utils.data import Dataset, DataLoader

class DrugReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}  
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long) 
        return item

    def __len__(self):
        return len(self.labels)

train_labels = train_df['sentiment_label'].tolist()
test_labels = test_df['sentiment_label'].tolist()

train_dataset = DrugReviewDataset(train_tokens, train_labels)
test_dataset = DrugReviewDataset(test_tokens, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [4]:
from sklearn.metrics import accuracy_score
import numpy as np

def evaluate(model, dataloader):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        labels = batch['labels']
        total_eval_accuracy += (predictions == labels).float().mean()
    return total_eval_accuracy / len(dataloader)


In [5]:
from torch.optim import AdamW
from transformers import BertForSequenceClassification
import torch

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    avg_train_loss = total_loss / len(train_loader)
    val_accuracy = evaluate(model, test_loader)
    print(f'Epoch: {epoch+1}/{epochs}, Loss: {avg_train_loss}, Validation Accuracy: {val_accuracy}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [6]:
def predict_sentiment(model, dataloader):
    model.eval()
    review_sentiments = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            review_sentiments.extend(predictions.cpu().numpy())

    return review_sentiments

# Example usage (predicting sentiments for the test dataset):
test_predictions = predict_sentiment(model, test_loader)


In [8]:
# predicted sentiment for test data
test_df['predictedSentiment'] = test_predictions

# Calculate scores for each drug per condition
drug_scores = test_df.groupby(['condition', 'drugName'])['predictedSentiment'].mean().reset_index()

# Rank drugs for each condition based on their score
drug_rankings = drug_scores.sort_values(['condition', 'predictedSentiment'], ascending=[True, False])

# Select top 5 drugs for each condition
top_drugs_per_condition = drug_rankings.groupby('condition').head(5)

print(top_drugs_per_condition)

                                       condition      drugName  \
0     0</span> users found this comment helpful.        Aviane   
1     0</span> users found this comment helpful.       Chantix   
2     0</span> users found this comment helpful.  Depo-Provera   
3     0</span> users found this comment helpful.        Drysol   
5     0</span> users found this comment helpful.      Implanon   
...                                          ...           ...   
5558                                 zen Shoulde      Naproxen   
5559                                 zen Shoulde       Relafen   
5556                                 zen Shoulde         Aleve   
5557                                 zen Shoulde    Diclofenac   
5560                                 zen Shoulde      Voltaren   

      predictedSentiment  
0                    2.0  
1                    2.0  
2                    2.0  
3                    2.0  
5                    2.0  
...                  ...  
5558              

In [9]:
top_drugs_per_condition.to_csv("C:/Users/chira/Downloads/top_drugs_per_condition.csv")