In [None]:

from google.colab import drive
drive.mount('/content/drive')


In [10]:
# Impoting Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
nltk.download('stopwords')
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
test = pd.read_csv('/content/drive/MyDrive/NLP Project/UCIdrug_test.csv')

In [12]:
test.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


In [13]:
def review_clean(review):
    # changing to lower case
    lower = review.str.lower()

    # Replacing the repeating pattern of '
    pattern_remove = lower.str.replace("'", "")

    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')

    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')

    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')

    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')

    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')

    return dataframe

In [14]:
test.loc[test['rating'] >= 7, 'Review_Sentiment'] = 'Positive'
test.loc[test['rating'].isin([5, 6]), 'Review_Sentiment'] = 'Neutral'
test.loc[test['rating'] <= 4, 'Review_Sentiment'] = 'Negative'
test['Review_Sentiment'] = test['Review_Sentiment'].replace({'Positive': 1, 'Neutral': 2, 'Negative': 0})


In [15]:
test = test[:10]


In [16]:
test['review_clean'] = review_clean(test['review'])

In [17]:
stop_words = set(stopwords.words('english'))
test['review_clean'] = test['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
Snow_ball = SnowballStemmer("english")
test['review_clean'] = test['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

In [18]:
from sklearn import metrics

In [26]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('monologg/biobert_v1.1_pubmed', do_lower_case=True)
# Load the saved model
model = torch.load('/content/drive/MyDrive/NLP Project/modelbiob4.pth')


# Tokenize and encode the test data
inputs = tokenizer(test['review_clean'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Create a PyTorch DataLoader for the test data
test_dataloader = DataLoader(inputs, batch_size=64, shuffle=False)

# Make predictions on the test data
with torch.no_grad():
    predictions = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    predictions = predictions.logits
    predictions = torch.nn.functional.softmax(predictions, dim=1)
    predicted_labels = torch.argmax(predictions, dim=1)

# Convert predicted labels to strings
predicted_labels = predicted_labels.cpu().numpy()


In [27]:
precision = metrics.precision_score(test['Review_Sentiment'], predicted_labels, average='macro')
recall = metrics.recall_score(test['Review_Sentiment'], predicted_labels, average='macro')

In [31]:
print("Precision is",round(precision,2))
print("Recall is",round(recall,2))

Precision is 0.72
Recall is 0.74


In [28]:
f1 = 2 * (precision * recall) / (precision + recall)
print("F1_score is",round(f1,2))

F1_score is 0.73


In [22]:
accuracy = metrics.accuracy_score(test['Review_Sentiment'], predicted_labels)
print("Accuracy is",round(accuracy,2))

Accuracy is 0.9


Testing the Model

In [34]:
test['predicted_labels'] = predicted_labels
test

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,Review_Sentiment,review_clean,predicted_labels
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22,1,039 tri antidepress year citalopram fluoxetin ...,1
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17,1,son crohn 039 diseas done well asacol complain...,1
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3,1,quick reduct symptom,1
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35,1,contrav combin drug use alcohol smoke opioid c...,1
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4,1,birth control one cycl read review type simila...,1
5,208087,Zyclara,Keratosis,"""4 days in on first 2 weeks. Using on arms an...",4,3-Jul-14,13,0,4 day first 2 week use arm face put vaselin li...,1
6,215892,Copper,Birth Control,"""I&#039;ve had the copper coil for about 3 mon...",6,6-Jun-16,1,2,039 copper coil 3 month realli excit thought t...,2
7,169852,Amitriptyline,Migraine Prevention,"""This has been great for me. I&#039;ve been on...",9,21-Apr-09,32,1,great 039 2 week last week 3 headach went away...,2
8,23295,Methadone,Opiate Withdrawal,"""Ive been on Methadone for over ten years and ...",7,18-Oct-16,21,1,ive methadon ten year current tri get drug ive...,2
9,71428,Levora,Birth Control,"""I was on this pill for almost two years. It d...",2,16-Apr-11,3,0,pill almost two year work far get pregnant how...,0


In [29]:
inputs = tokenizer(test['review_clean'][1], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    predictions = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    predictions = predictions.logits
    predictions = torch.nn.functional.softmax(predictions, dim=1)
    predicted_label = torch.argmax(predictions, dim=1)
predicted_label = predicted_label.cpu().numpy()
if predicted_label == 0:
    print("Negative")
elif predicted_label == 1:
    print("Positive")
else:
    print("Neutral")


Positive


In [24]:
inputs = tokenizer(test['review_clean'][5], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    predictions = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    predictions = predictions.logits
    predictions = torch.nn.functional.softmax(predictions, dim=1)
    predicted_label = torch.argmax(predictions, dim=1)
predicted_label = predicted_label.cpu().numpy()
if predicted_label == 0:
    print("Negative")
elif predicted_label == 1:
    print("Positive")
else:
    print("Neutral")


Negative


In [30]:
inputs = tokenizer(test['review_clean'][6], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    predictions = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    predictions = predictions.logits
    predictions = torch.nn.functional.softmax(predictions, dim=1)
    predicted_label = torch.argmax(predictions, dim=1)
predicted_label = predicted_label.cpu().numpy()
if predicted_label == 0:
    print("Negative")
elif predicted_label == 1:
    print("Positive")
else:
    print("Neutral")

Neutral
