In [36]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
import torch
import re
import string

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [7]:
# Address of the TripAdvisor restaurant page that is downloaded and parsed
# by the cells below.
rslt = 'https://www.tripadvisor.com/Restaurant_Review-g189117-d7989747-Reviews-Delhi_Darbar-Lagos_Faro_District_Algarve.html'

In [11]:
def getAndParseURL(url, timeout=10):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the kernel
        indefinitely. Defaults to 10.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (including on timeout).
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Safari/537.36"},
        timeout=timeout,
    )
    # The HTTP status is intentionally not checked here: whatever body the
    # server returns (including an error or anti-bot page) is parsed and
    # handed back, so callers should inspect the result before relying on it.
    return BeautifulSoup(response.text, 'html.parser')

In [12]:
# Container that the extraction cells below fill with one list per column.
result = list()

# Fetch and parse the restaurant page once; later cells only read `html`.
html = getAndParseURL(url=rslt)

In [13]:
html

<html><head><title>tripadvisor.com</title><style>#cmsg{animation: A 1.5s;}@keyframes A{0%{opacity:0;}99%{opacity:0;}100%{opacity:1;}}</style></head><body style="margin:0"><p id="cmsg">Please enable JS and disable any ad blocker</p><script data-cfasync="false">var dd={'rt':'c','cid':'AHrlqAAAAAMAvQE30PtI4LEAnTPH9w==','hsh':'2F05D671381DB06BEE4CC52C7A6FD3','t':'fe','s':46694,'e':'2a2bc508b97bff0d9629eec83091a97c20e48a0ad82fedcfbd37c286eb672673','host':'geo.captcha-delivery.com'}</script><script data-cfasync="false" src="https://ct.captcha-delivery.com/c.js"></script></body></html>

In [4]:
# Pull the review texts out of the already-parsed page. No network request is
# made in this cell, so no delay between the steps is needed.
try:
    div_elements = html.find_all("div", {"class": "entry"})
    # `.text` raises AttributeError when an entry has no "partial_entry"
    # paragraph (find() returns None); that case lands in the handler below.
    reviews = [div.find("p", class_="partial_entry").text for div in div_elements]
except Exception:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) or SystemExit is not silently swallowed.
    print("Except block is called!")
    reviews = [np.nan]

# Store the reviews as one column of `result`.
# NOTE: re-running this cell appends the column again.
result.append(reviews)

In [5]:
# Pull the star ratings out of the already-parsed page. No network request is
# made in this cell, so no delay between the steps is needed.
try:
    div_elements = html.find_all("div", {"class": "ui_column is-9"})

    ratings = []
    for div in div_elements:
        # Look the rating element up once and skip containers without one.
        bubble = div.find('span', {'class': 'ui_bubble_rating'})
        if bubble is None:
            continue
        # The rating is the character at index 7 of the element's last CSS
        # class (presumably names shaped like "bubble_50" - TODO confirm).
        ratings.append(str(bubble.get('class', [''])[-1][7]))
except Exception:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) or SystemExit is not silently swallowed.
    print("Except block is called!")
    ratings = [np.nan]

# Store the ratings as another column of `result`.
result.append(ratings)

# Transpose the columns into rows, one list per scraped item.
# zip() stops at the shortest column, so surplus entries in a longer column
# are dropped. NOTE: re-running this cell transposes `result` again.
result = [list(item) for item in zip(*result)]

In [None]:
result

['We’ve been coming to this restaurant periodically for about 15 years.  The last time we came was about 4/5 years ago and at that time, we said never again because we thought overall quality/price lacked which we confirmed this time around.  Their wine menu is...very overpriced. Nowhere in Rome do we ever pay €35 for a bottle of wine I think they’re taking advantage of tourists who might pay double that in the US when you go out for a meal.   The only reason we came back was because we felt like having a good caccio e Pepe but that wasn’t good, in fact when served it was not even warm and within a few bites cold.  Probably should’ve complained but we didn’t.  Everything else we had was acceptable. Service was fine but again quality/price just aren’t there and there are plenty of other places to eat where we don’t regret our meal.More',
 'Truly bizarre experience. I would agree with many of the reviews that if you are English tourists I would avoid. I say this as the American couple be

In [47]:
# After the transpose above, each row of `result` is a list whose first item
# is the review text. Keep only that text so the other columns (e.g. the
# rating) are not fed into the text preprocessing. Rows that are already
# plain values are passed through unchanged.
reviews = [row[0] if isinstance(row, (list, tuple)) else row for row in result]

In [37]:
# Pre-compiled regular expressions for the text-cleaning steps.
newline_pattern = re.compile('\n')
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# NLTK resources shared by every call to the preprocessing function.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Maps the first letter of a POS tag to the matching WordNet part of speech.
tag_dict = {
    'J': wordnet.ADJ,
    'N': wordnet.NOUN,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
}

In [38]:
def preprocess_text(text):
    """Normalise one review into a lower-cased, lemmatised string.

    Steps, in order: convert to ``str`` and lower-case, delete URLs, delete
    every character in ``string.punctuation``, delete newline characters,
    split on single spaces and drop English stopwords, POS-tag the remaining
    tokens and lemmatise each one with the WordNet part of speech that
    ``tag_dict`` assigns to its tag (noun when the tag is not in the table).

    Parameters
    ----------
    text : object
        The review; anything accepted by ``str()``.

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    lowered = str(text).lower()

    # Reuse the module-level compiled patterns instead of re-spelling them as
    # non-raw literals, whose "\S" and "\." are invalid string escapes.
    without_urls = url_pattern.sub('', lowered)
    without_punctuation = punctuation_pattern.sub('', without_urls)
    # NOTE(review): newlines are removed without a replacement character, so
    # the last word of one line is glued to the first word of the next.
    single_line = newline_pattern.sub('', without_punctuation)

    # split(' ') keeps empty tokens for runs of spaces; they survive the
    # stopword filter and show up as repeated spaces in the returned string.
    words = [word for word in single_line.split(' ') if word not in stop_words]

    pos_tags = nltk.pos_tag(words)
    wordnet_tags = [(token, tag_dict.get(tag[0].upper(), wordnet.NOUN)) for token, tag in pos_tags]
    lemmas = [lemmatizer.lemmatize(token, pos) for token, pos in wordnet_tags]
    return " ".join(lemmas)

In [41]:
# Run the text normalisation over every scraped review, keeping their order.
clean_review = list(map(preprocess_text, reviews))

In [29]:
# Directory holding the fine-tuned weights and the saved tokenizer.
model_dir = "RoBERTa_weights"

# Both artefacts live directly under `model_dir`.
model_path, tokenizer_path = (
    "/".join((model_dir, leaf)) for leaf in ("model.pth", "tokenizer")
)

In [26]:
class Roberta(torch.nn.Module):
    """RoBERTa encoder with a two-layer classification head on top.

    The head takes the encoder output at position 0 of the sequence axis,
    applies Linear -> ReLU -> Dropout -> Linear and returns 5 raw scores per
    input row (no softmax is applied here).
    """

    def __init__(self):
        super().__init__()
        # The attribute names below are the key prefixes of this module's
        # state dict; renaming them would break loading of saved weights.
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # 768 has to equal the width of the encoder output, because this
        # layer is applied directly to a slice of it.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 5)
        # Created once instead of on every forward pass; ReLU has no
        # parameters, so the state dict is unaffected.
        self.activation = torch.nn.ReLU()

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the classifier scores for a batch.

        Parameters
        ----------
        input_ids, attention_mask :
            Forwarded unchanged to the encoder.
        token_type_ids : optional
            Forwarded unchanged to the encoder; may be omitted (``None``).

        Returns
        -------
        torch.Tensor
            Output of the final linear layer, 5 values per input row.
        """
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = encoder_outputs[0]
        # Use the vector at the first sequence position as the summary of
        # each row.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = self.activation(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [28]:
model = Roberta()

# The notebook only predicts with this model: switch to evaluation mode so
# the dropout layers are disabled and predictions are deterministic.
model.eval()

# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# machine. NOTE: torch.load unpickles the file, so only load checkpoints
# from a source you trust.
model.load_state_dict(torch.load(model_path, map_location="cpu"))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [30]:
# Load the tokenizer from the local directory given by `tokenizer_path`.
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

In [43]:
# Two separate, initially empty buckets that the classification loop fills:
# `nreviews` gets reviews predicted as label 0, `previews` gets all others.
previews, nreviews = [], []

In [44]:
# Make sure dropout is disabled before predicting; otherwise the same review
# can receive different labels from one run to the next.
model.eval()

for review in clean_review:
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True)

    # Use the `model` loaded above (the name `model1` is not defined anywhere
    # in this notebook) and skip gradient tracking during inference.
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'], None)

    # argmax over the raw scores selects the same class as argmax over their
    # softmax, so the probabilities do not need to be computed.
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Label 0 goes to `nreviews`, every other label to `previews`.
    # NOTE(review): presumably label 0 is the only "negative" class of the
    # 5-way classifier - confirm against the training label mapping.
    if predicted_label == 0:
        nreviews.append(review)
    else:
        previews.append(review)

In [45]:
nreviews

['we’ve come restaurant periodically 15 year  last time come 45 year ago time say never think overall qualityprice lack confirmed time around  wine menu isvery overprice nowhere rome ever pay €35 bottle wine think they’re take advantage tourist might pay double u go meal   reason come back felt like good caccio e pepe wasn’t good fact serve even warm within bite cold  probably should’ve complain didn’t  everything else acceptable service fine qualityprice aren’t plenty place eat don’t regret mealmore',
 'truly bizarre experience would agree many review english tourist would avoid say american couple behind u seem lovely albeit contrast experience ourswe make reservation monthin advance arrive time queue outside wait make way frosty reception hostwe sit next toilet wait menu finally waiter come tried order drink tell food waiter rush order insist roast lamb say “and time yes yes”  sort nod bemusement order antipasti cacio e pepe lamb ask half portion lasagne tell possiblethe next thing 

In [46]:
previews

['really want taste one  classic pasta dish  rom  felice taste cacio e pepe die   one best roman cusine city  ',
 'love plate savour amaze ambience roof absolutely amazing  come back gracie',
 'one best roman restaurant around cacio pepe abbacchio lamb amaze doesn’t matter get end liking',
 'absolutely live hype come first night time cacio e pepe luxurious extremely high quality also meatballs surprise hit especially dip bread sauce it’s charm placereally enjoy get cacio e pepe won’t regret come stir right table melt cheesei make reservation well advance i’m sure easy would walk get table really want eat make one reservation essential hit place want ask sit patio november still comfortable heat lamp it’s fun sit street therevery helpful staff well ready speak italian try lol speak english help place rome like recommend try italianmore',
 'best food ate rome excellent value bustle modern vibe service little unfriendly side though',
 'much enjoy meal especially tiramisu outstanding ate l