In [5]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [7]:
!pip install transformers requests beautifulsoup4 pandas numpy



In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re

In [3]:
# Loading in the pretrained model and its matching tokenizer.
# The checkpoint id lives in one constant so the tokenizer and the model always agree.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

# The tokenizer converts text into the integer token ids that the model takes as input
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [4]:
# Testing the tokenizer: encode a sample sentence and ask for the result as a
# PyTorch tensor (return_tensors='pt') so it can be passed straight to the model
tokens = tokenizer.encode('It was alright. Nothing Special', return_tensors='pt')

In [5]:
# Run the encoded sample sentence through the model; the scores are read from result.logits
result = model(tokens)

In [6]:
# Raw model scores for the sample sentence, one value per sentiment class
result.logits

tensor([[-1.3116,  0.7626,  2.3369,  0.6667, -2.0453]],
       grad_fn=<AddmmBackward>)

In [7]:
'''
Find the class that the model scores highest and convert it into a human-readable
sentiment score. torch.argmax returns a 0-based index, so 1 is added to turn the
index into a score that starts at 1.
'''

int(torch.argmax(result.logits))+1

3

In [8]:
import numpy as np
import pandas as pd

In [10]:
# Testing using Beautiful Soup to scrape reviews for a film off Rotten Tomatoes and storing them in a dataframe
# A timeout is passed because requests.get otherwise waits indefinitely on an unresponsive server.
r = requests.get('https://www.rottentomatoes.com/m/azor/reviews', timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')
# Select the <p> tags whose class matches the "review-text" pattern
regex = re.compile('.*review-text.*')
results = soup.find_all('p', {'class':regex})
reviews = [result.text for result in results]

# One row per review, in a single 'review' column
df = pd.DataFrame(np.array(reviews), columns=['review'])

In [11]:
# Plain-text dump of the scraped reviews
print(df)

                                               review
0   Ultimately it’s a fairly straightforward, slig...
1   As in the worlds of Nicolas Winding Refn or Lu...
2   An ethical story in a context where is no ethi...
3   It's an understated, slow-burn thriller that n...
4   It's one of the year's most cohesive films, an...
5   The story unfolds at a pace that might be too ...
6   If marinating in paranoia is your idea of a go...
7   A suspenseful drama about the woes of the weal...
8   A magnificent lesson on what should be seen on...
9   Conceptually, Azor, is brilliant and its dream...
10  Restraint and alarm are the film's keynotes, w...
11  "Azor" is a film of impeccably polished facade...
12  The feeling of uneasy dread grows throughout t...
13  Fontana's movie gives us a precise account of ...
14  Fontana's sinewy debut teems with unseen threa...
15  It's a fine film, with a bleak message about h...
16  As with the concealing language of the dictato...
17  This superb debut featur

In [20]:
# Testing: show the full text of the first review in the dataframe (row position 0)
df['review'].iloc[0]

'Ultimately it’s a fairly straightforward, slightly underwhelming demonstration of the banality of evil.'

In [21]:
# Turning the sentiment score production into a function
def sentiment_score(review):
    """Return the sentiment score the model predicts for a piece of text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        review: The review text to score.

    Returns:
        int: 1 plus the index of the highest logit, so the lowest class maps to 1.
    """
    # truncation=True caps the input at max_length tokens, so an over-long
    # review is shortened instead of exceeding the model's input size.
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)
    # argmax is 0-based; add 1 to get the score
    return int(torch.argmax(result.logits))+1


In [22]:
# Applying the sentiment score to each review. Only the first 512 characters of each
# review are passed to sentiment_score, which keeps the input short for efficiency.
df['sentiment'] = df['review'].apply(lambda x: sentiment_score(x[:512]))

In [23]:
# Show each review next to its sentiment score
df

Unnamed: 0,review,sentiment
0,"Ultimately it’s a fairly straightforward, slig...",3
1,As in the worlds of Nicolas Winding Refn or Lu...,3
2,An ethical story in a context where is no ethi...,4
3,"It's an understated, slow-burn thriller that n...",2
4,"It's one of the year's most cohesive films, an...",5
5,The story unfolds at a pace that might be too ...,3
6,If marinating in paranoia is your idea of a go...,4
7,A suspenseful drama about the woes of the weal...,4
8,A magnificent lesson on what should be seen on...,5
9,"Conceptually, Azor, is brilliant and its dream...",3


In [24]:
# Calculating the average sentiment score of all the reviews
# (the mean of the 'sentiment' column)
average_sentiment = df['sentiment'].mean()

print("Average Sentiment:", average_sentiment)

Average Sentiment: 3.7


In [29]:
def extract_film_name(url):
    """Derive a display title from the film slug in a URL.

    The slug is the path segment that follows ``/m/``. Underscores become
    spaces and each word is capitalised, so
    ``.../m/memories_of_murder_2003/reviews`` gives ``'Memories Of Murder 2003'``.

    Args:
        url: URL string to inspect.

    Returns:
        The formatted film name, or None when the URL has no ``/m/<slug>`` part.
    """
    # Stop the slug at '/', '?' or '#' so that a query string or fragment
    # placed directly after the slug is not treated as part of the film name.
    match = re.search(r'/m/([^/?#]+)', url)
    if not match:
        return None
    # Replace underscores and use .title() to capitalise the words in the film name
    return match.group(1).replace('_', ' ').title()

# Function that returns the average sentiment for each film given as a URL, sorted highest first
def compare_sentiment_scores(urls, timeout=30):
    """Score the reviews found at each URL and rank the films by average sentiment.

    For every URL that contains a film slug (see ``extract_film_name``), the
    page is downloaded, its review paragraphs are scored with
    ``sentiment_score`` and the scores are averaged.

    Args:
        urls: Iterable of review-page URLs.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(film_name, average_sentiment)`` tuples sorted from the
        highest average to the lowest. A film whose page yields no reviews
        gets an average of NaN and is placed at the end of the list. URLs
        without a film slug are left out.
    """
    average_sentiments = []

    for url in urls:
        film_name = extract_film_name(url)
        if not film_name:
            # No film slug in the URL, so there is no name to report a score under
            continue

        # Collect the reviews from the provided url
        reviews = _scrape_reviews(url, timeout)

        if reviews:
            df = pd.DataFrame({'review': reviews})
            # Only the first 512 characters of each review are scored
            df['sentiment'] = df['review'].apply(lambda x: sentiment_score(x[:512]))
            average_sentiment = df['sentiment'].mean()
        else:
            # Nothing to average; record NaN explicitly
            average_sentiment = float('nan')

        average_sentiments.append((film_name, average_sentiment))

    # Sort for highest sentiment score first. NaN does not order against numbers,
    # so the key ranks films that have a real average ahead of those without one.
    return sorted(
        average_sentiments,
        key=lambda x: (not pd.isna(x[1]), x[1]),
        reverse=True,
    )


def _scrape_reviews(url, timeout):
    """Download ``url`` and return the text of each <p> tag whose class matches 'review-text'."""
    # A timeout is passed because requests.get otherwise waits indefinitely
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    regex = re.compile('.*review-text.*')
    return [tag.text for tag in soup.find_all('p', {'class': regex})]

# Function to extract the sentiment score; compare_sentiment_scores above calls it for each review.
# NOTE: sentiment_score is also defined in an earlier cell; running this cell rebinds the name.
def sentiment_score(review):
    """Return the sentiment score the model predicts for a piece of text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        review: The review text to score.

    Returns:
        int: 1 plus the index of the highest logit, so the lowest class maps to 1.
    """
    # truncation=True caps the input at max_length tokens, so an over-long
    # review is shortened instead of exceeding the model's input size.
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)
    # argmax is 0-based; add 1 to get the score
    return int(torch.argmax(result.logits)) + 1

# Review pages of the films to compare
website_urls = [
    'https://www.rottentomatoes.com/m/azor/reviews',
    'https://www.rottentomatoes.com/m/memories_of_murder_2003/reviews',
    # Add to this later
]

# Pass the list of film URLs to compare_sentiment_scores and keep the returned
# (film name, average sentiment) pairs, which come back sorted highest first.
average_sentiments = compare_sentiment_scores(website_urls)

# Print one line per film
for film_name, avg_sentiment in average_sentiments:
    print(f"Average Sentiment Score for {film_name}: {avg_sentiment}")

Average Sentiment Score for Memories Of Murder 2003: 3.85
Average Sentiment Score for Azor: 3.7
