In [25]:
#importing the libraries and depndables

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
#from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.edge.service import Service as EdgeService
#from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from scrapy.selector import Selector
from transformers import pipeline
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
#setting up a headless web scraping environment using the Microsoft Edge WebDriveroptions 
options.use_chromium = True
options.add_argument("--headless")  # Run in headless mode (no GUI)
driver = webdriver.Edge(options=options)

# Load the initial URL
url = 'https://www.imdb.com/title/tt0381061/reviews?ref_=tt_urv'
driver.get(url)

In [4]:
#function is designed to scroll down a web page and click a "Load More"
def scroll_and_click_load_more():
    body = driver.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys(Keys.PAGE_DOWN)
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'load-more-trigger')))
    load_more_button = driver.find_element(By.ID, 'load-more-trigger')
    ActionChains(driver).move_to_element(load_more_button).perform()
    load_more_button.click()

# Keep clicking "Load More" until no more reviews are loaded
while True:
    try:
        scroll_and_click_load_more()
    except:
        break

# Collect review data 
rating_list = []
review_date_list = []
review_title_list = []
author_list = []
review_list = []
review_url_list = []
error_url_list = []
error_msg_list = []
reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

for d in tqdm(reviews):
    try:
        sel2 = Selector(text=d.get_attribute('innerHTML'))
        try:
            rating = sel2.css('.rating-other-user-rating span::text').extract_first()
        except:
            rating = np.NaN
        try:
            review = sel2.css('.text.show-more__control::text').extract_first()
        except:
            review = np.NaN
        try:
            review_date = sel2.css('.review-date::text').extract_first()
        except:
            review_date = np.NaN    
        try:
            author = sel2.css('.display-name-link a::text').extract_first()
        except:
            author = np.NaN    
        try:
            review_title = sel2.css('a.title::text').extract_first()
        except:
            review_title = np.NaN
        try:
            review_url = sel2.css('a.title::attr(href)').extract_first()
        except:
            review_url = np.NaN
        rating_list.append(rating)
        review_date_list.append(review_date)
        review_title_list.append(review_title)
        author_list.append(author)
        review_list.append(review)
        review_url_list.append(review_url)
    except Exception as e:
        error_url_list.append(url)
        error_msg_list.append(e)

100%|██████████████████████████████████████████████████████████████████████████████| 2606/2606 [00:32<00:00, 79.35it/s]


In [24]:
#creating a dataframe for the reviews 
review_df_CR = pd.DataFrame({
    'Review_Date': review_date_list,
    'Author': author_list,
    'Rating': rating_list,
    'Review_Title': review_title_list,
    'Review': review_list,
    'Review_Url': review_url
})

In [6]:
review_df_CR

Unnamed: 0,Review_Date,Author,Rating,Review_Title,Review,Review_Url
0,6 April 2008,planktonrules,9,"Perfect? Of course not, but it's the closest ...","For many years, I have longed to see a James B...",/review/rw1524814/?ref_=tt_urv
1,16 May 2021,Fella_shibby,9,My fav Bond in one of my fav Bond movie and t...,I first saw this in 2006 with my family in a t...,/review/rw1524814/?ref_=tt_urv
2,5 January 2009,Nazi_Fighter_David,8,"""Do I look like I give a damn?""\n",Anyone who has followed the James Bond series ...,/review/rw1524814/?ref_=tt_urv
3,25 November 2017,ivo-cobra8,10,"The best Bond i have ever seen, Daniel Craig'...",Casino Royale (2006) is without doubt one of t...,/review/rw1524814/?ref_=tt_urv
4,11 December 2006,bob the moo,,"An impressively dark, engaging and exciting e...","Having just achieved his 00 status, James Bond...",/review/rw1524814/?ref_=tt_urv
...,...,...,...,...,...,...
2601,8 May 2018,damianowski,9,Awesome movie!\n,No more or no less. It is a fantastic movie. T...,/review/rw1524814/?ref_=tt_urv
2602,18 January 2019,mariamangion,8,A stunning debut by Daniel Craig as Bond and ...,This is Daniel Craig's debut as James Bond and...,/review/rw1524814/?ref_=tt_urv
2603,8 November 2008,sygilber,10,An Imperfect James Bond\n,"As you can derive from my 10/10 rating above, ...",/review/rw1524814/?ref_=tt_urv
2604,3 December 2006,tjackson0125,10,Bond is simple and clumsy and I love it.\n,This latest version of James Bond is lacking h...,/review/rw1524814/?ref_=tt_urv


Part B: Sentiment Analysis 

In [10]:
# Initialize the sentiment analysis pipeline
sentiment_classifier = pipeline("sentiment-analysis")

# Create a copy of the DataFrame
review_df_CR_copy = review_df_CR.copy()

# Perform sentiment analysis on each review
sentiment_scores = []

# Define the maximum token length based on the model's limit
max_token_length = 512

for review_text in review_df_CR_copy['Review']:
    if len(review_text) <= max_token_length:
        # If the review text is within the token limit, analyze it directly
        sentiment_result = sentiment_classifier(review_text)[0]
        sentiment_label = sentiment_result['label']
        sentiment_scores.append(sentiment_label)
    else:
        # If the review text is too long, split it into smaller segments
        segments = [review_text[i:i+max_token_length] for i in range(0, len(review_text), max_token_length)]
        segment_sentiments = []

        for segment in segments:
            segment_result = sentiment_classifier(segment)[0]
            segment_sentiments.append(segment_result['label'])

        # Calculate the overall sentiment for the entire review based on segments
        overall_sentiment = max(set(segment_sentiments), key=segment_sentiments.count)
        sentiment_scores.append(overall_sentiment)

# Add the sentiment classification column to the DataFrame
review_df_CR_copy['sentiment'] = sentiment_scores

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [11]:
review_df_CR_copy

Unnamed: 0,Review_Date,Author,Rating,Review_Title,Review,Review_Url,sentiment
0,6 April 2008,planktonrules,9,"Perfect? Of course not, but it's the closest ...","For many years, I have longed to see a James B...",/review/rw1524814/?ref_=tt_urv,NEGATIVE
1,16 May 2021,Fella_shibby,9,My fav Bond in one of my fav Bond movie and t...,I first saw this in 2006 with my family in a t...,/review/rw1524814/?ref_=tt_urv,POSITIVE
2,5 January 2009,Nazi_Fighter_David,8,"""Do I look like I give a damn?""\n",Anyone who has followed the James Bond series ...,/review/rw1524814/?ref_=tt_urv,NEGATIVE
3,25 November 2017,ivo-cobra8,10,"The best Bond i have ever seen, Daniel Craig'...",Casino Royale (2006) is without doubt one of t...,/review/rw1524814/?ref_=tt_urv,POSITIVE
4,11 December 2006,bob the moo,,"An impressively dark, engaging and exciting e...","Having just achieved his 00 status, James Bond...",/review/rw1524814/?ref_=tt_urv,NEGATIVE
...,...,...,...,...,...,...,...
2601,8 May 2018,damianowski,9,Awesome movie!\n,No more or no less. It is a fantastic movie. T...,/review/rw1524814/?ref_=tt_urv,POSITIVE
2602,18 January 2019,mariamangion,8,A stunning debut by Daniel Craig as Bond and ...,This is Daniel Craig's debut as James Bond and...,/review/rw1524814/?ref_=tt_urv,POSITIVE
2603,8 November 2008,sygilber,10,An Imperfect James Bond\n,"As you can derive from my 10/10 rating above, ...",/review/rw1524814/?ref_=tt_urv,POSITIVE
2604,3 December 2006,tjackson0125,10,Bond is simple and clumsy and I love it.\n,This latest version of James Bond is lacking h...,/review/rw1524814/?ref_=tt_urv,POSITIVE


In [20]:
#what percentage of the reviews are +ve and -ve
total_sentiments_count = posi[tive_sentiments_count + negative_sentiments_count
positive_sentiments_count = (review_df_CR_copy['sentiment'] == 'POSITIVE').sum()
negative_sentiments_count = (review_df_CR_copy['sentiment'] == 'NEGATIVE').sum()


print(f'Total sentiments count: {total_sentiments_count}')
print(f'Positive sentiments: {positive_sentiments_count} ~ {round((positive_sentiments_count/total_sentiments_count)*100,2)}%')
print(f'Negative sentiments: {negative_sentiments_count} ~ {round((negative_sentiments_count/total_sentiments_count)*100,2)}%')


Total sentiments count: 2606
Positive sentiments: 1555 ~ 59.67%
Negative sentiments: 1051 ~ 40.33%


0          9
1          9
2          8
3         10
4       None
        ... 
2601       9
2602       8
2603      10
2604      10
2605    None
Name: Rating, Length: 2606, dtype: object