## Importing necessary Imports

In [21]:
pip install textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting nltk>=3.8 (from textblob)
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk>=3.8->textblob)
  Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------  41.0/42.0 kB ? eta -:--:--
     ---------------------------------------  41.0/42.0 kB ? eta -:--:--
     -------------------------------------- 42.0/42.0 kB 406.8 kB/s eta 0:00:00
Collecting tqdm (from nltk>=3.8->textblob)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ----------------------------------- ---- 51.2/57.6 kB ? eta -:--:--
     -------------------------------------- 57.6/57.6 kB 606.6 kB/s eta 0:00:00
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)


[notice] A new release of pip is available: 23.3.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import itertools #to create efficent looping to fetch more data in a go
import re 
import random 
from textblob import TextBlob

## Movie Urls

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:rotten?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:rotten?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:rotten?page=5

Here we use page=5 as rottentomatoes will only allow us to check 140 movies at a time.

In [48]:
url = "https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:fresh?page=5"

def getSoup(url):
    """
    Utility function this get soup function will fetch the above url which stored in url var.
    """
    headers = {
        'User-Agent': 'Your User-Agent String',
        'Authorization': 'Bearer Your_Authentication_Token'  # Include this if authentication is required
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def getReviewText(review_url):
    '''Returns the user review text given the review soup.'''
    tag = review_url.find('p', attrs={'class': 'review-text'})  # Use select_one for efficient CSS selector
    if tag:
        return tag.get_text(strip=True)  # Use strip=True to remove extra whitespace
    return None  # Handle case where review text is not found

def getMovieTitle(review_url):
    '''Returns the movie title from the review soup.'''
    tag = review_url.find('title')
    if tag:
        title_tag = list(tag.children)[0].get_text()
        movie_title = title_tag.split(' - Movie Reviews | Rotten Tomatoes')[0]
        return movie_title
    return None  # Handle case where title is not found


def getNounChunks(user_review):
    # create the doc object
    doc = nlp(user_review)
    # get a list of noun_chunks
    noun_chunks = list(doc.noun_chunks)
    # convert noun_chunks from span objects to strings, otherwise it won't pick
    noun_chunks_strlist = [chunk.text for chunk in noun_chunks]
    return noun_chunks_strlist

# Filtering the movie tags

In [49]:
movies_soup = getSoup(url)
movie_tags = movies_soup.find_all('a', attrs={'data-qa': "discovery-media-list-item-caption"}) + movies_soup.find_all('a', attrs={'class': "js-tile-link"})

# filter the a-tags to get just the titles
movie_links = [tag['href'] for tag in movie_tags]
# remove duplicate links
unique_movie_links = list(set(tag['href'] for tag in movie_tags))

print("There are a total of " + str(len(unique_movie_links)) + " movie titles")
print("Displaying 10 titles")
unique_movie_links[:10]

There are a total of 140 movie titles
Displaying 10 titles


['/m/the_boy_and_the_heron',
 '/m/ferrari_2023',
 '/m/civil_war_2024',
 '/m/furiosa_a_mad_max_saga',
 '/m/the_witch_2016',
 '/m/may_december',
 '/m/dream_scenario',
 '/m/a_simple_favor',
 '/m/the_beast_2023',
 '/m/brats_2024']

## Filtering the movie URL's

In [50]:

base_url = "https://www.rottentomatoes.com"
movie_links = [base_url + tag['href'] + '/reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 20 user reviews links")
movie_links[:20]

There are a total of 140 movie user reviews
Displaying 20 user reviews links


['https://www.rottentomatoes.com/m/the_bikeriders/reviews',
 'https://www.rottentomatoes.com/m/kingdom_of_the_planet_of_the_apes/reviews',
 'https://www.rottentomatoes.com/m/dragonkeeper/reviews',
 'https://www.rottentomatoes.com/m/cora_bora/reviews',
 'https://www.rottentomatoes.com/m/beverly_hills_cop_axel_f/reviews',
 'https://www.rottentomatoes.com/m/hit_man_2023/reviews',
 'https://www.rottentomatoes.com/m/the_imaginary/reviews',
 'https://www.rottentomatoes.com/m/furiosa_a_mad_max_saga/reviews',
 'https://www.rottentomatoes.com/m/a_quiet_place_2018/reviews',
 'https://www.rottentomatoes.com/m/the_beekeeper_2024/reviews',
 'https://www.rottentomatoes.com/m/the_fall_guy_2024/reviews',
 'https://www.rottentomatoes.com/m/civil_war_2024/reviews',
 'https://www.rottentomatoes.com/m/pearl_2022/reviews',
 'https://www.rottentomatoes.com/m/late_night_with_the_devil/reviews',
 'https://www.rottentomatoes.com/m/x_2022/reviews',
 'https://www.rottentomatoes.com/m/new_life_2023/reviews',
 'ht

In [51]:
movie_soups = [getSoup(link) for link in movie_links]
# get all movie review links from the 140 listing
movie_review_list = [getReviewText(movie_soup) for movie_soup in movie_soups]

In [52]:
#Checking how many movie review were able to filter.
movie_review_list = list(itertools.chain(*movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
print(movie_review_list[:10])

There are a total of 22011 individual movie reviews
Displaying 10 reviews
['M', 'o', 's', 't', ' ', 'c', 'o', 'n', 't', 'e']


## Converting into the Pandas Data Frame

In [38]:
review_texts = [getReviewText(url) for url in movie_soups]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_soups]
print(movie_titles)

# Filtering the dataframe with only User_reviews by avoiding links and title

# construct a dataframe
df = pd.DataFrame({'user_review': review_texts })

['The Bikeriders', 'Kingdom of the Planet of the Apes', 'Dragonkeeper', 'Cora Bora', 'Beverly Hills Cop: Axel F', 'Hit Man', 'The Imaginary', 'Furiosa: A Mad Max Saga', 'A Quiet Place', 'The Beekeeper', 'The Fall Guy', 'Civil War', 'Pearl', 'Late Night with the Devil', 'X', 'New Life', 'Challengers', 'Monkey Man', 'I Saw the TV Glow', 'The Ministry of Ungentlemanly Warfare', 'The Long Game', 'Inside Out', 'Fancy Dance', "The Devil's Bath", 'A Quiet Place Part II', 'Abigail', 'Godzilla Minus One', 'The First Omen', 'Beverly Hills Cop', 'Talk to Me', 'Babes', 'Aftersun', 'Fresh Kills', 'Poor Things', 'Gladiator', 'The Idea of You', 'Infested', 'Smile', 'Dune: Part Two', 'Ultraman: Rising', 'The Killer', 'Immaculate', 'Barbarian', 'The Iron Claw', 'Princess Mononoke', 'Kung Fu Panda 4', 'Oppenheimer', 'Land of Bad', 'The Beast', 'Dark Waters', 'All of Us Strangers', 'The Last Stop in Yuma County', 'His House', 'The Boy and the Heron', 'Hereditary', 'Minions: The Rise of Gru', 'Love Lies B

In [39]:
df.head(5) #displaying the resultant data frame

Unnamed: 0,user_review
0,Most contemporary westerns end up mourning a v...
1,You can nitpick some of Kingdom of the Planet ...
2,"Perhaps not surprisingly, given that the book ..."
3,"Statler, in a turn that lends a darker... is a..."
4,"Alas, forty years have passed, but the series ..."


## The data frame need to remove index and filter the limit review length by 50 words

In [40]:
text_list = [m for m in df['user_review']]
#text_list

In [41]:
#calculating the length of the text
text_list_length = [len(m.split()) for m in text_list]     
df['length'] = text_list_length
df

Unnamed: 0,user_review,length
0,Most contemporary westerns end up mourning a v...,37
1,You can nitpick some of Kingdom of the Planet ...,26
2,"Perhaps not surprisingly, given that the book ...",26
3,"Statler, in a turn that lends a darker... is a...",37
4,"Alas, forty years have passed, but the series ...",20
...,...,...
135,Rise of the Planet of the Apes has everything ...,19
136,"A smart and entertaining movie, not an importa...",9
137,The crux of this excellent thriller is the bat...,21
138,It is a beautiful example of what happens when...,21


In [42]:
df = df[df['length'] < 50]  #limiting the df by 50 in length
df

Unnamed: 0,user_review,length
0,Most contemporary westerns end up mourning a v...,37
1,You can nitpick some of Kingdom of the Planet ...,26
2,"Perhaps not surprisingly, given that the book ...",26
3,"Statler, in a turn that lends a darker... is a...",37
4,"Alas, forty years have passed, but the series ...",20
...,...,...
135,Rise of the Planet of the Apes has everything ...,19
136,"A smart and entertaining movie, not an importa...",9
137,The crux of this excellent thriller is the bat...,21
138,It is a beautiful example of what happens when...,21


In [43]:
df.drop('length', axis=1, inplace=True)
df
#dropping the len row

Unnamed: 0,user_review
0,Most contemporary westerns end up mourning a v...
1,You can nitpick some of Kingdom of the Planet ...
2,"Perhaps not surprisingly, given that the book ..."
3,"Statler, in a turn that lends a darker... is a..."
4,"Alas, forty years have passed, but the series ..."
...,...
135,Rise of the Planet of the Apes has everything ...
136,"A smart and entertaining movie, not an importa..."
137,The crux of this excellent thriller is the bat...
138,It is a beautiful example of what happens when...


In [44]:
import os

# Ensure the directory exists
os.makedirs('data_scrapped', exist_ok=True)

# Save the DataFrame to CSV
df.to_csv('data_scrapped/data_rotten_tomatoes.csv', index=False)

## Splitting the csv file to the indivitual text files

In [45]:
import csv

with open("data_scrapped/data_rotten_tomatoes.csv", "r",encoding="utf-8") as f:
        reader = csv.reader(f)
        rownumber = 2639    # used to start the naming of the file , change it accordingly 
        for row in reader:
             g=open(str(rownumber)+".txt","w")
             g.write(str(row))
             rownumber = rownumber + 1
             g.close()

In [46]:
def analyze_sentiment(text):
    """
    Analyzes the sentiment of the input text.
    
    Returns:
    - 'positive' if sentiment polarity > 0
    - 'negative' if sentiment polarity < 0
    - 'neutral' if sentiment polarity == 0
    """
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

# Assuming df is your DataFrame containing the reviews
df['sentiment'] = df['user_review'].apply(analyze_sentiment)


In [47]:
df

Unnamed: 0,user_review,sentiment
0,Most contemporary westerns end up mourning a v...,positive
1,You can nitpick some of Kingdom of the Planet ...,neutral
2,"Perhaps not surprisingly, given that the book ...",negative
3,"Statler, in a turn that lends a darker... is a...",positive
4,"Alas, forty years have passed, but the series ...",negative
...,...,...
135,Rise of the Planet of the Apes has everything ...,positive
136,"A smart and entertaining movie, not an importa...",positive
137,The crux of this excellent thriller is the bat...,positive
138,It is a beautiful example of what happens when...,positive
