## Importing the necessary libraries

In [None]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import itertools #to create efficent looping to fetch more data in a go
import re 
import random 
from textblob import TextBlob

## Movie Urls

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:certified_fresh?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:upright~critics:rotten?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled~critics:rotten?page=5

- https://www.rottentomatoes.com/browse/movies_at_home/audience:spilled,upright~critics:rotten?page=5

Here we use `page=5` because Rotten Tomatoes only lets us view 140 movies at a time.

In [None]:
# Listing page that the scrape starts from (audience "upright", critics "fresh").
url = (
    "https://www.rottentomatoes.com/browse/movies_at_home/"
    "audience:upright~critics:fresh?page=5"
)

def getSoup(url, timeout=30):
    """
    Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # NOTE: both header values are placeholders; replace them with real values
    # (or remove 'Authorization') before running.
    headers = {
        'User-Agent': 'Your User-Agent String',
        'Authorization': 'Bearer Your_Authentication_Token'  # Include this if authentication is required
    }
    # requests never times out by default, so a single stalled connection would
    # hang the whole scrape; always bound the wait.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def getReviewText(review_url):
    '''Return the text of the first review paragraph on a parsed review page.

    Despite its name, ``review_url`` is not a URL string: it is the parsed
    page (anything exposing a BeautifulSoup-style ``find``), as returned by
    ``getSoup``.

    Returns the review text, or an empty string when the page contains no
    ``<p class="review-text">`` element.
    '''
    # the review body lives in a <p> tag with class "review-text"
    tag = review_url.find('p', attrs={'class': 'review-text'})
    if tag is None:
        # find() yields None when nothing matches; report "no review" instead
        # of failing with AttributeError on None.
        return ''
    return tag.getText()

def getMovieTitle(review_url):
    '''Return the movie title taken from the <title> tag of a parsed review page.

    Despite its name, ``review_url`` is the parsed page (anything exposing a
    BeautifulSoup-style ``find``), not a URL string.

    Returns the page title with the trailing
    " - Movie Reviews | Rotten Tomatoes" part removed, or an empty string
    when the page has no <title> tag or the tag is empty.
    '''
    # find title tag
    tag = review_url.find('title')
    if tag is None:
        return ''

    # the title text is the first child of the tag; an empty <title> has none
    first_child = next(iter(tag.children), None)
    if first_child is None:
        return ''
    title_tag = first_child.getText()

    # split the title and remove the unnecessary part
    movie_title = title_tag.split(' - Movie Reviews | Rotten Tomatoes')[0]
    return movie_title


def getNounChunks(user_review):
    '''Return the noun chunks of ``user_review`` as a list of plain strings.

    The text is processed by the module-level ``nlp`` callable, which is not
    defined in this part of the file (presumably a loaded spaCy pipeline --
    TODO confirm).
    '''
    parsed = nlp(user_review)
    # each chunk is a span object; keep only its text so callers get ordinary strings
    return [span.text for span in parsed.noun_chunks]

# Filtering the movie tags

In [None]:
# Parse the listing page and gather every anchor matching either selector.
movies_soup = getSoup(url)
movie_tags = [
    *movies_soup.find_all('a', attrs={'data-qa': "discovery-media-list-item-caption"}),
    *movies_soup.find_all('a', attrs={'class': "js-tile-link"}),
]

# Reduce each anchor to its href, then drop repeated hrefs while keeping
# first-seen order (dict keys are unique and ordered).
movie_links = [anchor['href'] for anchor in movie_tags]
unique_movie_links = list(dict.fromkeys(movie_links))

print(f"There are a total of {len(unique_movie_links)} movie titles")
print("Displaying 10 titles")
unique_movie_links[:10]

## Filtering the movie URLs

In [None]:

base_url = "https://www.rottentomatoes.com"
# Build the review-page URL of each movie from the de-duplicated hrefs.
# movie_tags concatenates two find_all() results and can list the same movie
# more than once, so iterating it would download duplicate review pages.
movie_links = [base_url + href + '/reviews' for href in unique_movie_links]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 20 user reviews links")
movie_links[:20]

In [None]:
# Download and parse the review page behind every link (one request per link).
movie_soups = list(map(getSoup, movie_links))
# Pull the review text out of each parsed page.
movie_review_list = list(map(getReviewText, movie_soups))

In [None]:
# Check how many movie reviews were actually extracted.
# Every entry of movie_review_list is already one review string, so it must
# not be flattened with itertools.chain: chaining strings splits them into
# single characters and the count below would be a character count.
# Only empty entries are dropped.
movie_review_list = [review for review in movie_review_list if review]

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
print(movie_review_list[:10])

## Converting into the Pandas Data Frame

In [None]:
# One review text per parsed review page.
review_texts = [getReviewText(page) for page in movie_soups]

# One movie title per parsed review page (printed for inspection only).
movie_titles = [getMovieTitle(page) for page in movie_soups]
print(movie_titles)

# The data frame keeps the review text only; links and titles are left out.
df = pd.DataFrame({'user_review': review_texts})

In [None]:
df.head(5)  # preview the first five rows of the resulting data frame

## Remove the index from the data frame and limit reviews to fewer than 50 words

In [None]:
# Plain Python list holding every review text from the 'user_review' column.
text_list = list(df['user_review'])

In [None]:
# Count the whitespace-separated words of each review and keep the counts
# in a new 'length' column.
text_list_length = [len(review.split()) for review in text_list]
df['length'] = text_list_length
df

In [None]:
# Keep only the reviews shorter than 50 words.
# .copy() turns the filtered result into an independent frame, so the later
# in-place edits (dropping 'length', adding new columns) do not raise
# pandas' SettingWithCopyWarning.
df = df[df['length'] < 50].copy()
df

In [None]:
# The helper column 'length' has served its purpose; remove it in place.
df.drop(columns='length', inplace=True)
df

In [None]:
# Write only the reviews to CSV, without the index column.
import os

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('data_scrapped', exist_ok=True)
df.to_csv('data_scrapped/data_rotten_tomatoes.csv', index=False)

## Splitting the CSV file into individual text files

In [None]:
import csv

# Write every review of the CSV into its own numbered .txt file.
# newline="" is what the csv module expects for files it reads.
with open("data_scrapped/data_rotten_tomatoes.csv", "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    # The first row is the column header written by to_csv, not a review.
    next(reader, None)
    rownumber = 2639    # used to start the naming of the file , change it accordingly
    for row in reader:
        if not row:
            # blank line in the CSV: nothing to write
            continue
        # Same encoding as the input, so non-ASCII reviews survive on every
        # platform; the with-block closes the file even if the write fails.
        with open(str(rownumber) + ".txt", "w", encoding="utf-8") as g:
            # row is a list of fields; write the review text itself rather
            # than the list's repr ("['...']").
            g.write(row[0])
        rownumber = rownumber + 1

In [None]:
def analyze_sentiment(text):
    """
    Classify ``text`` by the sign of its TextBlob sentiment polarity.

    Returns:
    - 'positive' when the polarity is above zero
    - 'negative' when the polarity is below zero
    - 'neutral' when the polarity is exactly zero
    """
    score = TextBlob(text).sentiment.polarity

    # guard clauses: the sign of the score decides the label
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# df is expected to be the DataFrame of reviews; label each review with its
# sentiment class in a new 'sentiment' column.
df['sentiment'] = [analyze_sentiment(review) for review in df['user_review']]


In [None]:
# display the data frame
df