In [1]:
# I will use the following code to get the data from the website:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk

In [2]:
# Fetching the webpage
url = 'https://www.imdb.com/title/tt0111161/reviews?ref_=tt_ql_3'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   The Shawshank Redemption (1994) - The Shawshank Redemption (1994) - User Reviews - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/title/tt0111161/reviews" rel="canonical"/>
  <

In [3]:
# Collect the text of every review container found in the parsed page
review_texts = []
for review_div in soup.find_all('div', class_='text show-more__control'):
    review_texts.append(review_div.text)
review_texts

['The Shawshank Redemption is written and directed by Frank Darabont. It is an adaptation of the Stephen King novella Rita Hayworth and Shawshank Redemption. Starring Tim Robbins and Morgan Freeman, the film portrays the story of Andy Dufresne (Robbins), a banker who is sentenced to two life sentences at Shawshank State Prison for apparently murdering his wife and her lover. Andy finds it tough going but finds solace in the friendship he forms with fellow inmate Ellis "Red" Redding (Freeman). While things start to pick up when the warden finds Andy a prison job more befitting his talents as a banker. However, the arrival of another inmate is going to vastly change things for all of them.There was no fanfare or bunting put out for the release of the film back in 94, with a title that didn\'t give much inkling to anyone about what it was about, and with Columbia Pictures unsure how to market it, Shawshank Redemption barely registered at the box office. However, come Academy Award time th

In [4]:
# Wrap the scraped texts in a single-column DataFrame and preview the first rows
data = dict(reviews=review_texts)
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,reviews
0,The Shawshank Redemption is written and direct...
1,It is no wonder that the film has such a high ...
2,I'm trying to save you money; this is the last...
3,This movie is not your ordinary Hollywood flic...
4,"In its Oscar year, Shawshank Redemption (writt..."


In [5]:
# Set up the VADER analyzer used to score the reviews
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon before the analyzer is created
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shreyashgupta/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Score each review's text; every entry of the new 'sentiment' column is the
# dict returned by polarity_scores for that row.
df['sentiment'] = df['reviews'].apply(sid.polarity_scores)
df

Unnamed: 0,reviews,sentiment
0,The Shawshank Redemption is written and direct...,"{'neg': 0.104, 'neu': 0.672, 'pos': 0.224, 'co..."
1,It is no wonder that the film has such a high ...,"{'neg': 0.012, 'neu': 0.802, 'pos': 0.187, 'co..."
2,I'm trying to save you money; this is the last...,"{'neg': 0.056, 'neu': 0.797, 'pos': 0.147, 'co..."
3,This movie is not your ordinary Hollywood flic...,"{'neg': 0.141, 'neu': 0.645, 'pos': 0.214, 'co..."
4,"In its Oscar year, Shawshank Redemption (writt...","{'neg': 0.073, 'neu': 0.729, 'pos': 0.197, 'co..."
5,The best movie in history and the best ending ...,"{'neg': 0.0, 'neu': 0.472, 'pos': 0.528, 'comp..."
6,One of the finest films made in recent years. ...,"{'neg': 0.043, 'neu': 0.763, 'pos': 0.194, 'co..."
7,Misery and Stand By Me were the best adaptatio...,"{'neg': 0.057, 'neu': 0.67, 'pos': 0.273, 'com..."
8,I've lost count of the number of times I have ...,"{'neg': 0.063, 'neu': 0.716, 'pos': 0.221, 'co..."
9,Shawshank Redemption is without doubt one of t...,"{'neg': 0.084, 'neu': 0.634, 'pos': 0.281, 'co..."


In [7]:
# Export df to CSV (disabled; uncomment the next line to write the file)
# df.to_csv('imdb_reviews.csv', index=False)

----

## The code below scraped only about 19 reviews, but it ran without errors, so I am keeping it for now.

In [7]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By  # Import the By class
from bs4 import BeautifulSoup
import time
import pandas as pd  # Import pandas for DataFrame functionality

# Initialize a browser (in this case, Chrome)
driver = webdriver.Chrome()

try:
    # Go to the page
    driver.get('https://www.imdb.com/title/tt0111161/reviews')

    # Scroll down the page: 8 PAGE_DOWN key presses with a 2-second pause after each.
    # NOTE(review): PAGE_DOWN only scrolls the viewport. An earlier run of this
    # cell collected only about 19 reviews, so scrolling does not appear to load
    # additional batches of reviews -- confirm before relying on the count.
    body = driver.find_element(By.TAG_NAME, 'body')
    for _ in range(8):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)

    # Snapshot the rendered page while the browser session is still alive
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() ends the whole WebDriver session (close() only closes the current
    # window), and running it in `finally` releases the browser even when a
    # step above raises.
    driver.quit()

# Use a CSS selector rather than find_all(class_='text show-more__control'):
# a multi-class string passed to find_all must equal the element's class
# attribute exactly, so a div carrying any additional class would be skipped.
# Presumably that is why some reviews were missing from the rendered page's
# results -- TODO confirm against the live markup.
review_texts = [review.text for review in soup.select('div.text.show-more__control')]

# Putting the reviews into a DataFrame
reviews_df = pd.DataFrame(review_texts, columns=['Review'])

# You can now work with `reviews_df` DataFrame for analysis or export
print(reviews_df.head())  # For example, to print the first few reviews

                                              Review
0  The Shawshank Redemption is written and direct...
1  It is no wonder that the film has such a high ...
2  In its Oscar year, Shawshank Redemption (writt...
3  The best movie in history and the best ending ...
4  One of the finest films made in recent years. ...


In [8]:
# Write the scraped reviews to a CSV file, leaving out the DataFrame index
reviews_df.to_csv(path_or_buf='imdb_reviews.csv', index=False)

-----

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Initialize WebDriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)

# One selector shared by the Selenium waits and the BeautifulSoup extraction,
# so both always count the same set of elements.
review_selector = "div.text.show-more__control"
# Stop once at least this many reviews are present on the page.
target_review_count = 200

reviews = []

try:
    # Navigate inside the try block so the browser is still shut down in
    # `finally` if loading the page fails.
    driver.get("https://www.imdb.com/title/tt0111161/reviews")

    while True:
        # Wait for the reviews to be loaded
        wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, review_selector)))
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # page_source holds every review loaded so far, not just the newest
        # batch, so rebuild the list instead of extending it; extending would
        # append the earlier reviews again on every pass.
        reviews = [review.text for review in soup.select(review_selector)]

        # Stop before clicking again once enough reviews are on the page
        if len(reviews) >= target_review_count:
            break

        # Remember how many reviews are present so the wait below can tell
        # when the next batch has actually been added to the page.
        loaded_before = len(driver.find_elements(By.CSS_SELECTOR, review_selector))

        # Find and click the "Load More" button. wait.until raises
        # TimeoutException when the button never becomes clickable, which
        # ends the loop through the except branch below.
        load_more_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "ipl-load-more__button")))
        driver.execute_script("arguments[0].click();", load_more_button)

        # Block until the page contains more reviews than before the click;
        # without this the next pass could re-read the unchanged page.
        wait.until(lambda d: len(d.find_elements(By.CSS_SELECTOR, review_selector)) > loaded_before)

except Exception as e:
    # Best-effort scrape: report why it stopped and keep what was collected
    print(f"Scraping stopped due to: {e}")

finally:
    # Cleanup first, so a failure while saving cannot leave the browser running
    driver.quit()

    # Convert to DataFrame
    reviews_df = pd.DataFrame(reviews, columns=['Review'])
    print(f"Total reviews scraped: {len(reviews_df)}")

    # Save to CSV if needed
    # NOTE(review): absolute path; to_csv raises if /mnt/data does not exist
    # on this machine -- confirm the intended output location.
    reviews_df.to_csv("/mnt/data/imdb_reviews_scraped.csv", index=False)