In [1]:
import requests
from bs4 import BeautifulSoup
# Import BeautifulSoup for scraping the data

## Dependencies

`BeautifulSoup` :    pip install beautifulsoup4

`Google colab` :    pip install google-colab   or conda install -c conda-forge google-colab

In [2]:

import pandas as pd
import numpy as np
import itertools #to create efficent looping to fetch more data in a go
import re 
import random 

### Creating BS4 functions for scraping

In [3]:
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2020-01-01,2021-12-31&num_votes=20000,&count=20"

# Only 20 movie listings are requested (count=20) because every listing triggers
# many further page downloads and the run is slow. Once the code works, widen
# the filters in the URL above to fetch more data.
def getSoup(url, timeout=30):
    """
    Utility function: download the page at `url` and parse it with BeautifulSoup.

    url     -- address of the page to fetch.
    timeout -- seconds to wait for the server (passed to requests.get).
               Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns the BeautifulSoup tree built with the built-in 'html.parser'.

    Raises requests.exceptions.RequestException on connection problems,
    timeouts, or a 4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page,
    # which would otherwise show up later as confusing "tag not found" failures.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def getReviews(soup, n_reviews=5):
    '''Return links to a random sample of the user reviews listed on a page.

    soup      -- parsed reviews page of one movie.
    n_reviews -- maximum number of review links to return (default 5).

    Returns a list of absolute review URLs. The sample is drawn without
    replacement, so no link appears twice. If the page lists fewer than
    `n_reviews` reviews (or none at all), a shorter, possibly empty, list is
    returned instead of raising.
    '''
    # Review titles are the <a class="title"> anchors; their href is site-relative.
    review_anchors = soup.find_all('a', attrs={'class': 'title'})

    # Clamp the sample size so random.sample never asks for more items than exist.
    sample_size = max(0, min(n_reviews, len(review_anchors)))
    picked = random.sample(review_anchors, sample_size)
    return ["https://www.imdb.com" + tag['href'] for tag in picked]




def getReviewText(review_url):
    '''Return the user review text for the given review URL.

    The text is read from the first <div class="text show-more__control">
    element of the page. An empty string is returned when that element is
    absent, so one unexpected page does not abort a whole scraping run with
    an AttributeError.
    '''
    # get the review_url's soup
    soup = getSoup(review_url)
    # find the div that holds the review body
    review_div = soup.find('div', attrs={'class': 'text show-more__control'})
    if review_div is None:
        return ''
    return review_div.getText()

def getMovieTitle(review_url):
    '''Return the movie title shown on the page at `review_url`.

    The title is taken from the second child node of the page's first <h1>
    element. An empty string is returned when the page has no <h1> or the
    heading has fewer than two children, rather than raising.
    '''
    # get the review_url's soup
    soup = getSoup(review_url)
    heading = soup.find('h1')
    if heading is None:
        return ''
    heading_children = list(heading.children)
    # The title text is read from child index 1, so at least two children are needed.
    if len(heading_children) < 2:
        return ''
    return heading_children[1].getText()

def getNounChunks(user_review):
    '''Return the noun chunks of `user_review` as a list of plain strings.

    Relies on a module-level `nlp` callable that is not defined in this part
    of the notebook (presumably a loaded spaCy pipeline; confirm before use).
    '''
    parsed = nlp(user_review)
    # Keep only the text of each chunk: span objects won't pickle, strings will.
    return [span.text for span in parsed.noun_chunks]
# Download and parse the search-results page once; the next cell reads the
# movie links from this soup.
movies_soup = getSoup(url)

In [4]:
# Collect every anchor that carries no class attribute.
anchor_tags = movies_soup.find_all('a', attrs={'class': None})

# Keep only hrefs that look like /title/<id>/ . `.get()` tolerates anchors
# that have no href attribute at all, and logical `and` short-circuits the test.
anchor_hrefs = [tag.get('href', '') for tag in anchor_tags]
movie_tags = [href for href in anchor_hrefs
              if href.startswith('/title') and href.endswith('/')]

# remove duplicate links while keeping first-seen order
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 20 movie titles
Displaying 10 titles


['/title/tt2382320/',
 '/title/tt9421570/',
 '/title/tt8110232/',
 '/title/tt7097896/',
 '/title/tt1160419/',
 '/title/tt6264654/',
 '/title/tt3480822/',
 '/title/tt9376612/',
 '/title/tt3811906/',
 '/title/tt10954652/']

In [7]:
base_url = "https://www.imdb.com"
# Every title path already ends with '/', so appending 'reviews' gives the
# address of that movie's user-reviews page.
movie_links = [f"{base_url}{title_path}reviews" for title_path in movie_tags]
print(f"There are a total of {len(movie_links)} movie user reviews")
print("Displaying 20 user reviews links")
movie_links[:20]

There are a total of 20 movie user reviews
Displaying 20 user reviews links


['https://www.imdb.com/title/tt2382320/reviews',
 'https://www.imdb.com/title/tt9421570/reviews',
 'https://www.imdb.com/title/tt8110232/reviews',
 'https://www.imdb.com/title/tt7097896/reviews',
 'https://www.imdb.com/title/tt1160419/reviews',
 'https://www.imdb.com/title/tt6264654/reviews',
 'https://www.imdb.com/title/tt3480822/reviews',
 'https://www.imdb.com/title/tt9376612/reviews',
 'https://www.imdb.com/title/tt3811906/reviews',
 'https://www.imdb.com/title/tt10954652/reviews',
 'https://www.imdb.com/title/tt6334354/reviews',
 'https://www.imdb.com/title/tt3228774/reviews',
 'https://www.imdb.com/title/tt6654210/reviews',
 'https://www.imdb.com/title/tt9243804/reviews',
 'https://www.imdb.com/title/tt9620292/reviews',
 'https://www.imdb.com/title/tt9347730/reviews',
 'https://www.imdb.com/title/tt7737528/reviews',
 'https://www.imdb.com/title/tt5433138/reviews',
 'https://www.imdb.com/title/tt10886166/reviews',
 'https://www.imdb.com/title/tt10309902/reviews']

In [8]:
# Download and parse the reviews page of every movie (one request per link).
movie_soups = [getSoup(link) for link in movie_links]

# Pick review links from each page: getReviews samples 5 links per movie, so
# the 20 movies give 100 links, grouped here as one list per movie.
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]

# The nested list is flattened and previewed in the next cell.

In [9]:

# Flatten the per-movie lists of review links into one flat list.
# The isinstance check keeps this cell safe to re-run: a second pass over an
# already-flat list would otherwise split every URL string into characters.
movie_review_list = list(itertools.chain.from_iterable(
    [entry] if isinstance(entry, str) else entry
    for entry in movie_review_list
))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

100
There are a total of 100 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw7396238/',
 'https://www.imdb.com/review/rw7437543/',
 'https://www.imdb.com/review/rw7423129/',
 'https://www.imdb.com/review/rw7399244/',
 'https://www.imdb.com/review/rw7398815/',
 'https://www.imdb.com/review/rw7350535/',
 'https://www.imdb.com/review/rw7421743/',
 'https://www.imdb.com/review/rw7393947/',
 'https://www.imdb.com/review/rw7425183/',
 'https://www.imdb.com/review/rw7441337/']

In [12]:
# Download the text of every sampled review (one request per review link).
review_texts = list(map(getReviewText, movie_review_list))

# Look up the movie title for each review link (a second request per link).
# NOTE(review): movie_titles is not used in the dataframe below; confirm
# whether a title column was intended.
movie_titles = list(map(getMovieTitle, movie_review_list))

# TODO: label each review as negative or positive (not implemented here).

# Build a single-column dataframe holding the review texts.
df = pd.DataFrame({'user_review': review_texts})

In [13]:
# Preview the first rows (up to 20) of the scraped reviews.
df.head(20)

Unnamed: 0,user_review
0,The story is really engaging. It didn't feel l...
1,"It is clear right from the beginning that ""No ..."
2,"NO TIME TO DIE (2021) ***1/2 Daniel Craig, Lea..."
3,The first 20-25 minutes are superb (from the m...
4,It felt as though we were never going to get t...
5,Director Antoine Fuqua also did his best with ...
6,"""The Guilty"" is a remake of a Danish film of t..."
7,Greetings again from the darkness. Let's start...
8,My first thoughts after watching The Guilty is...
9,30 minutes in and you realize that you do not ...


In [19]:
# Save the review dataframe in CSV format. Note that the file is named
# 'data.txt' and, with the to_csv defaults, the row index is written as the
# first column.
df.to_csv('data.txt') 