In [7]:
import urllib3
import certifi
import re
import pandas as pd
import pymongo
import json

### Get connection string from file for connecting to MongoDB

In [8]:
# Read the MongoDB connection string from a local JSON file.
# The context manager closes the file even when parsing fails or the
# 'mongodb' key is missing, so the handle can no longer leak.
# NOTE(review): the path is absolute and user-specific - consider making it configurable.
with open('C:\\Users\\stuar\\Downloads\\MongoDBConnection.json') as json_file:
    #Grab connection string
    mongo_connection_string = json.load(json_file)['mongodb']

### Connect to MongoDB, drop existing collection, and establish connection for interaction

In [9]:
# Open a MongoDB client using the connection string loaded earlier
client = pymongo.MongoClient(mongo_connection_string)

# Select the 'metacritic' database, then the 'movies' collection inside it
da320_database = client.get_database('metacritic')
metacritic_data = da320_database.get_collection('movies')

# Drop the collection so the inserts further down start from an empty collection
metacritic_data.drop()

### Prepare expressions and http connection

In [10]:
#Construct all regular expressions
# Raw strings are used so that regex escapes such as \s and \/ reach the regex
# engine untouched instead of being treated as (invalid) string escapes, which
# newer Python versions warn about. Each pattern has exactly one capture group,
# so findall() returns a flat list of strings.
title_formula = r'><h3>(.*)<\/h'                                     #Title
date_formula = r's">\s+<span>(.*)<'                                  #Release Date
description_formula = r'<div class="summary">\s*([\S\s]+?)\s*</div>' #Description
score_formula = r'>(.*)<\/div>\s+<\/a>\s+<\/div>\s+<s'               #Score
image_formula = r'e-wrap.*\n.*<img src="(.*)" a'                     #Thumbnail URL

#Compile the expressions
title_regex = re.compile(title_formula)
date_regex = re.compile(date_formula)
description_regex = re.compile(description_formula)
score_regex = re.compile(score_formula)
image_regex = re.compile(image_formula)

# Shared urllib3 connection pool used for every page request;
# certifi supplies the CA bundle path passed as ca_certs
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
)


### Function for scraping Metacritic

In [11]:

def metacritic_scraper(year: int, page: int) -> pd.DataFrame:
    """Scrape one page of Metacritic's by-year movie listing into a DataFrame.

    Fetches the listing page through the module-level ``http`` pool and applies
    the precompiled module-level regular expressions to the returned HTML.

    Args:
        year: Value substituted into the ``year_selected`` query parameter.
        page: Value substituted into the ``page`` query parameter.

    Returns:
        A DataFrame with the columns ``title``, ``date``, ``description``,
        ``score`` and ``image``, one row per match. It has zero rows when none
        of the expressions match anything in the page.

    Raises:
        ValueError: If the expressions do not all return the same number of
            matches; the message lists the count found for each field.
    """
    #Fetch the webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&page={page}"
    response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
    # NOTE(review): the HTTP status is not checked here, so an error page that
    # matches nothing would look the same as an empty results page - confirm
    # whether that is acceptable for the caller's stop condition.
    datastring = str(response.data, "utf-8")

    #Execute all the regular expressions
    dataset = {
        "title": title_regex.findall(datastring),
        "date": date_regex.findall(datastring),
        "description": description_regex.findall(datastring),
        "score": score_regex.findall(datastring),
        "image": image_regex.findall(datastring),
    }

    #Fail with a readable message when the fields do not line up row-for-row
    counts = {field: len(matches) for field, matches in dataset.items()}
    if len(set(counts.values())) > 1:
        raise ValueError(
            f"Mismatched match counts for year {year} page {page}: {counts}"
        )

    #Return a unified collection
    return pd.DataFrame(dataset)



### Gather the following fields from the movie data on the Metacritic website:
- Title
- Release Date
- Description
- Metascore
- Thumbnail URL

In [12]:
import re 
import time

# Scrape every results page for each year and insert the movies into MongoDB
for year in range(2000, 2023):
    page = 0
    # {page} is interpolated here; the earlier message printed the literal text "(page)"
    print(f"Collecting data for {year} page {page}...")

    #Get first page
    data = metacritic_scraper(year, page)

    #Continue until we reach a page with zero rows
    while len(data) > 0:
        #Convert the dataframe into a list of movie documents,
        #mapping the scraper's column names onto the MongoDB field names
        movies_to_insert = [
            {
                "title": row.title,
                "release_date": row.date,
                "description": row.description,
                "metascore": row.score,
                "image_url": row.image,
            }
            for row in data.itertuples()
        ]

        #Insert records into MongoDB
        print(f"Inserting {len(movies_to_insert)} movies for the year {year} page {page} ")
        metacritic_data.insert_many(movies_to_insert)

        #Get next page
        page = page + 1
        data = metacritic_scraper(year, page)


Collecting data for 2000 page (page)...
Inserting 100 movies for the year 2000 page 0 
Inserting 100 movies for the year 2000 page 1 
Inserting 100 movies for the year 2000 page 2 
Inserting 65 movies for the year 2000 page 3 
Collecting data for 2001 page (page)...
Inserting 100 movies for the year 2001 page 0 
Inserting 100 movies for the year 2001 page 1 
Inserting 100 movies for the year 2001 page 2 
Inserting 85 movies for the year 2001 page 3 
Collecting data for 2002 page (page)...
Inserting 100 movies for the year 2002 page 0 
Inserting 100 movies for the year 2002 page 1 
Inserting 100 movies for the year 2002 page 2 
Inserting 100 movies for the year 2002 page 3 
Inserting 30 movies for the year 2002 page 4 
Collecting data for 2003 page (page)...
Inserting 100 movies for the year 2003 page 0 
Inserting 100 movies for the year 2003 page 1 
Inserting 100 movies for the year 2003 page 2 
Inserting 100 movies for the year 2003 page 3 
Inserting 9 movies for the year 2003 page 4 