In [1]:
# dependencies
import pandas as pd
import time
from datetime import date
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# export dataframe of results with today's date
today = date.today()
d_today = today.strftime("%Y_%m_%d")

## Preparation for Scrape Function

In [3]:
top_100df = pd.read_csv("output_data/Top100_Movies_2010-2021.csv")

In [4]:
# create list of years
years = top_100df["Year"].unique().tolist()
print(years)

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
# use a dictionary to create multiple empty lists to store each  year's top 100 movies
obj = {}
for year in years:
    obj[year] = []
print(obj)

{2010: [], 2011: [], 2012: [], 2013: [], 2014: [], 2015: [], 2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: []}


In [6]:
# set the year of movies we are scraping data for; each year is done individually for quality control
scrape_year = 2018

In [7]:
# fill dictionary with lists of the top 100 movies per year, display a sample year
for year in obj:
    top_100 = top_100df.loc[top_100df["Year"] == year]
    obj[year] = top_100["Release"].tolist()
print(obj[scrape_year])

['Black Panther', 'Avengers: Infinity War', 'Incredibles 2', 'Jurassic World: Fallen Kingdom', 'Deadpool 2', 'The Grinch', 'Jumanji: Welcome to the Jungle', 'Mission: Impossible - Fallout', 'Ant-Man and the Wasp', 'Solo: A Star Wars Story', 'Venom', 'A Star Is Born', 'Aquaman', 'Bohemian Rhapsody', 'A Quiet Place', 'Ralph Breaks the Internet', 'Crazy Rich Asians', 'Hotel Transylvania 3: Summer Vacation', 'Halloween', 'Fantastic Beasts: The Crimes of Grindelwald', 'The Meg', "Ocean's Eight", 'Ready Player One', 'The Greatest Showman', 'Mamma Mia! Here We Go Again', 'The Nun', 'Peter Rabbit', 'Creed II', 'Spider-Man: Into the Spider-Verse', 'Mary Poppins Returns', 'Star Wars: Episode VIII - The Last Jedi', 'The Equalizer 2', 'Rampage', 'A Wrinkle in Time', 'Fifty Shades Freed', 'Christopher Robin', 'I Can Only Imagine', 'Smallfoot', 'The Post', 'Night School', 'Bumblebee', 'The First Purge', 'Game Night', 'Book Club', 'The House with a Clock in Its Walls', 'Skyscraper', 'Insidious: The L

## Scrape Function Executed

In [8]:
# setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [9]:
url = "https://www.imcdb.org/"
browser.visit(url)

In [10]:
def car_mov(titles):
    movie_dict = {}
    for title in titles:
        movie_dict[title] = []
    for title in titles:
                
        print(f"Processing {title}...")
        
        browser.find_by_css('input').fill(title)
        time.sleep(1)
        browser.find_by_value('Search').click()
        time.sleep(5)
    
        html = browser.html
        soup = bs(html, 'html.parser')
    
        results = soup.find_all('div', class_='ThumbnailBox')
        if len(results) != 0:
            good = 1
        else:
            good = 0
            try:
                browser.links.find_by_text(title).click()
                time.sleep(5)
            except:
                print("No data for the movie.")
                good = 0
                continue
        
        if good == 1:
            good2 = 1
        else:
            html = browser.html
            soup = bs(html, 'html.parser')
            results = soup.find_all('div', class_='ThumbnailBox')
            if len(results) != 0:
                good2 = 1
            else:
                good2 = 0
                print("Not good at all: no car data.")
                continue
        
        if good2 == 1:
            print("Car data collected.")
            count = 0
            for result in results:
                try:
                    x = result.find('span', class_='Stars')
                    y = len(x)
                    z = result.find('a')['href']
                    info_string = result.text
                    count += 1
                    movie_dict[title].append([count, info_string, z, y])
                except:
                    y = 'Nan'
                    z = result.find('a')['href']
                    info_string = result.text
                    count += 1
                    movie_dict[title].append([count, info_string, z, y])
        else:
            pass
    browser.quit()
    print("Scraping completed.")
    return movie_dict

In [11]:
# create our dictionary of car data
top_year = obj[scrape_year]
top_yearcars = car_mov(top_year)

Processing Black Panther...
Car data collected.
Processing Avengers: Infinity War...
Car data collected.
Processing Incredibles 2...
Car data collected.
Processing Jurassic World: Fallen Kingdom...
Car data collected.
Processing Deadpool 2...
Car data collected.
Processing The Grinch...
No data for the movie.
Processing Jumanji: Welcome to the Jungle...
Car data collected.
Processing Mission: Impossible - Fallout...
Car data collected.
Processing Ant-Man and the Wasp...
Car data collected.
Processing Solo: A Star Wars Story...
No data for the movie.
Processing Venom...
Car data collected.
Processing A Star Is Born...
Car data collected.
Processing Aquaman...
Car data collected.
Processing Bohemian Rhapsody...
Car data collected.
Processing A Quiet Place...
Car data collected.
Processing Ralph Breaks the Internet...
Car data collected.
Processing Crazy Rich Asians...
Car data collected.
Processing Hotel Transylvania 3: Summer Vacation...
No data for the movie.
Processing Halloween...
Ca

In [None]:
# verification
print(top_yearcars)

## Export/Import Scraped Data for backup

In [12]:
print(len(top_yearcars))

100


In [13]:
# writing dictionary to text file
try:
    file_dict = open(f"top_{scrape_year}cars.txt", 'wt')
    file_dict.write(str(top_yearcars))
    file_dict.close()
except: 
    print("Unable to write to file")

Unable to write to file


In [14]:
# Unable to write a file? Try this instead, otherwise skip this
file_dict = open(f"top_{scrape_year}cars.txt", 'wt', encoding="utf-8")
file_dict.write(str(top_yearcars))
file_dict.close()

In [15]:
import ast

In [16]:
# read dictionary from text file back
file = open(f"top_{scrape_year}cars.txt", "r")
contents = file.read()
data = ast.literal_eval(contents)
file.close()

## Scrape Missing Data Manually

In [17]:
# setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [18]:
url = "https://www.imcdb.org/"
browser.visit(url)

### Restart Manual Scrape Here:

In [19]:
# function to scrape data for a single film
def cars_manual(movie):
    print(f"Processing {movie}...")
    missing = f"{movie}"
    miss_dict = {}
    miss_dict[missing] = []

    html = browser.html
    soup = bs(html, 'html.parser')
    
    results = soup.find_all('div', class_='ThumbnailBox')
    if len(results) != 0:
        good = 1
    else:
        good = 0
        try:
            browser.links.find_by_text(missing).click()
            time.sleep(5)
        except:
            print("No data for the movie.")
            good = 0
        
    if good == 1:
        good2 = 1
    else:
        html = browser.html
        soup = bs(html, 'html.parser')
        results = soup.find_all('div', class_='ThumbnailBox')
        if len(results) != 0:
            good2 = 1
        else:
            good2 = 0
            print("Not good at all: no car data.")

    if good2 == 1:
        print("Data collected.")
        count = 0
        for result in results:
            try:
                x = result.find('span', class_='Stars')
                y = len(x)
                z = result.find('a')['href']
                info_string = result.text
                count += 1
                miss_dict[missing].append([count, info_string, z, y])
            except:
                y = 'Nan'
                z = result.find('a')['href']
                info_string = result.text
                count += 1
                miss_dict[missing].append([count, info_string, z, y])
    else:
        pass
    return miss_dict

In [20]:
# function to delete a single film's car data from our dictionary
def delete_data(movie):
    # remove data if no substitute can be manually found
    del top_yearcars[movie]

In [21]:
# while loop to manually update or delete films that were not scraped properly
answer = "yes"
while answer == "yes":
    decision = input("Do we want to update or delete data?: ").lower()
    if decision == "update":
        title = input("Title of the movie (exact upper and lower cases):")
        new_dict = cars_manual(title)
        # use update method to update value of errant key/title data
        top_yearcars.update(new_dict)
        # verification
        print(f"First car: {top_yearcars[title][0]}")
        print(f"Last car: {top_yearcars[title][-1]}")
        # outgoing exit option
        answer = input("Do we still have a movie to update? (yes/no): ").lower()
    elif decision == "delete":
        title = input("Title of the movie (exact upper and lower cases):")
        delete_data(title)
        answer = input("Do we still have a movie to update? (yes/no): ").lower()
    else:
        answer = "no"
print("Manual updates completed.")

Do we want to update or delete data?: delete
Title of the movie (exact upper and lower cases):The Grinch
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Venom
Processing Venom...
Data collected.
First car: [1, '1999 Acura Integra ', 'vehicle_1136076-Acura-Integra-DC4-1999.html', 1]
Last car: [77, '2001 Volvo S60 ', 'vehicle_1222737-Volvo-S60-2001.html', 2]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):A Star Is Born
Processing A Star Is Born...
Data collected.
First car: [1, '2014 BMW X5 [F15] ', 'vehicle_1229541-BMW-X5-F15-2014.html', 1]
Last car: [8, 'Toyota Prius II ', 'vehicle_1229540-Toyota-Prius-NHW20.html', 2]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Halloween
Processing Halloween...
Dat

In [22]:
browser.quit()

## Export Data as CSV

In [None]:
print(top_yearcars)

In [23]:
# create dataframe of data for each vehicle, with movie and movie year included
title = []
year = []
auto = []
link = []
stars = []
for movie in top_year:
    try:
        cars = top_yearcars[movie]
        for car in cars:
            title.append(movie)
            year.append(scrape_year)
            auto.append(car[1])
            link.append(car[2])
            stars.append(car[3])
    except:
        print(f"No cars for the movie, {movie}.")

No cars for the movie, The Grinch.
No cars for the movie, A Wrinkle in Time.
No cars for the movie, Robin Hood.
No cars for the movie, Hostiles.


In [24]:
top_year_df = pd.DataFrame({"Release": title, "Year": year, "Car": auto, "url": link, "Stars": stars})

In [25]:
top_year_df

Unnamed: 0,Release,Year,Car,url,Stars
0,Black Panther,2018,2016 Bentley Continental GTC,vehicle_1115905-Bentley-Continental-GTC-2016.html,1
1,Black Panther,2018,2012 BMW 3 [F30],vehicle_1140467-BMW-3-F30-2012.html,1
2,Black Panther,2018,1998 Chevrolet Prizm,vehicle_1140476-Chevrolet-Prizm-E110-1998.html,1
3,Black Panther,2018,2011 Chevrolet Spark,vehicle_1140466-Chevrolet-Spark-M300-2011.html,1
4,Black Panther,2018,1992 Daewoo LeMans,vehicle_1115893-Daewoo-LeMans-1992.html,1
...,...,...,...,...,...
2445,The Hate U Give,2018,2012 Nissan Maxima,vehicle_1227836-Nissan-Maxima-A35-2012.html,1
2446,The Hate U Give,2018,2000 Pontiac Bonneville,vehicle_1227839-Pontiac-Bonneville-2000.html,1
2447,The Hate U Give,2018,2007 Pontiac Solstice GXP,vehicle_1227841-Pontiac-Solstice-GXP-GMX020-20...,2
2448,The Hate U Give,2018,2003 Toyota Avalon,vehicle_1227828-Toyota-Avalon-MCX20-2003.html,1


In [26]:
#export as csv for use elsewhere
top_year_df.to_csv(f"output_data/{d_today}_{scrape_year}Cars.csv", index=False, header=True)