In [1]:
# dependencies
import pandas as pd
import time
from datetime import date
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# date stamp (YYYY_MM_DD) that is embedded in the name of the exported CSV
today = date.today()
d_today = f"{today:%Y_%m_%d}"

## Preparation for Scrape Function

In [3]:
# load the table of yearly top-100 movies; its "Year" and "Release" columns are used below
top_100_csv = "output_data/Top100_Movies_2010-2021.csv"
top_100df = pd.read_csv(top_100_csv)

In [4]:
# distinct values of the "Year" column, as a plain Python list
year_column = top_100df["Year"]
years = year_column.unique().tolist()
print(years)

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
# one empty list per year; each list is filled with that year's top 100 movies further down
obj = {year: [] for year in years}
print(obj)

{2010: [], 2011: [], 2012: [], 2013: [], 2014: [], 2015: [], 2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: []}


In [6]:
# year whose top-100 movies are scraped in this run; each year is done individually for quality control
scrape_year = 2015

In [7]:
# replace each year's empty list with that year's movie titles, then display the year being scraped
year_column = top_100df["Year"]
for year in obj:
    top_100 = top_100df[year_column == year]
    obj[year] = top_100["Release"].tolist()
print(obj[scrape_year])

['Jurassic World', 'Star Wars: Episode VII - The Force Awakens', 'Avengers: Age of Ultron', 'Inside Out', 'Furious 7', 'American Sniper', 'Minions', 'The Hunger Games: Mockingjay - Part 2', 'The Martian', 'Cinderella', 'Spectre', 'Mission: Impossible - Rogue Nation', 'Pitch Perfect 2', 'Ant-Man', 'Home', 'Hotel Transylvania 2', 'Fifty Shades of Grey', 'The SpongeBob Movie: Sponge Out of Water', 'Straight Outta Compton', 'San Andreas', 'Mad Max: Fury Road', 'The Divergent Series: Insurgent', 'Kingsman: The Secret Service', 'The Peanuts Movie', 'Spy', 'The Good Dinosaur', 'Trainwreck', 'Creed', 'Tomorrowland', 'Get Hard', 'Terminator Genisys', 'Taken 3', 'Maze Runner: The Scorch Trials', 'Ted 2', 'Goosebumps', 'Pixels', 'Paddington', 'The Intern', 'The Imitation Game', 'Paul Blart: Mall Cop 2', 'Bridge of Spies', 'War Room', 'Magic Mike XXL', 'The Hobbit: The Battle of the Five Armies', 'The Visit', "Daddy's Home", 'The Wedding Ringer', 'Into the Woods', 'Black Mass', 'Vacation', 'The Pe

## Scrape Function Executed

In [8]:
# setup splinter: install/locate a matching chromedriver and open a visible (non-headless) Chrome window
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/88.0.4324.96/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96]


In [9]:
# open the IMCDb home page; car_mov() fills the page's `input` element and clicks its `Search` button
url = "https://www.imcdb.org/"
browser.visit(url)

In [10]:
def car_mov(titles):
    """Search IMCDb for every title and collect the vehicles listed for it.

    Uses the notebook-level ``browser`` (splinter), ``bs`` (BeautifulSoup) and
    ``time``. For each title the search box is filled and submitted; if the
    resulting page has no ``ThumbnailBox`` elements, the link whose text is
    exactly the title is followed and that page is parsed instead.

    Parameters
    ----------
    titles : iterable of str
        Movie titles to look up. Every title gets a key in the result, even
        when nothing is found for it.

    Returns
    -------
    dict
        ``{title: [[position, text, href, stars], ...]}`` where ``position``
        counts from 1, ``text`` is the thumbnail's text, ``href`` is the first
        link's target (``None`` if the thumbnail has no link/href) and
        ``stars`` is ``len()`` of the ``Stars`` span (``'Nan'`` if absent).

    Notes
    -----
    The browser is always closed on exit, including when an error or a
    keyboard interrupt aborts the scrape, so no Chrome window is left behind.
    """
    movie_dict = {title: [] for title in titles}
    try:
        for title in titles:
            print(f"Processing {title}...")

            browser.find_by_css('input').fill(title)
            time.sleep(1)
            browser.find_by_value('Search').click()
            time.sleep(5)

            results = bs(browser.html, 'html.parser').find_all('div', class_='ThumbnailBox')
            if not results:
                # No thumbnails here (presumably a list of matching titles):
                # try the link whose text is exactly the title.
                try:
                    browser.links.find_by_text(title).click()
                except Exception:
                    # Best effort: a missing link just means this title is skipped.
                    # `Exception` (not a bare except) lets Ctrl-C stop the scrape.
                    print("No data for the movie.")
                    continue
                time.sleep(5)

                results = bs(browser.html, 'html.parser').find_all('div', class_='ThumbnailBox')
                if not results:
                    print("Not good at all: no car data.")
                    continue

            print("Car data collected.")
            for position, result in enumerate(results, start=1):
                stars_tag = result.find('span', class_='Stars')
                # 'Nan' marks a thumbnail without a Stars span
                stars = len(stars_tag) if stars_tag is not None else 'Nan'
                anchor = result.find('a')
                href = anchor.get('href') if anchor is not None else None
                movie_dict[title].append([position, result.text, href, stars])
    finally:
        browser.quit()
    print("Scraping completed.")
    return movie_dict

In [11]:
# create our dictionary of car data: {title: [[position, text, href, stars], ...]} for the selected year
# note: car_mov() quits the browser when it finishes
top_year = obj[scrape_year]
top_yearcars = car_mov(top_year)

Processing Jurassic World...
Car data collected.
Processing Star Wars: Episode VII - The Force Awakens...
No data for the movie.
Processing Avengers: Age of Ultron...
Car data collected.
Processing Inside Out...
Car data collected.
Processing Furious 7...
Car data collected.
Processing American Sniper...
Car data collected.
Processing Minions...
Car data collected.
Processing The Hunger Games: Mockingjay - Part 2...
Car data collected.
Processing The Martian...
Car data collected.
Processing Cinderella...
No data for the movie.
Processing Spectre...
Car data collected.
Processing Mission: Impossible - Rogue Nation...
Car data collected.
Processing Pitch Perfect 2...
Car data collected.
Processing Ant-Man...
Car data collected.
Processing Home...
No data for the movie.
Processing Hotel Transylvania 2...
Car data collected.
Processing Fifty Shades of Grey...
Car data collected.
Processing The SpongeBob Movie: Sponge Out of Water...
Car data collected.
Processing Straight Outta Compton...

In [None]:
# verification: summarise the scrape instead of dumping every record
cars_per_title = {movie: len(cars) for movie, cars in top_yearcars.items()}
print(cars_per_title)
# titles with an empty list are the candidates for the manual scrape below
print("Titles with no car data:", [movie for movie, n_cars in cars_per_title.items() if n_cars == 0])

## Export/Import Scraped Data for backup

In [12]:
# sanity check: car_mov() creates a key for every title passed in, so this should match the number of distinct titles
print(len(top_yearcars))

100


In [13]:
# writing dictionary to text file (backup of the scrape)
backup_path = f"top_{scrape_year}cars.txt"
try:
    # ascii() is repr() with non-ASCII characters written as escapes, so the
    # file is pure ASCII: it can be written and read back under any platform
    # codec, and ast.literal_eval() restores the original strings.
    with open(backup_path, 'wt', encoding="utf-8") as file_dict:
        file_dict.write(ascii(top_yearcars))
except OSError as err:
    # only I/O failures are reported; programming errors still surface
    print("Unable to write to file:", err)

In [None]:
# Unable to write a file? Try this instead, otherwise skip this
# (explicit UTF-8 so non-ASCII car names cannot trip the platform's default codec;
#  NOTE: a file written this way must also be read back with encoding="utf-8")
with open(f"top_{scrape_year}cars.txt", 'wt', encoding="utf-8") as file_dict:
    # the with-block closes the file even if the write fails
    file_dict.write(str(top_yearcars))

In [14]:
import ast

In [15]:
# read dictionary from text file back
backup_path = f"top_{scrape_year}cars.txt"
try:
    # the fallback writer saves the backup as UTF-8, so try that first
    with open(backup_path, "r", encoding="utf-8") as file:
        contents = file.read()
except UnicodeDecodeError:
    # file was written with the platform's default codec instead
    with open(backup_path, "r") as file:
        contents = file.read()
# literal_eval only accepts Python literals, so the file content is never executed
data = ast.literal_eval(contents)

## Scrape Missing Data Manually

In [16]:
# setup splinter again for the manual pass (car_mov() quit the previous browser)
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/88.0.4324.96/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96]


In [17]:
# start from the IMCDb home page; cars_manual() parses whatever page the browser is showing when it is called
url = "https://www.imcdb.org/"
browser.visit(url)

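### Restart Manual Scrape Here:

Before each update, navigate the browser window to the movie's IMCDb page (or to a results page that contains a link whose text is exactly the title); `cars_manual` only reads the page that is currently open.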

In [18]:
# function to scrape data for a single film
def cars_manual(movie):
    """Collect the vehicles shown on the browser's current page for one film.

    Unlike a search-driven scrape, this reads whatever page the notebook-level
    ``browser`` is showing right now. If that page has no ``ThumbnailBox``
    elements, the link whose text is exactly ``movie`` is followed (when
    present) and the page is parsed again.

    Parameters
    ----------
    movie : str
        Title of the film; used as the key of the result and as the link text.

    Returns
    -------
    dict
        ``{movie: [[position, text, href, stars], ...]}``; the list is empty
        when no thumbnails were found. ``href`` is ``None`` if a thumbnail has
        no link/href and ``stars`` is ``'Nan'`` if it has no ``Stars`` span.
    """
    print(f"Processing {movie}...")
    missing = str(movie)
    miss_dict = {missing: []}

    results = bs(browser.html, 'html.parser').find_all('div', class_='ThumbnailBox')
    if not results:
        try:
            browser.links.find_by_text(missing).click()
        except Exception:
            # Best effort: no such link on the page. `Exception` (not a bare
            # except) keeps Ctrl-C working.
            print("No data for the movie.")
        else:
            # give the followed page time to load
            time.sleep(5)

        results = bs(browser.html, 'html.parser').find_all('div', class_='ThumbnailBox')
        if not results:
            print("Not good at all: no car data.")
            return miss_dict

    print("Data collected.")
    for position, result in enumerate(results, start=1):
        stars_tag = result.find('span', class_='Stars')
        # 'Nan' marks a thumbnail without a Stars span
        stars = len(stars_tag) if stars_tag is not None else 'Nan'
        anchor = result.find('a')
        href = anchor.get('href') if anchor is not None else None
        miss_dict[missing].append([position, result.text, href, stars])
    return miss_dict

In [19]:
# function to delete a single film's car data from our dictionary
def delete_data(movie):
    """Remove ``movie`` and its car list from ``top_yearcars``.

    Meant for films for which no substitute data can be found manually.
    Raises ``KeyError`` if ``movie`` is not a key of the dictionary.
    """
    top_yearcars.pop(movie)

In [20]:
# while loop to manually update or delete films that were not scraped properly
# (any answer other than "update"/"delete" ends the session)
answer = "yes"
while answer == "yes":
    decision = input("Do we want to update or delete data?: ").strip().lower()
    if decision == "update":
        title = input("Title of the movie (exact upper and lower cases):")
        new_dict = cars_manual(title)
        if new_dict.get(title):
            # use update method to update value of errant key/title data
            top_yearcars.update(new_dict)
            # verification
            print(f"First car: {top_yearcars[title][0]}")
            print(f"Last car: {top_yearcars[title][-1]}")
        else:
            # nothing was scraped: keep whatever is stored rather than
            # overwriting it with an empty list and failing on [0]
            print(f"No cars found for {title}; existing data left unchanged.")
        # outgoing exit option
        answer = input("Do we still have a movie to update? (yes/no): ").strip().lower()
    elif decision == "delete":
        title = input("Title of the movie (exact upper and lower cases):")
        if title in top_yearcars:
            delete_data(title)
        else:
            # a mistyped title should not abort the whole session with a KeyError
            print(f"{title} is not in the dictionary; nothing deleted.")
        answer = input("Do we still have a movie to update? (yes/no): ").strip().lower()
    else:
        answer = "no"
print("Manual updates completed.")

Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Inside Out
Processing Inside Out...
Data collected.
First car: [1, 'Chevrolet C-10 ', 'vehicle_848908-Chevrolet-C-10.html', 1]
Last car: [5, '1988 Yugo GV ', 'vehicle_848910-Yugo-GV-102-1988.html', 1]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Spectre
Processing Spectre...
Data collected.
First car: [1, '2004 Alfa Romeo 166 ', 'vehicle_820776-Alfa-Romeo-166-936-2004.html', 1]
Last car: [68, '2015 Yamaha Grizzly 700 ', 'vehicle_879472-Yamaha-Grizzly-700-2015.html', 2]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Home
Processing Home...
Data collected.
First car: [1, 'Piaggio Vespa ', 'vehicle_815908-Piaggio-Vespa.html', 1]
Last car: [5, 'Volkswagen Rabbit I [Typ 17] ', 'vehicle_815904-Volkswagen-Ra

In [21]:
# close the Chrome window that was opened for the manual scrape
browser.quit()

## Export Data as CSV

In [None]:
# verification after the manual pass: overall size and the titles that still have no cars
still_empty = [movie for movie, cars in top_yearcars.items() if not cars]
print(f"{len(top_yearcars)} titles, {sum(len(cars) for cars in top_yearcars.values())} cars")
print("Titles still without car data:", still_empty)

In [22]:
# create dataframe of data for each vehicle, with movie and movie year included:
# one list per column, one entry per vehicle
title = []
year = []
auto = []
link = []
stars = []
for movie in top_year:
    # titles removed with delete_data() are no longer keys of top_yearcars
    if movie not in top_yearcars:
        print(f"No cars for the movie, {movie}.")
        continue
    for car in top_yearcars[movie]:
        # each record is [position, text, href, stars]; read every field before
        # appending so a malformed record cannot leave the lists at different lengths
        car_text, car_href, car_stars = car[1], car[2], car[3]
        title.append(movie)
        year.append(scrape_year)
        auto.append(car_text)
        link.append(car_href)
        stars.append(car_stars)

No cars for the movie, Ex Machina.


In [23]:
# assemble the column lists into one table, one row per vehicle
top_year_columns = {
    "Release": title,
    "Year": year,
    "Car": auto,
    "url": link,
    "Stars": stars,
}
top_year_df = pd.DataFrame(top_year_columns)

In [24]:
# preview the assembled table
top_year_df

Unnamed: 0,Release,Year,Car,url,Stars
0,Jurassic World,2015,AM General HMMWV M1025,vehicle_824044-AM-General-HMMWV-M1025.html,1
1,Jurassic World,2015,Baja Motorsports MB200,vehicle_805726-Baja-Motorsports-MB200.html,1
2,Jurassic World,2015,1997 Chevrolet C-2500,vehicle_824041-Chevrolet-C-2500-1997.html,2
3,Jurassic World,2015,Hyster,vehicle_1154866-Hyster.html,1
4,Jurassic World,2015,1992 Jeep Wrangler,vehicle_820203-Jeep-Wrangler-YJ-1992.html,3
...,...,...,...,...,...
2673,The Gallows,2015,1992 Honda Civic,vehicle_853343-Honda-Civic-EG8-1992.html,1
2674,The Gallows,2015,1996 Honda Civic,vehicle_853344-Honda-Civic-EJ-1996.html,2
2675,The Gallows,2015,Mazda MX-5 Miata,vehicle_853346-Mazda-MX-5-Miata-NA.html,1
2676,The Gallows,2015,2007 Nissan Altima,vehicle_853351-Nissan-Altima-L32-2007.html,1


In [25]:
# export as csv for use elsewhere; the file name carries today's date stamp and the scraped year
csv_path = f"output_data/{d_today}_{scrape_year}Cars.csv"
top_year_df.to_csv(csv_path, header=True, index=False)