In [1]:
# dependencies
import pandas as pd
import time
from datetime import date
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# date stamp (YYYY_MM_DD) that is embedded in the name of the exported CSV
today = date.today()
d_today = f"{today:%Y_%m_%d}"

## Preparation for Scrape Function

In [3]:
# load the top-100 list; the cells below read its "Year" and "Release" columns
top100_csv = "output_data/Top100_Movies_2010-2021.csv"
top_100df = pd.read_csv(top100_csv)

In [4]:
# distinct values of the "Year" column, kept in order of first appearance
years = top_100df["Year"].drop_duplicates().tolist()
print(years)

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
# map every year to its own empty list; a later cell fills each list with that year's movie titles
obj = {yr: [] for yr in years}
print(obj)

{2010: [], 2011: [], 2012: [], 2013: [], 2014: [], 2015: [], 2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: []}


In [6]:
# year whose movies are scraped in this run; each year is done individually for quality control.
# The value is used below as the key into `obj` and inside the backup (.txt) and export (.csv) file names,
# so change it here and re-run the notebook to process another year.
scrape_year = 2020

In [7]:
# replace each year's placeholder list with the "Release" titles recorded for that year,
# then show the list for the year being scraped as a sample
for year in obj:
    in_year = top_100df["Year"] == year
    obj[year] = top_100df.loc[in_year, "Release"].tolist()
print(obj[scrape_year])

['Bad Boys for Life', '1917', 'Sonic the Hedgehog', 'Jumanji: The Next Level', 'Star Wars: Episode IX - The Rise of Skywalker', 'Birds of Prey', 'Dolittle', 'Little Women', 'The Invisible Man', 'The Call of the Wild', 'Onward', 'Knives Out', 'Frozen II', 'Tenet', 'Spies in Disguise', 'The Gentlemen', 'Just Mercy', 'The Croods: A New Age', 'Parasite', 'Fantasy Island', 'Uncut Gems', 'The New Mutants', 'Like a Boss', 'The Grudge', 'Unhinged', 'The Photograph', 'The War with Grandpa', 'Underwater', 'Wonder Woman 1984', 'The Turning', 'Gretel & Hansel', 'Honest Thief', 'My Hero Academia: Heroes Rising', 'Bombshell', 'The Way Back', 'Brahms: The Boy II', 'Jojo Rabbit', 'Impractical Jokers: The Movie', 'Ford v Ferrari', 'Emma.', 'Bloodshot', 'I Still Believe', 'Come Play', 'Let Him Go', 'Freaky', 'Downhill', 'Weathering with You', 'Cats', 'The Hunt', 'The Rhythm Section', 'Monster Hunter', 'A Beautiful Day in the Neighborhood', 'Hocus Pocus2020 Re-release', 'Richard Jewell', 'The SpongeBob M

## Scrape Function Executed

In [8]:
# start a visible (non-headless) Chrome session driven by splinter;
# webdriver_manager supplies the path of a matching chromedriver binary
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


 


In [9]:
# open the site's home page; car_mov() below types each title into the page's input and clicks "Search"
url = "https://www.imcdb.org/"
browser.visit(url)

In [10]:
def _thumbnail_boxes(page_html):
    """Return every ``div.ThumbnailBox`` element found in ``page_html``."""
    return bs(page_html, 'html.parser').find_all('div', class_='ThumbnailBox')


def _car_records(results):
    """Turn thumbnail boxes into ``[position, text, href, stars]`` records.

    ``stars`` is ``len()`` of the box's ``span.Stars`` element, or the string
    ``'Nan'`` when the box has no such span (the sentinel the notebook has
    always stored).  ``href`` is ``None`` when the box has no link, instead of
    raising.
    """
    records = []
    for position, result in enumerate(results, start=1):
        stars_span = result.find('span', class_='Stars')
        # explicit None check replaces the old "len(None) -> TypeError -> except" detour
        stars = len(stars_span) if stars_span is not None else 'Nan'
        anchor = result.find('a')
        href = anchor.get('href') if anchor is not None else None
        records.append([position, result.text, href, stars])
    return records


def car_mov(titles):
    """Scrape the cars listed for each movie title using the global ``browser``.

    For every title the function fills the page's input, clicks "Search" and
    looks for ``div.ThumbnailBox`` elements.  When the search result page has
    none, it tries to click a link whose text equals the title and looks again.
    Titles for which nothing is found keep an empty list.

    Parameters
    ----------
    titles : iterable of str
        Movie titles to look up; each becomes a key of the returned dict.

    Returns
    -------
    dict
        ``{title: [[position, text, href, stars], ...]}`` with positions
        starting at 1 for each title.

    Notes
    -----
    The module-level ``browser`` is closed with ``browser.quit()`` once all
    titles are processed, so a new session is needed for further scraping.
    """
    movie_dict = {title: [] for title in titles}
    for title in titles:
        print(f"Processing {title}...")

        browser.find_by_css('input').fill(title)
        time.sleep(1)
        browser.find_by_value('Search').click()
        time.sleep(5)  # give the result page time to load

        results = _thumbnail_boxes(browser.html)
        if not results:
            # The search did not land on a page with cars: follow the link
            # whose text matches the title exactly, if there is one.
            try:
                browser.links.find_by_text(title).click()
                time.sleep(5)
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C still stops the run
                print("No data for the movie.")
                continue
            results = _thumbnail_boxes(browser.html)
            if not results:
                print("Not good at all: no car data.")
                continue

        print("Car data collected.")
        movie_dict[title].extend(_car_records(results))
    browser.quit()
    print("Scraping completed.")
    return movie_dict

In [11]:
# create our dictionary of car data:
# top_year is the list of titles for scrape_year; car_mov() returns {title: [car records]}
# (this drives the browser through every title, sleeping several seconds per search)
top_year = obj[scrape_year]
top_yearcars = car_mov(top_year)

Processing Bad Boys for Life...
Car data collected.
Processing 1917...
Car data collected.
Processing Sonic the Hedgehog...
Car data collected.
Processing Jumanji: The Next Level...
Car data collected.
Processing Star Wars: Episode IX - The Rise of Skywalker...
Car data collected.
Processing Birds of Prey...
Car data collected.
Processing Dolittle...
No data for the movie.
Processing Little Women...
No data for the movie.
Processing The Invisible Man...
Car data collected.
Processing The Call of the Wild...
No data for the movie.
Processing Onward...
Car data collected.
Processing Knives Out...
Car data collected.
Processing Frozen II...
No data for the movie.
Processing Tenet...
Car data collected.
Processing Spies in Disguise...
Car data collected.
Processing The Gentlemen...
Car data collected.
Processing Just Mercy...
Car data collected.
Processing The Croods: A New Age...
No data for the movie.
Processing Parasite...
Car data collected.
Processing Fantasy Island...
Car data collec

In [None]:
# verification: dumps the whole scraped dictionary (every title with all of its car records)
print(top_yearcars)

## Export/Import Scraped Data for Backup

In [12]:
# number of titles (keys) in the scraped dictionary
print(len(top_yearcars))

100


In [13]:
# writing dictionary to text file (its str() form, read back later with ast.literal_eval)
# `with` closes the handle even when the write fails; only I/O and encoding
# errors are reported here, so unrelated mistakes (e.g. a NameError) still surface.
try:
    with open(f"top_{scrape_year}cars.txt", 'wt') as file_dict:
        file_dict.write(str(top_yearcars))
except (OSError, UnicodeError) as write_error:
    print("Unable to write to file")
    print(f"Reason: {write_error!r}")

In [None]:
# Unable to write a file? Try this instead, otherwise skip this
# (forces UTF-8 so characters outside the platform's default encoding can be written;
#  `with` guarantees the handle is closed even if the write raises)
with open(f"top_{scrape_year}cars.txt", 'wt', encoding="utf-8") as file_dict:
    file_dict.write(str(top_yearcars))

In [14]:
import ast

In [15]:
# read dictionary from text file back
# The backup may have been written by either cell above: with the platform's
# default encoding or, as a fallback, with UTF-8.  Try UTF-8 first and fall
# back to the default encoding only when the bytes are not valid UTF-8.
backup_path = f"top_{scrape_year}cars.txt"
try:
    with open(backup_path, "r", encoding="utf-8") as file:
        contents = file.read()
except UnicodeDecodeError:
    with open(backup_path, "r") as file:
        contents = file.read()
# literal_eval only accepts Python literals, so the file content is never executed
data = ast.literal_eval(contents)
# NOTE(review): the cells visible below keep using `top_yearcars`, not `data`;
# assign `top_yearcars = data` if the intent is to resume from this backup.

## Scrape Missing Data Manually

In [16]:
# open a fresh visible Chrome session for the manual pass
# (car_mov() quits the browser it used when it finishes)
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389


 


[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache


In [17]:
# start from the site's home page; cars_manual() below parses whatever page the browser
# is showing when it is called, so navigate to the wanted movie page by hand first
url = "https://www.imcdb.org/"
browser.visit(url)

### Restart Manual Scrape Here:

In [18]:
# function to scrape data for a single film
def cars_manual(movie):
    """Scrape car data for one film from the page currently shown in ``browser``.

    The function does not search: it parses the page the global ``browser`` is
    on.  When that page has no ``div.ThumbnailBox`` elements it tries to click
    a link whose text equals ``movie`` and parses the resulting page instead.

    Parameters
    ----------
    movie : str
        Title of the film; used as the link text to click and as the dict key.

    Returns
    -------
    dict
        ``{movie: [[position, text, href, stars], ...]}``.  The list is empty
        when no car data could be found.  ``stars`` is ``len()`` of the box's
        ``span.Stars`` element or the string ``'Nan'`` when that span is
        missing; ``href`` is ``None`` when the box has no link.
    """
    print(f"Processing {movie}...")
    missing = f"{movie}"
    miss_dict = {missing: []}

    def current_boxes():
        # thumbnail boxes on whatever page the browser shows right now
        return bs(browser.html, 'html.parser').find_all('div', class_='ThumbnailBox')

    results = current_boxes()
    if not results:
        try:
            browser.links.find_by_text(missing).click()
            time.sleep(5)  # give the movie page time to load
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C is not swallowed
            print("No data for the movie.")
        results = current_boxes()
        if not results:
            print("Not good at all: no car data.")
            return miss_dict

    print("Data collected.")
    for count, result in enumerate(results, start=1):
        stars_span = result.find('span', class_='Stars')
        # explicit None check replaces the old "len(None) -> TypeError -> except" detour
        stars = len(stars_span) if stars_span is not None else 'Nan'
        anchor = result.find('a')
        href = anchor.get('href') if anchor is not None else None
        miss_dict[missing].append([count, result.text, href, stars])
    return miss_dict

In [19]:
# function to delete a single film's car data from our dictionary
def delete_data(movie):
    """Remove one film's entry from the module-level ``top_yearcars`` dict.

    Used when no substitute data can be found manually.  A title that is not a
    key is reported with a message instead of raising ``KeyError``, so a typo
    at the prompt does not abort the interactive session.

    Parameters
    ----------
    movie : str
        Exact title (dict key) to remove.
    """
    if movie in top_yearcars:
        del top_yearcars[movie]
    else:
        print(f"No entry for '{movie}'; nothing deleted.")

In [20]:
# while loop to manually update or delete films that were not scraped properly
# - "update": re-scrape the page currently open in the browser and store the result
# - "delete": drop the title from the dictionary
# - anything else ends the session
answer = "yes"
while answer == "yes":
    decision = input("Do we want to update or delete data?: ").strip().lower()
    if decision == "update":
        title = input("Title of the movie (exact upper and lower cases):")
        new_dict = cars_manual(title)
        if new_dict.get(title):
            # use update method to update value of errant key/title data
            top_yearcars.update(new_dict)
            # verification
            print(f"First car: {top_yearcars[title][0]}")
            print(f"Last car: {top_yearcars[title][-1]}")
        else:
            # an empty scrape must not wipe existing data (and has no first/last car to show)
            print(f"No cars scraped for '{title}'; existing data left unchanged.")
        # outgoing exit option
        answer = input("Do we still have a movie to update? (yes/no): ").strip().lower()
    elif decision == "delete":
        title = input("Title of the movie (exact upper and lower cases):")
        if title in top_yearcars:
            delete_data(title)
        else:
            # a mistyped title would otherwise abort the whole session
            print(f"'{title}' is not in the scraped data; nothing deleted.")
        answer = input("Do we still have a movie to update? (yes/no): ").strip().lower()
    else:
        answer = "no"
print("Manual updates completed.")

Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Birds of Prey
Processing Birds of Prey...
Data collected.
First car: [1, '2000 Bentley Arnage Red Label ', 'vehicle_1370146-Bentley-Arnage-Red-Label-2000.html', 2]
Last car: [65, '1971 Winnebago Brave ', 'vehicle_1370155-Winnebago-Brave-1971.html', 1]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):The Invisible Man
Processing The Invisible Man...
Data collected.
First car: [1, '1974 AMC Gremlin ', 'vehicle_1366859-AMC-Gremlin-1974.html', 1]
Last car: [26, '2009 Yamaha VMX 1700 V-Max ', 'vehicle_1316641-Yamaha-VMX-1700-V-Max-2009.html', 1]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Parasite
Processing Parasite...
Data collected.
First car: [1, '2014 Audi A8 D4 ', 'vehicle_1341223-Audi-A8-Typ-4H-2014.

In [21]:
# close the browser session that was opened for the manual scrape
browser.quit()

## Export Data as CSV

In [None]:
# dumps the whole dictionary after the manual updates (every title with all of its car records)
print(top_yearcars)

In [22]:
# create dataframe of data for each vehicle, with movie and movie year included
# Five parallel lists, one entry per car record; they become the DataFrame columns.
title = []
year = []
auto = []
link = []
stars = []
for movie in top_year:
    # A title is absent only when it was removed from the dictionary (e.g. via
    # delete_data); checking membership replaces the old bare `except:` that
    # reported *any* error as "no cars".
    if movie not in top_yearcars:
        print(f"No cars for the movie, {movie}.")
        continue
    cars = top_yearcars[movie]
    for car in cars:
        # Read every field before appending: a malformed record now fails
        # loudly here instead of leaving the lists with different lengths.
        car_text, car_link, car_stars = car[1], car[2], car[3]
        title.append(movie)
        year.append(scrape_year)
        auto.append(car_text)
        link.append(car_link)
        stars.append(car_stars)

No cars for the movie, Underwater.
No cars for the movie, The Turning.
No cars for the movie, Downhill.


In [23]:
# combine the parallel lists built above into one frame: one row per car record
top_year_df = pd.DataFrame({"Release": title, "Year": year, "Car": auto, "url": link, "Stars": stars})

In [24]:
# preview the assembled frame
top_year_df

Unnamed: 0,Release,Year,Car,url,Stars
0,Bad Boys for Life,2020,2007 BAE Systems Caiman CMTV,vehicle_1368488-BAE-Systems-Caiman-CMTV-2007.html,2
1,Bad Boys for Life,2020,2006 BMW 3 [E90],vehicle_1368485-BMW-3-E90-2006.html,1
2,Bad Boys for Life,2020,2004 BMW 5 [E60],vehicle_1368481-BMW-5-E60-2004.html,2
3,Bad Boys for Life,2020,BMW M6 [E63],vehicle_1368482-BMW-M6-E63.html,1
4,Bad Boys for Life,2020,2003 Buick Century,vehicle_1368473-Buick-Century-2003.html,1
...,...,...,...,...,...
1417,Vanguard,2020,2007 Volvo S60,vehicle_1447514-Volvo-S60-2007.html,2
1418,Vanguard,2020,2015 Volvo XC90,vehicle_1447734-Volvo-XC90-2015.html,3
1419,Vanguard,2020,2013 WaterCar Panther,vehicle_1447595-WaterCar-Panther-2013.html,3
1420,Vanguard,2020,2012 Wright New Routemaster,vehicle_1447645-Wright-New-Routemaster-2012.html,1


In [25]:
# export as csv for use elsewhere; the file name carries today's date stamp and the scraped year
cars_csv = f"output_data/{d_today}_{scrape_year}Cars.csv"
top_year_df.to_csv(cars_csv, index=False, header=True)