In [1]:
# dependencies
import pandas as pd
import time
from datetime import date
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# export dataframe of results with today's date
today = date.today()
d_today = today.strftime("%Y_%m_%d")

## Preparation for Scrape Function

In [3]:
top_100df = pd.read_csv("output_data/Top100_Movies_2010-2021.csv")

In [4]:
# create list of years
years = top_100df["Year"].unique().tolist()
print(years)

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
# use a dictionary to create multiple empty lists to store each  year's top 100 movies
obj = {}
for year in years:
    obj[year] = []
print(obj)

{2010: [], 2011: [], 2012: [], 2013: [], 2014: [], 2015: [], 2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: []}


In [6]:
# set the year of movies we are scraping data for; each year is done individually for quality control
scrape_year = 2012

In [7]:
# fill dictionary with lists of the top 100 movies per year, display a sample year
for year in obj:
    top_100 = top_100df.loc[top_100df["Year"] == year]
    obj[year] = top_100["Release"].tolist()
print(obj[scrape_year])

['The Avengers', 'The Dark Knight Rises', 'The Hunger Games', 'Skyfall', 'The Twilight Saga: Breaking Dawn - Part 2', 'The Amazing Spider-Man', 'Brave', 'The Hobbit: An Unexpected Journey', 'Ted', "Madagascar 3: Europe's Most Wanted", 'The Lorax', 'Men in Black 3', 'Wreck-It Ralph', 'Ice Age: Continental Drift', 'Snow White and the Huntsman', 'Hotel Transylvania', 'Taken 2', '21 Jump Street', 'Lincoln', 'Prometheus', 'Safe House', 'The Vow', 'Magic Mike', 'The Bourne Legacy', 'Argo', 'Journey 2: The Mysterious Island', 'Flight', 'Think Like a Man', 'Rise of the Guardians', 'Mission: Impossible - Ghost Protocol', 'The Campaign', 'Life of Pi', 'The Expendables 2', 'Wrath of the Titans', 'Dark Shadows', 'John Carter', 'Les Misérables', 'Act of Valor', 'Django Unchained', 'Contraband', 'Looper', "Madea's Witness Protection", 'Battleship', 'Mirror Mirror', 'Chronicle', 'Pitch Perfect', 'Hope Springs', 'Sherlock Holmes: A Game of Shadows', 'Underworld: Awakening', 'The Lucky One', 'The Dicta

## Scrape Function Executed

In [8]:
# setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [9]:
url = "https://www.imcdb.org/"
browser.visit(url)

In [10]:
def car_mov(titles):
    movie_dict = {}
    for title in titles:
        movie_dict[title] = []
    for title in titles:
                
        print(f"Processing {title}...")
        
        browser.find_by_css('input').fill(title)
        time.sleep(1)
        browser.find_by_value('Search').click()
        time.sleep(5)
    
        html = browser.html
        soup = bs(html, 'html.parser')
    
        results = soup.find_all('div', class_='ThumbnailBox')
        if len(results) != 0:
            good = 1
        else:
            good = 0
            try:
                browser.links.find_by_text(title).click()
                time.sleep(5)
            except:
                print("No data for the movie.")
                good = 0
                continue
        
        if good == 1:
            good2 = 1
        else:
            html = browser.html
            soup = bs(html, 'html.parser')
            results = soup.find_all('div', class_='ThumbnailBox')
            if len(results) != 0:
                good2 = 1
            else:
                good2 = 0
                print("Not good at all: no car data.")
                continue
        
        if good2 == 1:
            print("Car data collected.")
            count = 0
            for result in results:
                try:
                    x = result.find('span', class_='Stars')
                    y = len(x)
                    z = result.find('a')['href']
                    info_string = result.text
                    count += 1
                    movie_dict[title].append([count, info_string, z, y])
                except:
                    y = 'Nan'
                    z = result.find('a')['href']
                    info_string = result.text
                    count += 1
                    movie_dict[title].append([count, info_string, z, y])
        else:
            pass
    browser.quit()
    print("Scraping completed.")
    return movie_dict

In [11]:
# create our dictionary of car data
top_year = obj[scrape_year]
top_yearcars = car_mov(top_year)

Processing The Avengers...
Car data collected.
Processing The Dark Knight Rises...
Car data collected.
Processing The Hunger Games...
Car data collected.
Processing Skyfall...
Car data collected.
Processing The Twilight Saga: Breaking Dawn - Part 2...
Car data collected.
Processing The Amazing Spider-Man...
Car data collected.
Processing Brave...
Car data collected.
Processing The Hobbit: An Unexpected Journey...
No data for the movie.
Processing Ted...
No data for the movie.
Processing Madagascar 3: Europe's Most Wanted...
Car data collected.
Processing The Lorax...
No data for the movie.
Processing Men in Black 3...
Car data collected.
Processing Wreck-It Ralph...
Car data collected.
Processing Ice Age: Continental Drift...
No data for the movie.
Processing Snow White and the Huntsman...
No data for the movie.
Processing Hotel Transylvania...
Car data collected.
Processing Taken 2...
Car data collected.
Processing 21 Jump Street...
Car data collected.
Processing Lincoln...
No data fo

In [13]:
# verification
print(top_yearcars)

{'The Avengers': [[1, '1965 AC 428 Spider  Ep. 7.11+', 'vehicle_13910-AC-428-Spider-CF1-1965.html', 3], [2, '1954 AC Ace  Ep. 7.32', 'vehicle_285618-AC-Ace-1954.html', 1], [3, '1960 AC Greyhound  Ep. 2.09', 'vehicle_221947-AC-Greyhound-1960.html', 3], [4, '1959 AEC Mandator MkV  Ep. 4.17', 'vehicle_258229-AEC-Mandator-1959.html', 1], [5, 'AEC Regal  Ep. 7.18', 'vehicle_304972-AEC-Regal.html', 1], [6, 'AEC Regal  Ep. 7.18', 'vehicle_304973-AEC-Regal.html', 1], [7, 'AEC Regal III  Ep. 7.18', 'vehicle_304968-AEC-Regal-III.html', 1], [8, '1953 AEC Regal IV RF  Ep. 4.17', 'vehicle_258220-AEC-Regal-IV-RF-1953.html', 1], [9, '1954 AEC Regal IV RF  Ep. 4.17', 'vehicle_258219-AEC-Regal-IV-RF-1954.html', 1], [10, '1951 AEC Regent III  Ep. 4.25', 'vehicle_259839-AEC-Regent-III-1951.html', 2], [11, '1964 AEC Reliance  Ep. 7.12', 'vehicle_287327-AEC-Reliance-1964.html', 1], [12, 'AEC RT  Ep. 4.04', 'vehicle_252697-AEC-RT.html', 1], [13, '1964 Alfa Romeo Giulia Sprint GT  Ep. 7.12', 'vehicle_287325-

## Export/Import Scraped Data for backup

In [18]:
print(len(top_yearcars))

100


In [20]:
# writing dictionary to text file
try:
    file_dict = open(f"top_{scrape_year}cars.txt", 'wt')
    file_dict.write(str(top_yearcars))
    file_dict.close()
except: 
    print("Unable to write to file")

Unable to write to file


In [None]:
# Unable to write a file? Try this instead, otherwise skip this
file_dict = open(f"top_{scrape_year}cars.txt", 'wt', encoding="utf-8")
file_dict.write(str(top_yearcars))
file_dict.close()

In [27]:
import ast

In [28]:
# read dictionary from text file back
file = open(f"top_{scrape_year}cars.txt", "r")
contents = file.read()
data = ast.literal_eval(contents)
file.close()

## Scrape Missing Data Manually

In [29]:
# setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Driver [C:\Users\stuhu\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [30]:
url = "https://www.imcdb.org/"
browser.visit(url)

### Restart Manual Scrape Here:

In [31]:
# function to scrape data for a single film
def cars_manual(movie):
    print(f"Processing {movie}...")
    missing = f"{movie}"
    miss_dict = {}
    miss_dict[missing] = []

    html = browser.html
    soup = bs(html, 'html.parser')
    
    results = soup.find_all('div', class_='ThumbnailBox')
    if len(results) != 0:
        good = 1
    else:
        good = 0
        try:
            browser.links.find_by_text(missing).click()
            time.sleep(5)
        except:
            print("No data for the movie.")
            good = 0
        
    if good == 1:
        good2 = 1
    else:
        html = browser.html
        soup = bs(html, 'html.parser')
        results = soup.find_all('div', class_='ThumbnailBox')
        if len(results) != 0:
            good2 = 1
        else:
            good2 = 0
            print("Not good at all: no car data.")

    if good2 == 1:
        print("Data collected.")
        count = 0
        for result in results:
            try:
                x = result.find('span', class_='Stars')
                y = len(x)
                z = result.find('a')['href']
                info_string = result.text
                count += 1
                miss_dict[missing].append([count, info_string, z, y])
            except:
                y = 'Nan'
                z = result.find('a')['href']
                info_string = result.text
                count += 1
                miss_dict[missing].append([count, info_string, z, y])
    else:
        pass
    return miss_dict

In [32]:
# function to delete a single film's car data from our dictionary
def delete_data(movie):
    # remove data if no substitute can be manually found
    del top_yearcars[movie]

In [34]:
# while loop to manually update or delete films that were not scraped properly
answer = "yes"
while answer == "yes":
    decision = input("Do we want to update or delete data?: ").lower()
    if decision == "update":
        title = input("Title of the movie (exact upper and lower cases):")
        new_dict = cars_manual(title)
        # use update method to update value of errant key/title data
        top_yearcars.update(new_dict)
        # verification
        print(f"First car: {top_yearcars[title][0]}")
        print(f"Last car: {top_yearcars[title][-1]}")
        # outgoing exit option
        answer = input("Do we still have a movie to update? (yes/no): ").lower()
    elif decision == "delete":
        title = input("Title of the movie (exact upper and lower cases):")
        delete_data(title)
        answer = input("Do we still have a movie to update? (yes/no): ").lower()
    else:
        answer = "no"
print("Manual updates completed.")

Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):The Avengers
Processing The Avengers...
Data collected.
First car: [1, '2010 Acura MDX ', 'vehicle_840237-Acura-MDX-YD2-2010.html', 'Nan']
Last car: [68, '1998 Volvo V70 XC Cross Country ', 'vehicle_525935-Volvo-V70-XC-Cross-Country-1998.html', 1]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):The Amazing Spider-Man
Processing The Amazing Spider-Man...
Data collected.
First car: [1, '2010 Aston Martin Rapide ', 'vehicle_501908-Aston-Martin-Rapide-2010.html', 1]
Last car: [58, 'Westward Go-4 Interceptor ', 'vehicle_553150-Westward-Go-4-Interceptor.html', 1]
Do we still have a movie to update? (yes/no): yes
Do we want to update or delete data?: update
Title of the movie (exact upper and lower cases):Brave
Processing Brave...
Data collected.
First car: [1, '1979 Toyota Truck ', 'vehicle_743204-Toyota-

In [35]:
browser.quit()

## Export Data as CSV

In [36]:
print(top_yearcars)

{'The Avengers': [[1, '2010 Acura MDX ', 'vehicle_840237-Acura-MDX-YD2-2010.html', 'Nan'], [2, '2010 Acura MDX ', 'vehicle_1235723-Acura-MDX-YD2-2010.html', 'Nan'], [3, '2012 Acura MDX ', 'vehicle_525889-Acura-MDX-YD2-2012.html', 1], [4, '2012 Acura NSX Roadster Concept ', 'vehicle_525885-Acura-NSX-Roadster-Concept-2012.html', 2], [5, '2013 Acura RDX ', 'vehicle_525890-Acura-RDX-TB4-2013.html', 1], [6, '2010 Acura TL ', 'vehicle_525924-Acura-TL-UA8-2010.html', 1], [7, '2012 Acura TL ', 'vehicle_525891-Acura-TL-UA8-2012.html', 2], [8, 'AM General HMMWV ', 'vehicle_525911-AM-General-HMMWV.html', 2], [9, 'AM General HMMWV ', 'vehicle_525912-AM-General-HMMWV.html', 1], [10, 'AM General M-35 ', 'vehicle_1235724-AM-General-M-35.html', 'Nan'], [11, '2011 BMW 5 [F10] ', 'vehicle_525886-BMW-5-F10-2011.html', 1], [12, '1997 Buick Century ', 'vehicle_1235728-Buick-Century-1997.html', 'Nan'], [13, '2003 Chevrolet Cavalier ', 'vehicle_825415-Chevrolet-Cavalier-GM-J-2003.html', 'Nan'], [14, '2000 Ch

In [37]:
# create dataframe of data for each vehicle, with movie and movie year included
title = []
year = []
auto = []
link = []
stars = []
for movie in top_year:
    try:
        cars = top_yearcars[movie]
        for car in cars:
            title.append(movie)
            year.append(scrape_year)
            auto.append(car[1])
            link.append(car[2])
            stars.append(car[3])
    except:
        print(f"No cars for the movie, {movie}.")

No cars for the movie, Wreck-It Ralph.
No cars for the movie, Hotel Transylvania.
No cars for the movie, John Carter.
No cars for the movie, Les Misérables.
No cars for the movie, Mirror Mirror.


In [38]:
top_year_df = pd.DataFrame({"Release": title, "Year": year, "Car": auto, "url": link, "Stars": stars})

In [39]:
top_year_df

Unnamed: 0,Release,Year,Car,url,Stars
0,The Avengers,2012,2010 Acura MDX,vehicle_840237-Acura-MDX-YD2-2010.html,Nan
1,The Avengers,2012,2010 Acura MDX,vehicle_1235723-Acura-MDX-YD2-2010.html,Nan
2,The Avengers,2012,2012 Acura MDX,vehicle_525889-Acura-MDX-YD2-2012.html,1
3,The Avengers,2012,2012 Acura NSX Roadster Concept,vehicle_525885-Acura-NSX-Roadster-Concept-2012...,2
4,The Avengers,2012,2013 Acura RDX,vehicle_525890-Acura-RDX-TB4-2013.html,1
...,...,...,...,...,...
2038,The Adventures of Tintin,2012,Rolls-Royce Phantom II,vehicle_445293-Rolls-Royce-Phantom-II.html,3
2039,The Adventures of Tintin,2012,1955 Triumph TR2,vehicle_483886-Triumph-TR2-1955.html,3
2040,The Adventures of Tintin,2012,Vickers-Armstrong Centurion,vehicle_483889-Vickers-Armstrong-Centurion.html,1
2041,The Adventures of Tintin,2012,1940 Volvo LV 125,vehicle_445298-Volvo-LV-125-1940.html,2


In [40]:
#export as csv for use elsewhere
top_year_df.to_csv(f"output_data/{d_today}_{scrape_year}Cars.csv", index=False, header=True)