# Project Luther

For this project, my goal is to predict the total box office gross on the opening week using a linear regression model. This notebook contains all the data scraping to create the dataframes needed. My variables of interest come from three sources: BoxOfficeMojo, Youtube, and  data

# Scrape Box Office Data Using BeautifulSoup
I will be scraping the box office data from boxofficemojo

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

In [None]:
# define a function that can scrape top 40 movies of a given year in boxofficemojo.com
# scrape movie name, opening gross, theaters, release date
def get_opening(soup):
    table = soup.find_all('table')[4].find_all('tr')[2].find('table')
    movie_list = []
    for row in table.find_all('tr')[2:42]:
        cells = row.find_all('td')
        if len(cells) > 0:
            movie_name = cells[1].text
            opening = cells[5].text
            theaters = cells[6].text
            release_date = cells[7].text + "/" + year
            cell_dict = {'movie_name': movie_name, 'opening': opening, 'theaters': theaters, 'release_date': release_date}
            movie_list.append(cell_dict)
    return movie_list

In [None]:
# # Test code
# year = '2012'
# url = 'http://www.boxofficemojo.com/yearly/chart/?yr='
# url = url + year
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'lxml')
# table = soup.find_all('table')[4].find_all('tr')[2].find('table')
# movie_list = []
# for row in table.find_all('tr')[2:5]:
#     cells = row.find_all('td')
#     if len(cells) > 0:
#         movie_name = cells[1].text
#         opening = cells[5].text
#         theaters = cells[6].text
#         release_date = cells[7].text + "/" + year
#         cell_dict = {'movie_name': movie_name, 'opening': opening, 'theaters': theaters, 'release_date': release_date}
#         movie_list.append(cell_dict)
# movie_list

In [None]:
# Loop over selected years (2010 - 2018) to scrape movie data
years = ['2010','2011','2012','2013','2014','2015','2016','2017','2018']

def scrape_boxoffice(years):
    movie_list = []
    for year in years:
        url = 'http://www.boxofficemojo.com/yearly/chart/?yr='
        url = url + year
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        if len(movie_list) == 0:
            movie_list = get_opening(soup)
        else:
            movie_list = movie_list + get_opening(soup)
        # pause for 5seconds to limit traffic
        time.sleep(5)
        print("Successfully scraped " + year + "'s list")
    return movie_list
    
movie_list = scrape_boxoffice(years)

In [None]:
# Convert the list of dictionaries to a pandas dataframe
opening_df = pd.DataFrame(movie_list)
opening_df.head()

In [None]:
# Save dataframe as csv
opening_df.to_csv("opening_df.csv")
# Save dataframe as pickle
opening_df.to_pickle("opening_df.pkl")

### Checkpoint for loading opening box office gross dataframe

In [3]:
# Read dataframe from pickle
opening_df = pd.read_pickle('opening_df.pkl')

In [4]:
opening_df.head()

Unnamed: 0,movie_name,opening,release_date,theaters
0,Toy Story 3,"$110,307,189",6/18/2012,4028
1,Alice in Wonderland (2010),"$116,101,023",3/5/2012,3728
2,Iron Man 2,"$128,122,480",5/7/2012,4380
3,The Twilight Saga: Eclipse,"$64,832,191",6/30/2012,4468
4,Harry Potter and the Deathly Hallows Part 1,"$125,017,372",11/19/2012,4125


# Get Youtube Statistics Using Google API
Reference:
Pulled many bits of code from
https://www.analyticsvidhya.com/blog/2014/09/mining-youtube-python-social-media-analysis/

In [5]:
from apiclient.discovery import build #pip install google-api-python-client
from apiclient.errors import HttpError #pip install google-api-python-client
from oauth2client.tools import argparser #pip install oauth2client
import pandas as pd

In [6]:
DEVELOPER_KEY = "AIzaSyBDbAffV6b-r_fmOFvk6XLE1MoGTwno_bQ" 
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

In [None]:
# argparser.add_argument("--q", help="Search term", default="Black Panther trailer")
# #change the default to the search term you want to search
# argparser.add_argument("--max-results", help="Max results", default=5)
# #default number of results which are returned. It can vary from 0 - 100
# args = argparser.parse_args()
# options = args

In [15]:
#Define two functions to be utilized in the next cell:
# 1) get the video id, 2) use the video id to retrieve statistics

# Call the search.list method to retrieve results matching the specified
 # query term.
def get_video_id(movie_name):
    max_results = 1
    trailer = movie_name + "official trailer"
    search_response = youtube.search().list(
     q = trailer,
     type = "video",
     part = "id,snippet",
     maxResults = max_results
    ).execute()
    return search_response

# Get relevant video statistics using video id
def get_statistics(search_response):
    videos = {}
    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            videos[search_result["id"]["videoId"]] = search_result["snippet"]["title"]
    # For multiple videos per search, need to merge id's. Not necessary for 1 video
    s = ','.join(videos.keys())
    videos_list_response = youtube.videos().list(
     id = s,
     part='id,statistics'
    ).execute()
    return videos_list_response, videos

In [19]:
# Create a list of dictionaries 
final_list = []
max_results = 1
for movie in opening_df.movie_name:
    search_response = get_video_id(movie)
    video_response, videos = get_statistics(search_response)
    for i in video_response['items']:
        temp_res = dict(movie_name = movie, v_id = i['id'], v_title = videos[i['id']])
        temp_res.update(i['statistics'])
        final_list.append(temp_res)
        

In [21]:
# Convert to dataframe
youtube_df = pd.DataFrame(final_list)
youtube_df

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,movie_name,v_id,v_title,viewCount
0,110,311,0,1007,Toy Story 3,2BlMNH1QTeE,Toy Story 3 Official Trailer [HD],1507696
1,1513,601,0,11051,Alice in Wonderland (2010),9POCgSRVvf0,ALICE IN WONDERLAND | New Official Full Traile...,5664008
2,176,115,0,2304,Iron Man 2,BoohRoVA9WQ,Iron Man 2 Official Trailer #1 (2010) - Marvel...,656646
3,14271,4565,0,35855,The Twilight Saga: Eclipse,S2HIda5wSVU,THE TWILIGHT SAGA: ECLIPSE - Trailer,21973332
4,147,41,0,1299,Harry Potter and the Deathly Hallows Part 1,MxqsmsA8y5k,Harry Potter and the Deathly Hallows: Part 1 O...,413432
5,1112,354,0,15369,Inception,YoHD9XEInc0,Inception (2010) Official Trailer #1 - Christo...,3825997
6,853,4550,0,14159,Despicable Me,sUkZFetWYY0,Despicable Me Official Trailer #1 - (2010) HD,16419724
7,1153,539,0,4027,Shrek Forever After,u7__TG7swg0,'Shrek Forever After' Trailer 1 HD,6776985
8,39,12,0,253,How to Train Your Dragon,GfBHLVtbG6U,How to Train Your Dragon Official Trailer [HD],116460
9,154,344,0,2533,Tangled,JYKpIr1lSG0,Tangled - Official Trailer 2,2027612


In [22]:
# Save dataframe as csv
youtube_df.to_csv("youtube_df.csv")
# Save dataframe as pickle
youtube_df.to_pickle("youtube_df.pkl")

In [None]:
# # TEST CODE FOR 1 movie
# movie_name = 'black panther official trailer'

# def get_movie_id(movie_name):
#     movie_name = movie_name
#     max_results = 1
#     search_response = youtube.search().list(
#      q = movie_name + " official trailer",
#      type = "video",
#      part = "id,snippet",
#      maxResults = max_results
#     ).execute()
#     return search_response
# # Add each result to the appropriate list, and then display the lists of
#  # matching videos.
#  # Filter out channels, and playlists.
# def get_statistics(search_response):
#     videos = {}
#     for search_result in search_response.get("items", []):
#         if search_result["id"]["kind"] == "youtube#video":
#             videos[search_result["id"]["videoId"]] = search_result["snippet"]["title"]
#     # For multiple videos per search, need to merge id's. Not necessary for 1 video
#     s = ','.join(videos.keys())
#     videos_list_response = youtube.videos().list(
#      id=s,
#      part='id,statistics'
#     ).execute()
#     return videos_list_response

# res = []
# movie_name = "black panther"
# for i in videos_list_response['items']:
#     temp_res = dict(movie_name = movie_name, v_id = i['id'], v_title = videos[i['id']])
#     temp_res.update(i['statistics'])
#     res.append(temp_res)

### Checkpoint for loading youtube data frame

In [None]:
# Read youtube dataframe from pickle
youtube_df = pd.read_pickle('youtube_df.pkl')

# Get Google Trends Data Using Selenium

In [23]:
# !pip install selenium 
# download chromedriver: https://sites.google.com/a/chromium.org/chromedriver/downloads      

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os
chromedriver = "/Users/petermin/Downloads/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)
driver.get("https://trends.google.com/trends/")