## Getting Data

This notebook will use BeautifulSoup and Selenium to collect my necessary data from the web and collect it all to one data frame 

***
### Importing Libaries 
Need to import the libaries to be used in this notebook

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from datetime import datetime
import re
from selenium.common.exceptions import NoSuchElementException
import string

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

*** 

### Set up:
- Set up lists to store data collected into
- Need to scrap Mid-Contient Libary: Based on the Book to get the list of movies and the books that they are based on 
- Clean the text in the lists so they are readable for more web scearching and scrapping  

In [2]:
movie_titles = []
book_titles = []
movie_titles_temp = []
book_titles_temp = []
movie_rating_values = []
movie_rating_counts = []
movie_released_dates = []
movie_run_times = []
book_rating_values = []
book_rating_counts = []
book_published_dates = []
book_lengths = []

In [3]:
for i in ['0-9', 'a','b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']:
    based_on_the_book = 'https://apps.mymcpl.org/botb/movie/browse/' + i

    page = requests.get(based_on_the_book).text
    soup = BeautifulSoup(page, 'html5lib')

    driver = webdriver.Chrome(chromedriver)
    driver.get(based_on_the_book)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for row in soup.findAll('table')[0].tbody.findAll('tr'):
        movie_column = row.findAll('td')[0].text
        movie_titles_temp.append(movie_column)
        book_column = row.findAll('td')[1].text
        book_titles_temp.append(book_column)
    
    driver.close()

In [4]:
m_fix1 = [x.replace('\n\n\nFind It \n\n\nCheck The Catalog\nBuy it on Amazon\n\n', '') for x in movie_titles_temp]

In [5]:
m_fix2 = [x.replace('\n', '') for x in m_fix1]

In [6]:
movie_titles = [x.strip() for x in m_fix2]

In [7]:
b_fix1 = [x.replace('\n\n\nFind It \n\n\nCheck The Catalog\nBuy it on Amazon\n\n', '') for x in book_titles_temp]

In [8]:
b_fix2 = [x.replace('\n', '') for x in b_fix1]

In [10]:
book_titles = [x.strip() for x in b_fix2]

In [11]:
goodreads_titles = [x.replace(',','%2C').replace('/','%2F').replace(':','%3A').replace('(', '%28').replace(')', '%29') for x in book_titles]

***

### Getting the Data: 
- Get book data by using the list of books from Mid-Contient Libary and serching the books Goodreads 
- Get movie data by using the list of movies from Mid-Contient Libary and serching the movies on IMDb

In [36]:
for book in goodreads_titles[1717:]:
    #using the show title make the goodreads url
    goodreads_search = "https://www.goodreads.com/search?utf8=%E2%9C%93&q="
    goodreads_query = goodreads_search + book.replace(' ', '+') + "&search%5Bsource%5D=goodreads&search_type=books&tab=books"

    #go to the goodreads seach page 
    page = requests.get(goodreads_query).text
    soup = BeautifulSoup(page, 'html5lib')

    driver = webdriver.Chrome(chromedriver)
    driver.get(goodreads_query)

    #parse the page
    soup = BeautifulSoup(driver.page_source, 'html.parser')


    try:
        if(driver.find_element_by_xpath('//a[contains(@class, "bookTitle")]')):

            first_book = driver.find_element_by_xpath('//a[contains(@class, "bookTitle")]')
            ##if unable to loacte, want to drop and not continue - can test on page 3##


            #need to get year before going to book 
            label = soup.find("span", class_='minirating')
            date = label.next_sibling.strip()

            #break down string with year information to just the year 
            gr_published_date = re.sub(r'[^\w\s]','',date)
            published_date = [int(i) for i in gr_published_date.split() if i.isdigit()]
            if len(published_date) == 1:
                gr_published_date = published_date[0]
            else:
                gr_published_date = 'NA'
            book_published_dates.append(gr_published_date)

            time.sleep(2)

            #choose the first book 
            first_book.click()

            #find and obtain the rating value 
            gr_rating_value = driver.find_element_by_xpath('//span[@itemprop="ratingValue"]').text
            half = float(gr_rating_value) * 2
            book_rating_values.append(half)
            #since the IMDb rating is on a 10 scale and goodreads is on a 5 scale

            #find and obatin the raiting count
            gr_rating_count = driver.find_element_by_xpath('//meta[@itemprop="ratingCount"]').get_attribute('content')
            book_rating_counts.append(float(gr_rating_count.replace(',','')))

            
            #find length of the book 
            book_length = driver.find_element_by_xpath('//span[@itemprop="numberOfPages"]').text
            gr_book_length = [int(i) for i in book_length.split() if i.isdigit()]
            book_lengths.append(gr_book_length)


    except NoSuchElementException:

            book_published_dates.append('NA')
            book_rating_values.append('NA')
            book_rating_counts.append('NA')
            book_lengths.append('NA')


    driver.close()
    


WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=79.0.3945.130)


In [28]:
len(book_titles)

4201

In [15]:
len(goodreads_titles)

4201

In [22]:
goodreads_titles[1608]

'White South%2C The %2F Innes%2C Hammond'

In [40]:
book_rating_values[1816]

'NA'

In [37]:
len(book_rating_values)

1817

In [55]:
for movie in movie_titles[816:]:

    imdb_search = "https://www.imdb.com/find?q=" + movie.replace('(', '%28').replace(')', '%29').replace(' ', '+') + "&ref_=nv_sr_sm"

    #go to the goodreads seach page 
    page = requests.get(imdb_search).text
    soup = BeautifulSoup(page, 'html5lib')

    driver = webdriver.Chrome(chromedriver)
    driver.get(imdb_search)
    time.sleep(2)

    #parse the page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    

    foo = movie
    foo = ''.join(foo)[:-7]
    #or title = movie[:-7]

    try:
        if(driver.find_element_by_link_text(foo)):
            
            driver.find_element_by_link_text(foo).click()

            #parse the show page 
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            
            try:
                if(driver.find_element_by_xpath('//span[@itemprop="ratingValue"]')):
                    #find and obtain the rating value 
                    movie_rating_value = driver.find_element_by_xpath('//span[@itemprop="ratingValue"]').text
                    movie_rating_values.append(float(movie_rating_value))
            
            except NoSuchElementException:
                    movie_rating_values.append('NA')

            
            try:
                if(driver.find_element_by_xpath('//span[@itemprop="ratingCount"]')):
                    #find and obtain the rating counts 
                    movie_rating_count = driver.find_element_by_xpath('//span[@itemprop="ratingCount"]').text
                    movie_rating_counts.append(float(movie_rating_count.replace(',','')))
            
            except NoSuchElementException:
                    movie_rating_counts.append('NA')

            
            try:
                if(soup.find("h4", text="Release Date:")):
                    #find the date information 
                    label = soup.find("h4", text="Release Date:")
                    date = label.next_sibling.strip()

                    #break down string with year information to just the year
                    released_date = [int(i) for i in date.split() if i.isdigit()]
                    movie_released_date = released_date[-1]
                    movie_released_dates.append(movie_released_date)
                    
            except NoSuchElementException:
                    movie_released_dates.append('NA')
                    
            
            try:
                if(driver.find_element_by_xpath('//time[contains(@datetime, "PT")]')):
            
                    #find the runtime information 
                    run_time = driver.find_element_by_xpath('//time[contains(@datetime, "PT")]').text
                    #movie_run_time = [int(i) for i in date.split() if i.isdigit()]
                    movie_run_times.append(run_time)
            
            except NoSuchElementException:
                    movie_run_times.append('NA')
            
            
            
    except NoSuchElementException:

            movie_released_dates.append('NA')
            movie_rating_values.append('NA')
            movie_rating_counts.append('NA')
            movie_run_times.append('NA')


    driver.close()

KeyboardInterrupt: 

In [29]:
driver.quit()

In [56]:
len(movie_rating_values)

1102

***

### Pulling it Together:
Pull all the book and movie data into one dataframe and saving it to a csv for further use

In [57]:
book_info = list(zip(book_titles, book_rating_values, book_rating_counts, book_published_dates, book_lengths))

In [58]:
book_data = pd.DataFrame(book_info, columns = ['Book_Title', 'Book_Rating_Value', 'Book_Rating_Count', 'Book_Publication_Year', "Book_Length"])

In [59]:
movie_info = list(zip(movie_titles, movie_rating_values, movie_rating_counts, movie_released_dates, movie_run_times,))

In [60]:
movie_data = pd.DataFrame(movie_info, columns = ['Movie_Title', 'Movie_Rating_Value','Movie_Rating_Count', 'Movie_Released_Year', 'Movie_Run_Time',])

In [61]:
info = list(zip(movie_titles, movie_rating_values, movie_rating_counts, 
                movie_released_dates, movie_run_times, book_titles, book_rating_values, 
                book_rating_counts, book_published_dates, book_lengths))

In [62]:
data = pd.DataFrame(info, columns = ['Movie_Title', 'Movie_Rating_Value', 
                                     'Movie_Rating_Count', 'Movie_Released_Year', 'Movie_Run_Time', 'Book_Title', 
                                     'Book_Rating_Value', 'Book_Rating_Count', 'Book_Publication_Year', "Book_Length"])



In [63]:
data.tail()

Unnamed: 0,Movie_Title,Movie_Rating_Value,Movie_Rating_Count,Movie_Released_Year,Movie_Run_Time,Book_Title,Book_Rating_Value,Book_Rating_Count,Book_Publication_Year,Book_Length
1090,Elephant Walk (1954),6.3,2024.0,1996.0,1h 43min,"Elephant Walk / Standish, Robert",6.0,2,1988,[278]
1091,Ella Enchanted (2004),6.3,58844.0,2020.0,1h 36min,"Ella Enchanted / Levine, Gail Carson",7.62,345746,1886,[232]
1092,Elle (2016),7.1,57006.0,1976.0,2h 10min,Oh /,7.62,345746,1886,[44]
1093,Elmer Gantry (1960),7.8,9587.0,,2h 26min,"Elmer Gantry / Lewis, Sinclair",7.54,17144,1958,[352]
1094,Emerald City (series) (2016),,,,,"Wonderful Wizard of Oz, The (series) / Baum, L...",7.4,736,1958,[154]


In [64]:
data.to_csv('movies_and_books.csv', index=False) #all pages 

In [65]:
movie_data.to_csv('movies.csv', index=False) #all pages 

In [66]:
book_data.to_csv('books.csv', index=False) #all pages 