In [67]:
import json
import pickle
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from lxml import html

from src.movie_class import *

In [13]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to project code such as src/movie_class.py are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Code to scrape movie metadata
- Starts with the list of movies that made it through the prefiltering stage (see movie-prefilters.ipynb).
- Instantiates Movie() class objects and populates them with info scraped from TMDB (themoviedb.org).
- This is tester code; movie_class.py was the actual script used. It scraped the data and pickled it periodically. The repeated pickling was necessary because the website occasionally refused requests, and it took a series of requests to get all the desired movie metadata.
- The class objects were stored in a list for pickling.

In [15]:
# Load the pre-filtered movie list and keep only the first `recs` rows as a small
# sample for trying out the scraping code.
recs = 5
movie_list = pd.read_csv('data/filtered_links.csv')
time_test = movie_list.iloc[:recs]
time_test.head()

Unnamed: 0,movieId,imdbId,tmdbId,count,mean,std,stat_score
0,1,114709,862.0,57309,3.893708,0.921552,5.276036
1,2,113497,8844.0,24228,3.251527,0.959851,4.691304
2,3,113228,15602.0,11804,3.142028,1.008443,4.654692
3,4,114885,31357.0,2523,2.853547,1.108531,4.516344
4,5,113041,11862.0,11714,3.058434,0.996611,4.553351


In [16]:
# TMDB id interpolated into the api.themoviedb.org URLs of the exploratory cells below.
# Uncomment exactly one alternative to explore a different movie.
movie_id = 880 #antonia
# movie_id = 114750 # Dear White People
# movie_id = 55721 # bridesmaids
# movie_id = 629 #The Usual Suspects
# movie_id = 496243 #Parasite
# movie_id = 50348 # Lincoln Lawyer
# movie_id = 637 #Life is Beautiful

# Rebuild the small test sample. The size comes from `recs` (set when the CSV was
# loaded) instead of a repeated literal 5, so the two cells cannot drift apart.
time_test = movie_list[:recs]
time_test.head(1)

Unnamed: 0,movieId,imdbId,tmdbId,count,mean,std,stat_score
0,1,114709,862.0,57309,3.893708,0.921552,5.276036


In [17]:
# Build one populated Movie object per row of the test sample.
#
# Each row is taken as a Series (as the original `time_test.iloc[i]` did), so values
# from mixed int/float columns arrive upcast, e.g. the string passed on looks like '862.0'.
# Fields are selected by position with `.iloc`: plain integer `[]` indexing on a
# label-indexed Series is deprecated positional access in pandas 2.x.
# NOTE(review): positions 0/2 and 4/5 presumably hold movieId/tmdbId and mean/std --
# confirm against the CSV header before switching to column names.
movies = []
for _, row in time_test.iterrows():
    curr_movie = Movie()
    curr_movie.load_tmdb_features(str(row.iloc[0]), str(row.iloc[2]))
    curr_movie.load_movielens_features(str(row.iloc[4]), str(row.iloc[5]))
    movies.append(curr_movie)


### Reading in the scraped, pickled movie metadata into one dataset.

In [18]:
# Load one batch of the scraped, pickled movie metadata into d1.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl0_400.pkl', 'rb') as pkl_file:
    d1 = pickle.load(pkl_file)

In [19]:
# Load one batch of the scraped, pickled movie metadata into d2.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl_401-1400.pkl', 'rb') as pkl_file:
    d2 = pickle.load(pkl_file)

In [20]:
# Load one batch of the scraped, pickled movie metadata into d3.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl1401_10200.pkl', 'rb') as pkl_file:
    d3 = pickle.load(pkl_file)

In [21]:
# Load one batch of the scraped, pickled movie metadata into d4.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl_10200_12800.pkl', 'rb') as pkl_file:
    d4 = pickle.load(pkl_file)

In [22]:
# Load one batch of the scraped, pickled movie metadata into d5.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl_12800_14600.pkl', 'rb') as pkl_file:
    d5 = pickle.load(pkl_file)

In [23]:
# Load one batch of the scraped, pickled movie metadata into d6.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this project produced.
# The handle is not named `input`, which would shadow the builtin input() for the rest of the session.
with open('data/mv_pkl_14600_15000.pkl', 'rb') as pkl_file:
    d6 = pickle.load(pkl_file)

In [37]:
# Stitch the six pickled batches into a single list, in order. d1 is cut to its first
# 401 items and d6 drops its first item -- presumably to remove entries duplicated
# where consecutive batches overlap (TODO confirm).
movie_meta = [*d1[:401], *d2, *d3, *d4, *d5, *d6[1:]]

In [None]:
# Write the combined movie metadata list to a single pickle file as a backup.
with open('data/mv_pkl.pkl', mode='wb') as backup_file:
    pickle.dump(movie_meta, backup_file)

### Converting the pickled class objects to a DataFrame required first converting them to dictionaries.

In [100]:
# Movie objects cannot be fed to a DataFrame directly, so the class has a
# `convert_to_dict` method. Preview the dict it returns for the first movie.
movie_meta[0].convert_to_dict()

{'title': 'Toy Story',
 'poster_path': '/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg',
 'movielensId': '1.0',
 'movielens_mean_rating': '3.8937077945872383',
 'movielens_std_rating': '0.9215518909655429',
 'tmdbId': '862.0',
 'imdbId': 'tt0114709',
 'budget': 30000000,
 'revenue': 373554033,
 'tmdb_popularity': 47.156,
 'tmdb_vote_average': 7.9,
 'tmdb_vote_count': 12920,
 'release_date': 1995,
 'tmdb_overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'tmdb_original_ln': ('en',),
 'cast': [{'cast_id': 14,
   'character': 'Woody (voice)',
   'credit_id': '52fe4284c3a36847f8024f95',
   'gender': 2,
   'id': 31,
   'name': 'Tom Hanks',
   'order': 0,
   'profile_path': '/xndWFsBlClOJFRdhSt4NBwiPq2o.jpg'},
  {'cast_id': 15,
   'character'

In [102]:
# Turn every Movie object into its dict form, then build the DataFrame from those records.
movie_records = [movie.convert_to_dict() for movie in movie_meta]
movie_meta_clean = pd.DataFrame.from_records(movie_records)

In [None]:
# Display the first row of the converted DataFrame as a quick sanity check.
movie_meta_clean.head(1)

### Code snippets used during building of the Movie() class

In [None]:
# Movie details from TMDB. Contains overview, genres, budget, revenue, popularity, vote_average, vote_count, release_date.

# The key is sent through `params` so the printed URL (and any saved cell output) does not expose it.
url = f'https://api.themoviedb.org/3/movie/{movie_id}'
r = requests.get(url, params={'api_key': api_key, 'language': 'en-US'}, allow_redirects=False)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of trying to parse an error body
if r.status_code == 200:
    print(f"Success {r.status_code}, {url}")

# The endpoint returns JSON, so decode it directly rather than round-tripping the body
# through an HTML parser (which can alter '<' or '&' inside text fields). The result is
# not named `dict`, which would shadow the builtin for the rest of the kernel session.
movie_details = r.json()
movie_details

In [None]:
# Recommendations from TMDB---ooh this looks good. Love the recs under Parasite. Want to see many of these.

# The key is sent through `params` so the printed URL (and any saved cell output) does not expose it.
url = f'https://api.themoviedb.org/3/movie/{movie_id}/recommendations'
r = requests.get(url, params={'api_key': api_key, 'language': 'en-US'}, allow_redirects=False)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of trying to parse an error body
if r.status_code == 200:
    print(f"Success {r.status_code}, {url}")

# Decode the JSON body directly (no HTML parser) and avoid shadowing the builtin `dict`.
recommendations = r.json()
for mv in recommendations['results']:
    # Guard the ratio so an entry with a vote_average of 0 prints nan instead of
    # raising ZeroDivisionError and aborting the listing.
    votes_ratio = mv['vote_count'] / mv['vote_average'] if mv['vote_average'] else float('nan')
    print(mv['popularity'], mv['vote_average'], votes_ratio, mv['title'], mv['release_date'], mv['genre_ids'])

In [None]:
# Similar from TMDB -- not nearly so helpful. Similars for Parasite are movies made in Korea.
# Reviews are also not helpful. They're random and inconsistent.

movie_id = 496243
# The key is sent through `params` so the printed URL (and any saved cell output) does not expose it.
url = f'https://api.themoviedb.org/3/movie/{movie_id}/similar'
r = requests.get(url, params={'api_key': api_key, 'language': 'en-US'}, allow_redirects=False)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of trying to parse an error body
if r.status_code == 200:
    print(f"Success {r.status_code}, {url}")

# Decode the JSON body directly (no HTML parser) and avoid shadowing the builtin `dict`.
similar = r.json()
# Show the first similar movie; an empty result list displays nothing instead of raising IndexError.
similar['results'][0] if similar['results'] else None

In [None]:
# Genre list from TMDB
movie_id = 496243  # not used by this endpoint; assignment kept so the notebook's global state is unchanged
# The key is sent through `params` so the printed URL (and any saved cell output) does not expose it.
url = 'https://api.themoviedb.org/3/genre/movie/list'
r = requests.get(url, params={'api_key': api_key, 'language': 'en-US'}, allow_redirects=False)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of trying to parse an error body
if r.status_code == 200:
    print(f"Success {r.status_code}, {url}")

# The original called `jsoup.prettify()`, an undefined name that raised NameError before
# anything was parsed. Decode the JSON body directly instead, and do not shadow the builtin `dict`.
genres = r.json()
genres


### NYT reviews. 
- The API, queried by title, returns a link to the review page. Attempts to extract the actual review text from that web page were not successful.

In [None]:
# url = f'https://api.nytimes.com/svc/movies/v2/reviews/search.json?query=dear-white-people&api-key={nyt_api_key}'

# header_map = [{"User-Agent": "Mozilla/5.0"}, {"User-Agent": "XY"}, {}]

# r = requests.get(url, allow_redirects=True)
# if r.status_code == 200:
#     print(f"Success {r.status_code}, {url}")
#     soup = BeautifulSoup(r.content, "html")
#     soup.p.string
#     print(soup.prettify())
#     dict = json.loads(soup.p.get_text())
#     url2 = dict['results'][0]['link']['url']
#     for header in header_map:
#         r2 = requests.get(url2, allow_redirects=True)
#         print(r2)
#         if r2.status_code == 200:
#             print(f"Redirect success {r2.status_code}, {url2}")
#             soup2 = BeautifulSoup(r2.content, "html")
#             print(soup2.prettify())
#             json_data = soup2.find_all(type="application/ld+json")
#             for i in range(len(json_data)):
#                 json_soup = json.loads(json_data[i].string)
#                 print(json_soup)