In [None]:
# This notebook uses MovieLens movie data (movieId, tmdbId, ...) to pull the matching movie summaries from TMDb.
# The pulled data is stored in the movies_db file as JSON lines: one record per movie, each containing id (the TMDb id), title, summary, cast and rating.

In [180]:
#from imdb_api import IMDb
# from imdb import Cinemagoer

import json
import re

import pandas as pd
from requests import HTTPError

# TMDb
from themoviedb import TMDb

In [None]:
# helper functions

def castSubset(cast_data, n = 3):
    """Return the names of the first ``n`` cast members by billing order.

    Parameters
    ----------
    cast_data : iterable or None
        Cast entries as pulled from TMDb credits. Each entry must expose an
        ``order`` attribute (used as the sort key, ascending) and a ``name``
        attribute.
    n : int, default 3
        Maximum number of names to return.

    Returns
    -------
    list of str
        Up to ``n`` names, sorted by ascending ``order``. An empty list is
        returned when ``cast_data`` is None or empty.
    """
    if not cast_data:
        return []

    def billing_key(member):
        # Entries without an order cannot be compared with ints; push them
        # to the end instead of letting sorted() raise a TypeError.
        missing = member.order is None
        return (missing, 0 if missing else member.order)

    sorted_cast = sorted(cast_data, key = billing_key)

    # max() guards against a negative n silently slicing from the end
    return [member.name for member in sorted_cast[:max(n, 0)]]
  

In [2]:
# read API key to access TMDb
# strip() removes surrounding whitespace: read() returns the file verbatim, so a
# trailing newline in the key file would otherwise become part of the key
with open("TMDb_key.txt") as my_key:
    api_key = my_key.read().strip()

In [3]:
# ia = Cinemagoer()

# Create the TMDb client used for every request in this notebook,
# authenticating with the API key read from TMDb_key.txt.
tmdb = TMDb(key = api_key)

In [9]:
#tmdb.movies().top_rated()
# Fetch a single sample movie together with its credits (append_to_response)
# so the structure of the returned object can be inspected in the cells below.
# NOTE(review): the id is passed as a float literal here, while the bulk pull
# casts ids to int first — consider passing 162892 for consistency.
movie = tmdb.movie(movie_id = 162892.0).details(append_to_response = "credits")

In [10]:
movie

Movie(id=162892, poster_path='/ytHtwRIz6De6jYFNi31wdffaXyK.jpg', adult=False, overview='A Girl Thing is a mini-series that revolves around a New York city street, a coffee house and a shrinks office. Dr. Beth Noonan is the therapist to one star per hour. Hour one deals with a woman not capable of having a relationship. Hour Two is about sisters who hate each other, trying to get along one last time. Hour Three is about adultery.', release_date=datetime.date(2001, 1, 19), genre_ids=None, original_title='A Girl Thing', original_language='en', title='A Girl Thing', backdrop_path=None, popularity=1.517, vote_count=7, video=False, vote_average=4.214, media_type=<MediaType.movie: 'movie'>, belongs_to_collection=None, budget=0, genres=[], homepage='', imdb_id='tt0249603', production_companies=[], production_countries=[], revenue=0, runtime=237, spoken_languages=[], status='Released', tagline="Sex, love, relationships, family... It's enough to drive you crazy.", alternative_titles=None, credit

In [76]:
movie.credits.cast

[Cast(id=689, adult=False, gender=1, known_for_department='Acting', name='Kate Capshaw', original_name='Kate Capshaw', popularity=29.118, profile_path='/dhwyfvVeF7TNnfMY74Wkab9iCgm.jpg', cast_id=1, character='Casey Montgomery', credit_id='52fe4c60c3a36847f822aff7', order=0, total_episode_count=None, roles=None),
 Cast(id=8893, adult=False, gender=1, known_for_department='Acting', name='Stockard Channing', original_name='Stockard Channing', popularity=30.812, profile_path='/qAYBGvapAnnhpoZuYFyoqetncs5.jpg', cast_id=5, character='Dr. Beth Noonan', credit_id='5c707e9ec3a3685a3211181b', order=1, total_episode_count=None, roles=None),
 Cast(id=28412, adult=False, gender=1, known_for_department='Acting', name='Rebecca De Mornay', original_name='Rebecca De Mornay', popularity=33.934, profile_path='/bGSvflCmfywasc3U8QAsJPaAek4.jpg', cast_id=2, character='Kim McCormack', credit_id='52fe4c60c3a36847f822affb', order=2, total_episode_count=None, roles=None),
 Cast(id=12021, adult=False, gender=1, 

In [170]:
sorted(movie.credits.cast, key = lambda x: x.order)[:3]

[Cast(id=689, adult=False, gender=1, known_for_department='Acting', name='Kate Capshaw', original_name='Kate Capshaw', popularity=29.118, profile_path='/dhwyfvVeF7TNnfMY74Wkab9iCgm.jpg', cast_id=1, character='Casey Montgomery', credit_id='52fe4c60c3a36847f822aff7', order=0, total_episode_count=None, roles=None),
 Cast(id=8893, adult=False, gender=1, known_for_department='Acting', name='Stockard Channing', original_name='Stockard Channing', popularity=30.812, profile_path='/qAYBGvapAnnhpoZuYFyoqetncs5.jpg', cast_id=5, character='Dr. Beth Noonan', credit_id='5c707e9ec3a3685a3211181b', order=1, total_episode_count=None, roles=None),
 Cast(id=28412, adult=False, gender=1, known_for_department='Acting', name='Rebecca De Mornay', original_name='Rebecca De Mornay', popularity=33.934, profile_path='/bGSvflCmfywasc3U8QAsJPaAek4.jpg', cast_id=2, character='Kim McCormack', credit_id='52fe4c60c3a36847f822affb', order=2, total_episode_count=None, roles=None)]

In [171]:
l = castSubset(movie.credits.cast)
l

['Kate Capshaw', 'Stockard Channing', 'Rebecca De Mornay']

In [172]:
# Load the MovieLens 25M tables that will be joined by movieId:
# movies.csv and links.csv (ratings.csv is not read here).

movies_ml = pd.read_csv("../datasets/ml-25m/movies.csv")
links_ml = pd.read_csv("../datasets/ml-25m/links.csv")

In [192]:
# Attach the link columns to every movie (matched on movieId), discard the
# IMDb id, and keep only the movies that have a TMDb id.
# Built as a single chain so re-running the cell always starts from movies_ml.
movielens_data = (
    movies_ml
    .join(links_ml.set_index('movieId'), on = 'movieId')
    .drop(columns = ['imdbId'])
    .dropna(subset = ['tmdbId'])
)

# movie Ids linked to the TMDb to be used to pull the summaries
ids_tmdb = movielens_data['tmdbId']
movielens_data

Unnamed: 0,movieId,title,genres,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy,11862.0
...,...,...,...,...
62418,209157,We (2018),Drama,499546.0
62419,209159,Window of the Soul (2001),Documentary,63407.0
62420,209163,Bad Poems (2018),Comedy|Drama,553036.0
62421,209169,A Girl Thing (2001),(no genres listed),162892.0


In [174]:
# sanity check: look up the title of the movie whose TMDb id is stored under index label 1005
title_example = movielens_data.loc[movielens_data['tmdbId'] == ids_tmdb[1005], 'title']
title_example.iloc[0]

'Mary Poppins (1964)'

In [194]:
def load_pulled_ids(db_path = "movies_db"):
    """Collect the ids of the movies already stored in a JSON-lines file.

    Parameters
    ----------
    db_path : str, default "movies_db"
        Path of the JSON-lines file; every non-blank line must be a JSON
        object with an "id" key.

    Returns
    -------
    set
        The "id" value of every record in the file. An empty set is returned
        when the file does not exist yet.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record has no "id" key.
    """
    ids = set()

    try:
        with open(db_path, "r") as movies_file:
            # iterate over the file lazily instead of readlines(), so the
            # whole file is never held in memory at once
            for entry in movies_file:
                entry = entry.strip()
                if not entry:
                    # a blank line would make json.loads raise
                    continue
                ids.add(json.loads(entry)["id"])

    except FileNotFoundError:
        # nothing has been pulled yet
        pass

    return ids


# ids of the movies that are already in the file, so they are not pulled again
pulled_ids = load_pulled_ids("movies_db")


In [202]:
# Matches a trailing release year such as " (1995)" in a MovieLens title.
# Slicing a fixed 7 characters off the end would also truncate titles that
# do not end with a year.
YEAR_SUFFIX = re.compile(r"\s*\(\d{4}\)\s*$")

# TMDb ids whose request raised an HTTPError; kept so that failures can be
# inspected or retried instead of being dropped silently
failed_ids = []

with open("movies_db", "a") as outfile:

    # itertuples avoids building a Series per row as iterrows does
    for row in movielens_data.itertuples(index = False):

        tmdb_id = int(row.tmdbId)

        #check if tmdb_id has already been used
        if tmdb_id in pulled_ids:
            continue

        try:
            #get example from tmdb by tmdb_id using the API
            movie_tmdb_data = tmdb.movie(movie_id = tmdb_id).details(append_to_response = "credits")

        except HTTPError:
            # skip this movie but remember the id
            failed_ids.append(tmdb_id)
            continue

        #title comes from movielens (without the release year),
        #everything else from the TMDb movie object
        movie_title = YEAR_SUFFIX.sub("", row.title)

        movie_summary = movie_tmdb_data.overview
        movie_rating = movie_tmdb_data.vote_average
        movie_cast = castSubset(movie_tmdb_data.credits.cast)

        #dictionary with the relevant features
        ex = {"id": tmdb_id, "title": movie_title,
              "summary": movie_summary,
              "cast": movie_cast,
              "rating": movie_rating}

        #dump to file: one json record per line
        outfile.write(json.dumps(ex) + "\n")

        #mark as pulled so a duplicated tmdb_id is not written twice
        pulled_ids.add(tmdb_id)
    

# In the end I will have a file with one JSON record per line, each line representing a movie.



In [203]:
len(pulled_ids)

61522

In [204]:
movielens_data.shape

(62316, 4)

In [163]:
# for i, row in movielens_data.iterrows():
    
#     print(i, row)
#     print(row.title)
#     print(row.title[:-7])
#     print(row.tmdbId)
#     print(row.genres)
#     break

0 movieId                                              1
title                                 Toy Story (1995)
genres     Adventure|Animation|Children|Comedy|Fantasy
tmdbId                                           862.0
Name: 0, dtype: object
Toy Story (1995)
Toy Story
862.0
Adventure|Animation|Children|Comedy|Fantasy
