# Augment the Data

Let us augment the item (movie) data with additional fields from IMDb and/or TMDB.

In [2]:
import numpy as np
import pandas as pd
import json, glob
import requests
import re
import tmdb
from importlib import reload
from reco.recoutils import create_directory, update_progress

## Get the Movie Features from TMDB

- We will use the "links.csv" which contains:
    - movie_id
    - imdb_id
    - tmdb_id
- Source: Scraper + Manual Enhancement
     - Bootstrapped using the scraper to get the IMDB title link
     - Enhanced through search on TMDB database to get complete list

In [3]:
links = pd.read_csv("links.csv", dtype={"tmdb_id": 'Int64' })

In [4]:
links.head()

Unnamed: 0,movie_id,imdb_id,tmdb_id
0,1,tt0114709,862
1,2,tt0113189,710
2,3,tt0113101,5
3,4,tt0113161,8012
4,5,tt0112722,1710


In [5]:
links.shape

(1682, 3)

## Get Movie Features from TMDB

In [6]:
sample_id = links.tmdb_id[0]
tmdb.movie(sample_id)

ConnectionError: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/862?api_key=<REDACTED>&language=en_US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B0F3E47490>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [None]:
# tmdb_id is a nullable Int64 column; drop the missing ids so that only real
# TMDB ids are sent to the API.
tmdb_ids = links.tmdb_id.dropna().tolist()

In [None]:
def get_movie_features(tmdb_ids):
    """
    Look up every id in ``tmdb_ids`` with ``tmdb.movie`` and collect the results.

    Results that compare equal to ``False`` are left out. Progress is pushed
    to ``update_progress`` after each id and once more at the end.

    tmdb_ids (list): TMDB movie ids to look up
    returns (list): the kept results, in the order of ``tmdb_ids``
    """
    total = len(tmdb_ids)
    collected = []

    for done, tmdb_id in enumerate(tmdb_ids, start=1):
        record = tmdb.movie(tmdb_id)
        # NOTE(review): presumably tmdb.movie signals a failed lookup with
        # False - confirm against the tmdb helper module.
        if record != False:
            collected.append(record)
        update_progress(done / total)

    # Final call so the bar ends at 100% even when there was nothing to fetch
    update_progress(1)
    return collected

In [None]:
def save_movie_features(features):
    """
    Write ``features`` as JSON to data/features.json, replacing any existing file.

    features: JSON-serialisable object to store
    """
    with open("data/features.json", 'w') as json_file:
        json.dump(features, json_file)

In [None]:
# Uncomment and Run this to download movie features from tmdb

#features = get_movie_features(tmdb_ids)
#save_movie_features(features)

## Create Augmented Item Features

In [None]:
df_features = pd.read_json("data/features.json")
#df_features = pd.DataFrame.from_dict(features)

In [None]:
# Keep only the links that have a TMDB id, restricted to the two join columns
links_full = links.loc[links.tmdb_id.notna(), ["movie_id", "tmdb_id"]]

In [None]:
# Attach movie_id to each TMDB feature row by matching the feature "id" to
# "tmdb_id"; the left join keeps every feature row even without a match.
df_item_features = pd.merge(left=df_features, right=links_full, left_on="id", right_on="tmdb_id", how="left")

In [None]:
df_item_features.head()

In [None]:
df_item_features.to_csv("data/item_features.csv", index=None)

## Get Movie Posters

In [None]:
# Create Poster Directory
# Relative path, matching where poster() writes ("data/posters/") and the
# other "data/..." paths used in this notebook.
create_directory("data/posters")

In [None]:
def poster(movie_id, df):
    """
    Download the movie poster for movie_id and save it under data/posters/

    movie_id: id used to select the row in df and to name the saved file
    df (pd.Dataframe): pandas dataframe with movie_id and poster_path columns

    returns (bool): True when the image was fetched (HTTP 200) and written to
    data/posters/<movie_id>.jpg, False when the movie has no poster path or
    the server answered with any other status code

    raises IndexError when movie_id does not occur in df
    """
    directory_path = "data/posters/"
    base_url = "https://image.tmdb.org/t/p/w92"

    poster_path = df[df.movie_id == movie_id].poster_path.values[0]
    # A missing path can show up as None, NaN/NA or an empty string. None of
    # these names an image, so skip the request instead of building a bad URL.
    if pd.isna(poster_path) or poster_path == "":
        return False

    save_path = directory_path + str(movie_id) + ".jpg"
    url = base_url + poster_path

    # The context manager releases the streamed connection on every exit path
    with requests.request("GET", url, stream=True) as response:
        if response.status_code != 200:
            return False
        with open(save_path, 'wb') as out_file:
            # iter_content decodes any transfer encoding and streams the body
            # in chunks, so no extra module is needed to copy it to disk
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    return True

In [None]:
#df_item_features[df_item_features.poster_path.isna()][["movie_id", "poster_path"]]

In [None]:
poster(1, df_item_features)

In [None]:
from IPython.display import Image 
Image(filename='data/posters/1.jpg')

In [None]:
def get_all_posters(df):
    """
    Get all posters from the dataframe and save with movie_id

    Calls poster() once per entry of df.movie_id and reports progress through
    update_progress after each one.

    df (pd.Dataframe): pandas dataframe with movie_id and poster_path columns

    returns (pd.Dataframe): the same dataframe, modified in place, with an
    "image" column holding the poster() result for each row
    """
    id_list = df.movie_id.tolist()
    num = len(id_list)

    # Collect one result per row. Assigning poster()'s scalar result to
    # df["image"] inside the loop would overwrite the whole column each time
    # and leave only the outcome of the last download.
    downloaded = []
    for tick, movie_id in enumerate(id_list, start=1):
        downloaded.append(poster(movie_id, df))
        update_progress(tick / num)

    update_progress(1)
    df["image"] = downloaded
    return df

In [None]:
# Uncomment and run this to get posters
# get_all_posters(df_item_features)