In [None]:
import requests
from os import path
from es_client import ElasticClient
# import ltr
# import ltr.client as client
# import ltr.index as index
# import ltr.helpers.movies as helpers

In [None]:
def download_one(uri, dest='data/', force=False):
    """Download ``uri`` into directory ``dest``, keeping the URI's last path segment as the file name.

    Args:
        uri: URL to fetch. Everything after the final ``/`` is used as the
            local file name.
        dest: Target directory; created if it does not exist.
        force: When False (default) an existing file is left untouched and
            the download is skipped. When True the file is fetched again.

    Returns:
        None.

    Raises:
        ValueError: if ``dest`` exists but is not a directory, or if no file
            name can be derived from ``uri`` (e.g. it ends with ``/``).
        requests.HTTPError: if the server answers with an error status.
    """
    import os

    if not os.path.exists(dest):
        os.makedirs(dest)

    if not os.path.isdir(dest):
        raise ValueError("dest {} is not a directory".format(dest))

    filename = uri[uri.rfind('/') + 1:]
    if not filename:
        # Without this guard the joined path is the directory itself, which
        # "already exists", so the download would be silently skipped.
        raise ValueError("cannot derive a file name from uri {}".format(uri))

    filepath = os.path.join(dest, filename)
    if os.path.exists(filepath):
        if not force:
            print(filepath + ' already exists')
            return
        print("exists but force=True, Downloading anyway")

    # Stream into a side file and move it into place only after a complete,
    # successful transfer; a failed request must not leave a truncated file
    # that later runs would mistake for a finished download.
    partial = filepath + '.part'
    print('GET {}'.format(uri))
    try:
        with requests.get(uri, stream=True) as resp:
            resp.raise_for_status()
            with open(partial, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        out.write(chunk)
        os.replace(partial, filepath)
    finally:
        # Only present here if the transfer failed part-way.
        if os.path.exists(partial):
            os.remove(partial)

def download(uris, dest='data/', force=False):
    """Fetch each URI in ``uris`` into ``dest`` by calling ``download_one`` in order.

    ``dest`` and ``force`` are passed through unchanged for every URI.
    """
    for target in uris:
        download_one(target, dest, force)

In [None]:
# Location of the TMDB movie corpus (a single JSON file).
corpus = 'http://es-learn-to-rank.labs.o19s.com/tmdb.json'
# Fetch it into data/. force is left at its default (False), so an existing
# data/tmdb.json is not downloaded again.
download([corpus], dest='data/')

In [None]:
# Search-engine client built with ElasticClient's default arguments; it is
# passed to rebuild() further down to create and fill the index.
client = ElasticClient()

In [None]:
import json
from tqdm import tqdm

class Memoize:
    """ Decorator that caches a function's results by its call arguments.

        Every positional and keyword argument must be hashable because the
        arguments form the dictionary key of the cache. The cache is never
        evicted.

        Adapted from
        https://stackoverflow.com/questions/1988804/what-is-memoization-and-how-can-i-use-it-in-python"""

    # Unique marker placed between the positional and keyword parts of a
    # cache key so that the two can never be confused with each other.
    _KWARGS_MARK = object()

    def __init__(self, f):
        """Wrap ``f`` and start with an empty cache."""
        import functools

        # Copy __name__/__doc__ etc. so the decorated function still
        # introspects like the original. Done first so that attributes copied
        # from ``f`` cannot overwrite ``self.f`` / ``self.memo``.
        functools.update_wrapper(self, f)
        self.f = f
        self.memo = {}

    def __call__(self, *args, **kwargs):
        """Return the cached result for these arguments, computing it on first use."""
        # Positional-only calls keep the plain ``args`` tuple as their key.
        key = args
        if kwargs:
            key += (self._KWARGS_MARK,) + tuple(sorted(kwargs.items()))
        if key not in self.memo:
            self.memo[key] = self.f(*args, **kwargs)
        #Warning: You may wish to do a deepcopy here if returning objects
        return self.memo[key]

@Memoize
def load_movies(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object.

    The result is memoized per path by ``Memoize``, so repeated calls return
    the same object without re-reading the file; callers that mutate it
    affect every later caller.
    """
    # Close the handle deterministically and decode as UTF-8 rather than the
    # platform default encoding.
    with open(json_path, encoding='utf-8') as movies_file:
        return json.load(movies_file)

def get_movie(tmdb_id, movies='data/tmdb.json'):
    """Return the entry stored under ``tmdb_id`` in the movie file ``movies``.

    ``tmdb_id`` is converted with ``str()`` before the lookup; a missing id
    raises ``KeyError``.
    """
    catalog = load_movies(movies)
    return catalog[str(tmdb_id)]

def noop(src_movie, base_doc):
    """Default enrich hook: ignore ``src_movie`` and hand back ``base_doc`` as is."""
    unchanged = base_doc
    return unchanged


def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
    uses a generator to generate bulk index/update actions

    Args:
        enrich: callable ``enrich(src_movie, base_doc)`` whose return value is
            what gets yielded; the default returns ``base_doc`` unchanged.
        movies: path of the TMDB JSON file, loaded through ``load_movies``.

    Yields:
        One document per movie, in the file's iteration order. Movies for
        which building the document (or ``enrich``) raises ``KeyError`` are
        skipped.
    """
    movies = load_movies(movies)
    for movieId, tmdbMovie in tqdm(movies.items(),total=len(movies)):
        try:
            # Both values are recomputed for every movie. A missing, empty or
            # null release date yields None for date and year instead of
            # reusing the previous movie's year (or failing on the first one).
            releaseDate = tmdbMovie.get('release_date') or None
            releaseYear = releaseDate[0:4] if releaseDate else None

            full_poster_path = ''
            poster_path = tmdbMovie.get('poster_path')
            if poster_path:
                full_poster_path = 'https://image.tmdb.org/t/p/w185' + poster_path

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                        }
            doc = enrich(tmdbMovie, base_doc)
        except KeyError: # Ignore any movies missing these attributes
            continue
        # Yield outside the try so a KeyError raised by the consumer is not
        # swallowed here.
        yield doc

In [None]:
def rebuild(client, index, doc_src, force = False):
    """ Create ``index`` through ``client`` and load ``doc_src`` into it.

        If the index already exists it is left untouched and a hint is
        printed, unless ``force`` is truthy, in which case it is deleted
        first and then recreated and reloaded. Always returns None.
    """
    already_there = client.check_index_exists(index)

    # Guard clause: an existing index is only replaced on explicit request.
    if already_there and not force:
        print("Index {} already exists. Use `force = True` to delete and recreate".format(index))
        return None

    if already_there:
        client.delete_index(index)

    client.create_index(index)
    client.index_documents(index, doc_src=doc_src)

In [None]:
# indexable_movies is a generator function, so this only creates the
# generator; the JSON file is not read until something iterates over it.
movies = indexable_movies(movies='data/tmdb.json')
# force is left at its default (False): if the 'tmdb' index already exists,
# rebuild prints a hint and indexes nothing.
rebuild(client, index='tmdb', doc_src=movies)