In [1]:
import json

from es_client import ElasticClient
from download import download

# import ltr
# import ltr.client as client
# import ltr.index as index
# import ltr.helpers.movies as helpers

from judgement import Judgment, judgments_to_file

In [2]:
# URL of the TMDB movie corpus (a JSON file) that the rest of the notebook indexes.
corpus = 'http://es-learn-to-rank.labs.o19s.com/tmdb.json'
# Fetch it into data/; per the recorded output, the helper reports when the file already exists.
download([corpus], dest='data/')

data/tmdb.json already exists


In [3]:
# Search-engine client shared by the indexing and LTR cells below.
client = ElasticClient()

In [4]:
class Memoize:
    """ Cache a function's results, keyed by its positional arguments.

        Adapted from
        https://stackoverflow.com/questions/1988804/what-is-memoization-and-how-can-i-use-it-in-python

        Use as a decorator. The first call with a given argument tuple runs
        the wrapped function and stores the result in ``self.memo``; later
        calls with an equal tuple return the stored object. Arguments must be
        hashable (they are used as a dict key); keyword arguments are not
        supported."""
    def __init__(self, f):
        # Function-scope import keeps this class self-contained.
        import functools
        # Copy __name__/__doc__/__wrapped__ from f so the decorated function
        # still introspects correctly. Done before setting f/memo so that
        # attributes copied from f.__dict__ can never clobber them.
        functools.update_wrapper(self, f)
        self.f = f
        self.memo = {}
    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.f(*args)
        #Warning: You may wish to do a deepcopy here if returning objects
        # (every caller receives the same cached object, so mutations are shared)
        return self.memo[args]

@Memoize
def load_movies(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object.

    Wrapped in ``Memoize``, so each distinct path is read from disk once and
    every later call with that path returns the same cached object.
    """
    # Close the handle deterministically instead of leaking it, and decode as
    # UTF-8 (the JSON interchange encoding) rather than the platform default.
    with open(json_path, encoding='utf-8') as movies_file:
        return json.load(movies_file)

def get_movie(tmdb_id, movies='data/tmdb.json'):
    """Return the record stored under ``tmdb_id`` in the movies JSON file.

    ``tmdb_id`` is converted with ``str()`` before the lookup, so integer ids
    work as well as strings. Raises ``KeyError`` when the id is not present.
    """
    corpus = load_movies(movies)
    return corpus[str(tmdb_id)]

def noop(src_movie, base_doc):
    """Default enrich hook: return ``base_doc`` unchanged, ignoring ``src_movie``."""
    return base_doc


def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
    uses a generator to generate bulk index/update actions

    For every entry of the movies JSON file a flat document is built and
    passed through ``enrich(tmdbMovie, base_doc)``; whatever ``enrich``
    returns is yielded. Movies that raise ``KeyError`` while the document is
    built (a required attribute is missing) or inside ``enrich`` are skipped.

    ``release_date`` / ``release_year`` are ``None`` when the movie has no
    (or an empty) release date; ``release_year`` is otherwise the first four
    characters of the date string."""
    corpus = load_movies(movies)
    for movieId, tmdbMovie in corpus.items():
        try:
            # Both values are recomputed for every movie. Previously
            # releaseYear was only assigned when a date existed, so a dateless
            # movie either crashed with UnboundLocalError or inherited the
            # previous movie's year.
            releaseDate = tmdbMovie.get('release_date') or None
            releaseYear = releaseDate[0:4] if releaseDate else None

            # Missing, None and empty poster paths all map to ''.
            posterPath = tmdbMovie.get('poster_path')
            full_poster_path = ('https://image.tmdb.org/t/p/w185' + posterPath) if posterPath else ''

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                      }
            yield enrich(tmdbMovie, base_doc)
        except KeyError: # Ignore any movies missing these attributes
            continue

In [5]:
def rebuild(client, index, doc_src, force = False):
    """ (Re)create ``index`` and fill it with the documents from ``doc_src``.

        If the index already exists it is left untouched unless ``force`` is
        truthy; in that case it is deleted first. A missing index is always
        created and populated. Always returns None.
    """
    already_there = client.check_index_exists(index)

    # Guard clause: never clobber an existing index without explicit consent.
    if already_there and not force:
        print("Index {} already exists. Use `force = True` to delete and recreate".format(index))
        return None

    if already_there:
        client.delete_index(index)

    client.create_index(index)
    client.index_documents(index, doc_src=doc_src)

In [6]:
# Build the document generator over the local corpus, then hand it to rebuild().
# With the default force=False an existing 'tmdb' index is left as-is (see the recorded output).
movies = indexable_movies(movies='data/tmdb.json')
rebuild(client, index='tmdb', doc_src=movies)

Index tmdb already exists. Use `force = True` to delete and recreate




In [7]:
# Wipes out any existing LTR models/feature sets in the tmdb index; the recorded
# output shows the default LTR feature store being removed and then re-initialized.
client.reset_ltr(index='tmdb')

Removed Default LTR feature store [Status: 200]
Initialize Default LTR feature store [Status: 200]


In [8]:
# Feature set definition containing a single feature named "release_year".
feature_set = {
    "featureset": {
        "features": [
            {
                "name": "release_year",
                "params": [],  # this feature takes no parameters
                "template": {
                    # function_score over match_all, scored by the document's release_year field
                    "function_score": {
                        "field_value_factor": {
                            "field": "release_year",
                            "missing": 2000  # fallback for documents without the field (per Elasticsearch field_value_factor docs)
                        },
                        "query": { "match_all": {} }
                    }
                }
            }
        ]
    }
}

# Bare expression so the notebook renders the dict as the cell output.
feature_set

{'featureset': {'features': [{'name': 'release_year',
    'params': [],
    'template': {'function_score': {'field_value_factor': {'field': 'release_year',
       'missing': 2000},
      'query': {'match_all': {}}}}}]}}

In [9]:
# Pushes the feature_set dict defined above to the tmdb index's LTR store (a hidden index)
# under the name 'release'; the synthesize() call below refers to it by that name.
client.create_featureset(index='tmdb', name='release', ftr_config=feature_set)

Create release feature set [Status: 201]


実際のトレーニングセットを使う前に、2種類のモデル例を試してみます。1つは常に新しい映画を好むモデル、もう1つは常に古い映画を好むモデルです。興味があれば、以下のセルを実行した後に data/classic-training.txt と data/latest-training.txt を開いて、トレーニングセットがどのような形式になっているかを確認できます。

In [10]:
def get_classic_rating(year):
    """Grade a release year for a viewer who prefers older films.

    Returns 0 for years after 2010, 1 after 1990, 2 after 1970, 3 after 1950
    and 4 for anything else (each cutoff is exclusive).
    """
    # (exclusive lower bound, grade), checked from newest to oldest
    bands = ((2010, 0), (1990, 1), (1970, 2), (1950, 3))
    for cutoff, grade in bands:
        if year > cutoff:
            return grade
    return 4

def get_latest_rating(year):
    """Grade a release year for a viewer who prefers recent films.

    Returns 4 for years after 2010, 3 after 1990, 2 after 1970, 1 after 1950
    and 0 for anything else (each cutoff is exclusive).
    """
    # (exclusive lower bound, grade), checked from newest to oldest
    bands = ((2010, 4), (1990, 3), (1970, 2), (1950, 1))
    return next((grade for cutoff, grade in bands if year > cutoff), 0)

def _write_biased_judgments(hits, rate, out_path, skip_zero=False):
    """Grade every hit with ``rate`` and write the judgments to ``out_path``.

    ``rate`` is called with the first logged feature value of each hit.
    Hits graded 0 are dropped when ``skip_zero`` is true.
    """
    judgments = []
    for hit in hits:
        rating = rate(hit['ltr_features'][0])

        if rating == 0 and skip_zero:
            continue

        judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords=''))

    with open(out_path, 'w') as out:
        judgments_to_file(out, judgments)


def synthesize(
    client,
    featureSet='release',
    latestTrainingSetOut='data/latest-training.txt',
    classicTrainingSetOut='data/classic-training.txt'
):
    """Write two synthetic training sets from logged feature values.

    Logs the features of ``featureSet`` against the 'tmdb' index, then grades
    every returned hit twice: once with ``get_classic_rating`` (written to
    ``classicTrainingSetOut``) and once with ``get_latest_rating`` (written to
    ``latestTrainingSetOut``). All judgments use qid=1 and empty keywords.

    NOTE(review): grading reads ``hit['ltr_features'][0]`` as a release year,
    which assumes the feature set's first feature is the release year.
    """
    # Flip to True to leave grade-0 hits out of the training files.
    NO_ZERO = False

    # Honor the featureSet argument (it used to be ignored in favor of a
    # hard-coded 'release'), and materialize the response because it is
    # traversed twice below.
    hits = list(client.log_query('tmdb', featureSet, None))

    # A classic film fan
    print("Generating 'classic' biased judgments:")
    _write_biased_judgments(hits, get_classic_rating, classicTrainingSetOut, skip_zero=NO_ZERO)

    # A current film fan
    print("Generating 'recent' biased judgments:")
    _write_biased_judgments(hits, get_latest_rating, latestTrainingSetOut, skip_zero=NO_ZERO)

In [11]:
# Generate the 'classic' and 'latest' judgment files under data/.
synthesize(
    client, 
    featureSet='release', # must match the name set in client.create_featureset(...)
    classicTrainingSetOut='data/classic-training.txt',
    latestTrainingSetOut='data/latest-training.txt'
)

{'query': {'bool': {'filter': [{'sltr': {'_name': 'logged_features', 'featureset': 'release', 'params': {}}}]}}, 'ext': {'ltr_log': {'log_specs': {'name': 'ltr_features', 'named_query': 'logged_features'}}}, 'size': 1000}
Generating 'classic' biased judgments:
Generating 'recent' biased judgments:
