In [1]:
from elasticsearch import Elasticsearch, helpers
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm
from pprint import pprint
from itertools import islice
import pickle

In [2]:
# Connect to the "database"
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

### Build list of documents

In [3]:
def count_users():
    """Returns the number of usernames"""
    return es.indices.stats(index='ratings-index')['_all']['total']['docs']['count']

In [4]:
def find_user_ratings():
    """Generator that returns user rating info for each user"""
    query = {
        'query': {'match_all': {}}
    }
    return helpers.scan(client=es,
                        query=query,
                        index='ratings-index',
                        doc_type='ratings')

In [5]:
def find_game_ids():
    """Returns a list of unique game IDs from the database"""
    query = {
        'size': 0,
        'aggs': {'ids': {'terms': {'field': 'id', 'size': 10000}}}
    }
    buckets = es.search(index='games-index', doc_type='games', body=query)['aggregations']['ids']['buckets']
    return np.array([bucket['key'] for bucket in buckets])

In [6]:
def get_game_name(game_id):
    """Retrieve a game's name from ID"""
    return es.get(index='games-index', doc_type='games', id=game_id)['_source']['name']

In [11]:
game_ids = find_game_ids()

In [12]:
# game_names = [get_game_name(game_id) for game_id in game_ids]
game_names = dict()
for game_id in game_ids:
    game_names[game_id] = get_game_name(game_id)

In [13]:
def get_corpus():
    n = count_users()
    corpus = []
    count = 0
    for user_ratings in find_user_ratings():
        username = user_ratings['_source']['username']
        game_ratings = user_ratings['_source']['games']
        liked_games = [game_names[game['id']] for game in game_ratings if game['rating'] >= 7]  # Define "like" as a rating >=7
        corpus.append(liked_games)
        count += 1
        if count == n:
            return corpus

In [14]:
%time corpus = get_corpus()

CPU times: user 9.77 s, sys: 447 ms, total: 10.2 s
Wall time: 13.3 s


### Train vectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Provide our own analyzer, because we don't really want the text analyzed at all
def analyzer(games_list):
    return games_list

In [17]:
vectorizer = CountVectorizer(analyzer=analyzer)

In [18]:
%time corpus_tf = vectorizer.fit_transform(corpus)

CPU times: user 2.19 s, sys: 53.4 ms, total: 2.25 s
Wall time: 2.26 s


### Train the LDA model

In [47]:
model = LatentDirichletAllocation(n_components=15,
                                  max_iter=25,
                                  learning_method='online',
                                  random_state=0,
                                  verbose=1)

In [48]:
%time model.fit(corpus_tf)

iteration: 1 of max_iter: 25
iteration: 2 of max_iter: 25
iteration: 3 of max_iter: 25
iteration: 4 of max_iter: 25
iteration: 5 of max_iter: 25
iteration: 6 of max_iter: 25
iteration: 7 of max_iter: 25
iteration: 8 of max_iter: 25
iteration: 9 of max_iter: 25
iteration: 10 of max_iter: 25
iteration: 11 of max_iter: 25
iteration: 12 of max_iter: 25
iteration: 13 of max_iter: 25
iteration: 14 of max_iter: 25
iteration: 15 of max_iter: 25
iteration: 16 of max_iter: 25
iteration: 17 of max_iter: 25
iteration: 18 of max_iter: 25
iteration: 19 of max_iter: 25
iteration: 20 of max_iter: 25
iteration: 21 of max_iter: 25
iteration: 22 of max_iter: 25
iteration: 23 of max_iter: 25
iteration: 24 of max_iter: 25
iteration: 25 of max_iter: 25
CPU times: user 56min 56s, sys: 60 s, total: 57min 56s
Wall time: 29min 35s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=25, mean_change_tol=0.001,
             n_components=15, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=1)

In [39]:
# Save model to disk
pickle.dump(model, open('lda-vectorized-15topics.model', 'wb'))

In [19]:
# Load a trained model from disk
model = pickle.load(open('./lda-vectorized.model', 'rb'))

### Inspect the LDA model

In [20]:
def print_top_words(model, feature_names, n_top_words):
    """Print the most prominent words in each topic"""
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: \n- " % topic_idx
        message += "\n- ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [21]:
# TODO: print out more detail about each game, like play time, designer, rating, mechanics, theme, year, etc
# to try and find correlation
print_top_words(model, vectorizer.get_feature_names(), 15)


Topic #0: 
- Dominion: Prosperity
- Dominion: Seaside
- Dominion: Intrigue
- 7 Wonders: Leaders
- 7 Wonders: Cities
- Dominion
- Dominion: Alchemy
- Small World: Cursed!
- Dominion: Hinterlands
- Dominion: Cornucopia
- Carcassonne: Expansion 1 – Inns & Cathedrals
- Small World: Grand Dames of Small World
- Small World: Be Not Afraid...
- Carcassonne: Expansion 2 – Traders & Builders
- Dominion: Dark Ages

Topic #1: 
- Twilight Struggle
- Battlestar Galactica: The Board Game
- Twilight Imperium (Third Edition)
- A Game of Thrones: The Board Game (Second Edition)
- Eclipse
- Android: Netrunner
- Mage Knight Board Game
- Memoir '44
- The Lord of the Rings: The Card Game
- Chaos in the Old World
- Star Wars: X-Wing Miniatures Game
- Arkham Horror
- War of the Ring (Second Edition)
- Descent: Journeys in the Dark (Second Edition)
- Runewars

Topic #2: 
- Scythe
- Pandemic Legacy: Season 1
- Blood Rage
- Codenames
- Terraforming Mars
- 7 Wonders Duel
- Dead of Winter: A Crossroads Game
- Gl

In [23]:
import pyLDAvis
from pyLDAvis import sklearn as pyLDAvis_sklearn

In [25]:
%time model_vis_data = pyLDAvis_sklearn.prepare(model, corpus_tf, vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


CPU times: user 1min 11s, sys: 799 ms, total: 1min 12s
Wall time: 28.6 s


In [26]:
pyLDAvis.display(model_vis_data)

### Make some recommendations!

In [27]:
games_liked = ['Splendor', 'Catan', '7 Wonders']
user_features = vectorizer.transform([games_liked])

In [28]:
user_topic_dist = model.transform(user_features)

In [29]:
user_topic_dist  # User topic distribution

array([[0.01666667, 0.01666667, 0.01666669, 0.76666654, 0.01666667,
        0.01666667, 0.01666667, 0.01666667, 0.01666668, 0.01666669,
        0.01666667, 0.01666668, 0.01666672, 0.01666667, 0.01666667]])

In [30]:
np.cumsum(user_topic_dist)  # Cumalative topic distribution

array([0.01666667, 0.03333334, 0.05000003, 0.81666657, 0.83333323,
       0.8499999 , 0.86666657, 0.88333324, 0.89999991, 0.9166666 ,
       0.93333326, 0.94999994, 0.96666666, 0.98333333, 1.        ])

In [31]:
def sample_topic_index(topic_dist):
    """Given a topic distribution, return a topic index"""
    cumalative = np.cumsum(topic_dist)
    random_sample = np.random.random()
    for i in range(len(cumalative)):
        if random_sample < cumalative[i]:
            return i

In [42]:
def sample_from_topic(topic_index):
    """Given a topic index, sample a game from that topic"""
    topic_game_dist = model.components_[topic_index]
    normalised = topic_game_dist / np.max(topic_game_dist)
    cumalative = np.cumsum(normalised)
    random_sample = np.random.random()
    for i in range(len(cumalative)):
        if random_sample < cumalative[i]:
            return get_game_name(game_ids[i])

In [43]:
def recommend(topic_dist, liked_games, q=5):
    """Recommend games"""
    recommended = []
    while len(recommended) < q:
        recommendation = sample_from_topic(sample_topic_index(topic_dist))
        if recommendation not in recommended + liked_games:
            recommended.append(recommendation)
    return recommended

In [44]:
for r in recommend(user_topic_dist, games_liked):
    print('-', r)

- Arkham Horror
- Empire Builder
- Battlemist
- Time Agent
- En Garde


  if param in SKIP_IN_PATH:
  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  if param in SKIP_IN_PATH:
  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  if param in SKIP_IN_PATH:
  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  if param in SKIP_IN_PATH:
  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  if param in SKIP_IN_PATH:
  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP