<a href="https://colab.research.google.com/github/smallcats/TopicalLanguageModels/blob/master/Neural_TLM_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

import nltk
import re

from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

from matplotlib import pyplot as plt

Using TensorFlow backend.


In [0]:
def multiples(it, n):
  """
  Yield every contiguous length-n slice of a sequence, left to right.

  it must support len() and slicing (e.g. list, tuple, str). Nothing is
  yielded when n is larger than len(it).
  """
  last_start = len(it) - n
  start = 0
  while start <= last_start:
    yield it[start:start + n]
    start += 1

class TopicalLanguageModel:
  """
  Neural Topical Language Model.

  Holds the text-cleaning configuration and exposes next-word prediction
  conditioned on a topic.

  NOTE: fit() is currently a stub. predict()/rollout() read
  self.ngram_probs and get_topics() reads self.topic_model, but neither
  attribute is assigned anywhere in this class, so they must be set before
  those methods are called.
  """

  # Default English stopword list used when no custom list is supplied.
  _DEFAULT_STOPWORDS = frozenset({
      'a', 'about', 'above', 'after', 'again', 'against',
      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
      "aren't", 'as', 'at', 'be', 'because', 'been', 'before',
      'being', 'below', 'between', 'both', 'but', 'by', 'can',
      'couldn', "couldn't", 'd', 'did', 'didn', "didn't",
      'do', 'does', 'doesn', "doesn't", 'doing', 'don',
      "don't", 'down', 'during', 'each', 'few', 'for', 'from',
      'further', 'had', 'hadn', "hadn't", 'has', 'hasn',
      "hasn't", 'have', 'haven', "haven't", 'having', 'he',
      'her', 'here', 'hers', 'herself', 'him', 'himself',
      'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn',
      "isn't", 'it', "it's", 'its', 'itself', 'just', 'll',
      'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most',
      'mustn', "mustn't", 'my', 'myself', 'needn', "needn't",
      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on',
      'once', 'only', 'or', 'other', 'our', 'ours',
      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
      'shan', "shan't", 'she', "she's", 'should', "should've",
      'shouldn', "shouldn't", 'so', 'some', 'such', 't',
      'than', 'that', "that'll", 'the', 'their', 'theirs',
      'them', 'themselves', 'then', 'there', 'these', 'they',
      'this', 'those', 'through', 'to', 'too', 'under',
      'until', 'up', 've', 'very', 'was', 'wasn', "wasn't",
      'we', 'were', 'weren', "weren't", 'what', 'when',
      'where', 'which', 'while', 'who', 'whom', 'why', 'will',
      'with', 'won', "won't", 'wouldn', "wouldn't", 'y',
      'you', "you'd", "you'll", "you're", "you've", 'your',
      'yours', 'yourself', 'yourselves',
  })

  # Tokens made up solely of ASCII letters; compiled once for clean().
  _ALPHA_RE = re.compile(r'[A-Za-z]+')

  _PREDICT_METHODS = ('sample', 'max', 'distribution')

  def __init__(self, num_topics, window=3, filter_stopwords=True,
               stopwords=None, filter_nonalpha=True, lower=True,
               min_doc_length=2):
    """
    num_topics: number of topics (used by get_topics to size its output).
    window: n-gram size; predictions condition on the last window-1 words.
    filter_stopwords: if True, clean() drops words found in stopwords.
    stopwords: collection of words to drop; None selects the built-in
      English list.
    filter_nonalpha: if True, clean() drops tokens that are not purely
      ASCII letters.
    lower: if True, clean() lowercases every word.
    min_doc_length: stored on the instance; not used by any method
      visible in this class.
    """
    self.num_topics = num_topics
    self.window = window
    self.filter_stopwords = filter_stopwords
    self.filter_nonalpha = filter_nonalpha
    self.min_doc_length = min_doc_length
    self.lower = lower
    if stopwords is None:
      # Fresh mutable set per instance so edits never leak between models.
      self.stopwords = set(self._DEFAULT_STOPWORDS)
    else:
      self.stopwords = stopwords

  def clean(self, documents):
    """
    clean(self, documents)

    Return a cleaned copy of documents (a list of documents, each a list
    of words), applying the filters enabled on this instance.

    Lowercasing runs first so that capitalised words are neither rejected
    by the alphabetic filter nor able to slip past the (lowercase)
    stopword list.
    """
    cleaned_docs = list(documents)

    if self.lower:
      cleaned_docs = [[w.lower() for w in d] for d in cleaned_docs]
    if self.filter_nonalpha:
      is_alpha = self._ALPHA_RE.fullmatch
      cleaned_docs = [[w for w in d if is_alpha(w)] for d in cleaned_docs]
    if self.filter_stopwords:
      stop = self.stopwords
      cleaned_docs = [[w for w in d if w not in stop] for d in cleaned_docs]

    return cleaned_docs


  def fit(self, documents, verbose=0):
    """
    fit(self, documents)

    documents should be a list of sentences, and a sentence a list of words.

    Not implemented yet: this method currently does nothing, and verbose
    is ignored.
    """
    pass

  def predict(self, init_doc, topic, method='sample'):
    """
    predict(self, init_doc, topic, method='sample')

    Predict the word following init_doc (a list of words) under topic.

    The context is the last window-1 words of init_doc, left-padded with
    '<start>' tokens and joined by single spaces. It is looked up in
    self.ngram_probs[topic], whose values are read as sequences of
    (word, probability) pairs.

    method:
      'sample'       - draw a word at random according to the probabilities.
      'max'          - return the most probable word.
      'distribution' - return the stored (word, probability) sequence.

    Raises ValueError for any other method.
    """
    if method not in self._PREDICT_METHODS:
      raise ValueError('Unknown method.')

    context_len = self.window - 1
    pad_doc = ['<start>'] * context_len + list(init_doc)
    # Slice from an explicit start index: pad_doc[-context_len:] would
    # return the whole list when context_len == 0 (window == 1).
    key = ' '.join(pad_doc[len(pad_doc) - context_len:])
    distribution = self.ngram_probs[topic][key]

    if method == 'distribution':
      return distribution

    probs = [p for _, p in distribution]
    if method == 'max':
      return distribution[np.argmax(probs)][0]

    words = [w for w, _ in distribution]
    return np.random.choice(words, p=probs)

  def rollout(self, init_doc, topic, method='monte-carlo', maxlen=100):
    """
    rollout(self, init_doc, topic, method='monte-carlo', maxlen=100)

    Extend init_doc one predicted word at a time until '<end>' is
    predicted or the document holds maxlen words, and return the result.

    method: 'monte-carlo' samples each word, 'greedy' takes the most
    probable word. Any other value makes predict() raise ValueError as
    soon as a prediction is attempted.

    The caller's init_doc is left untouched; a new list is returned.
    """
    if method == 'monte-carlo':
      pred_method = 'sample'
    elif method == 'greedy':
      pred_method = 'max'
    else:
      pred_method = ''

    # Work on a copy so the caller's list is not mutated.
    doc = list(init_doc)
    for _ in range(maxlen - len(doc)):
      predicted = self.predict(doc, topic, pred_method)
      if predicted == '<end>':
        break
      doc.append(predicted)

    return doc

  def get_topics(self, doc):
    """
    get_topics(self, doc)

    Return a list of length num_topics with the weight of each topic for
    doc (a list of words); topics missing from the model output get 0.

    doc is cleaned with clean(), converted to a bag of words through
    self.topic_model.id2word.doc2bow, and passed to self.topic_model[...].
    NOTE(review): this looks like a gensim topic model - confirm.
    """
    cleaned = self.clean([doc])[0]
    bow = self.topic_model.id2word.doc2bow(cleaned)
    # Query the topic model once rather than once per topic.
    weights = dict(self.topic_model[bow])
    return [weights.get(k, 0) for k in range(self.num_topics)]