References: https://github.com/cemoody/lda2vec

In [11]:
import json
import pandas as pd
from collections import Counter, defaultdict
import random
import gensim
import nltk
import spacy
spacy.load('en')
from spacy.lang.en import English
nltk.download('wordnet')
from gensim import corpora
import pickle

[nltk_data] Downloading package wordnet to /home/dong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Helpers

In [2]:
# Shared spaCy English pipeline object; tokenize() calls it on each text.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The input is coerced with ``str`` and passed through the module-level
    ``parser``. Whitespace-only tokens are dropped, URL-like tokens are
    replaced by ``'URL'``, tokens starting with ``'@'`` are replaced by
    ``'SCREEN_NAME'``, and all other tokens are kept in lower case.

    Returns:
        A list of token strings in document order.
    """
    def normalise(token):
        # The URL check comes first so it takes precedence over the '@' check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalise(token)
        for token in parser(str(text))
        if not token.orth_.isspace()
    ]

def find_topics(topic_scores):
    """Return the topics whose score is at least 0.1.

    Args:
        topic_scores: iterable of ``(topic, score)`` pairs.

    Returns:
        A list of the qualifying topics, in the order they were given.
    """
    return [topic for topic, score in topic_scores if score >= 0.1]

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, or ``word`` itself when that is ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# Fetch the NLTK stopword corpus before it is read on the next line.
nltk.download('stopwords')
# English stopwords as a set, used for membership tests in
# prepare_text_for_lda().
en_stop = set(nltk.corpus.stopwords.words('english'))


def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the LDA model.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are discarded, and each remaining
    token is mapped through ``get_lemma``.

    Returns:
        A list of lemmatized token strings in document order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

[nltk_data] Downloading package stopwords to /home/dong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Training model

In [4]:
# Build the LDA training corpus from a seeded ~1% random sample of the
# book summaries, then persist the bag-of-words corpus and the dictionary.
random.seed(333)
text_data = []
text = pd.read_csv('../data_raw/books_cleaned.csv')['summary']
for line in text:
    # Only string summaries are usable (presumably this skips missing
    # values, which pandas reads as NaN — confirm against the CSV).
    if not isinstance(line, str):
        continue
    # Draw before tokenizing. There is still exactly one draw per string
    # summary, so the same summaries are selected as when the draw came
    # after tokenization (prepare_text_for_lda itself makes no calls to
    # `random`), but the ~99% that are rejected are no longer tokenized.
    if random.random() > .99:
        text_data.append(prepare_text_for_lda(line))
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(tokens) for tokens in text_data]
# Use a context manager so the pickle file is flushed and closed.
with open('../data_raw/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('../data_raw/dictionary.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
# Train the LDA topic model on the sampled corpus and save it under a
# file name that records the number of topics.
NUM_TOPICS = 15
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    # The corpus sample is seeded with random.seed(333), but that does not
    # seed the model. Fix the model's seed as well so topic ids are stable
    # across re-runs and stay consistent with the exported topic JSON files.
    random_state=333,
)
ldamodel.save(f'../models/model{NUM_TOPICS}.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
# Inspect the trained topics, showing five words per topic.
ldamodel.print_topics(num_words=5)

[(0,
  '0.012*"odette" + 0.008*"caesar" + 0.007*"series" + 0.006*"gordianus" + 0.006*"valdivia"'),
 (1,
  '0.009*"mother" + 0.007*"family" + 0.007*"world" + 0.005*"lewis" + 0.005*"memoir"'),
 (2,
  '0.021*"american" + 0.013*"religion" + 0.009*"meacham" + 0.009*"author" + 0.008*"founding"'),
 (3,
  '0.007*"woman" + 0.007*"young" + 0.007*"victoria" + 0.006*"mother" + 0.006*"daughter"'),
 (4,
  '0.007*"seven" + 0.007*"know" + 0.007*"history" + 0.006*"culture" + 0.006*"rufus"'),
 (5,
  '0.010*"scott" + 0.005*"history" + 0.005*"journey" + 0.005*"young" + 0.005*"world"'),
 (6,
  '0.009*"story" + 0.007*"woman" + 0.006*"years" + 0.006*"father" + 0.006*"family"'),
 (7,
  '0.010*"woman" + 0.009*"world" + 0.008*"story" + 0.006*"mehrunnisa" + 0.006*"power"'),
 (8,
  '0.009*"justin" + 0.009*"years" + 0.008*"nathaniel" + 0.008*"story" + 0.007*"series"'),
 (9,
  '0.009*"adelia" + 0.008*"virgin" + 0.008*"business" + 0.007*"death" + 0.007*"richard"'),
 (10,
  '0.006*"first" + 0.006*"family" + 0.005*"fi

# Extract topics from the books

In [8]:
# Score every book summary with the trained model:
#   summary -> cleaned tokens -> bag of words -> (topic, score) pairs
#   -> topics that pass the find_topics() threshold.
books = pd.read_csv('../data_raw/books_cleaned.csv')
books['summary_new'] = books['summary'].apply(prepare_text_for_lda)
books['summary_bow'] = books['summary_new'].apply(dictionary.doc2bow)
books['topic_score'] = books['summary_bow'].apply(ldamodel.get_document_topics)
books['topics'] = books['topic_score'].apply(find_topics)
# Flatten the per-book topic lists into one list of topic ids.
topics = [
    topic_id
    for assigned_topics in books['topics']
    for topic_id in assigned_topics
]

In [9]:
# Count how many times each topic id occurs in the flattened `topics` list.
print(Counter(topics))

Counter({6: 10012, 10: 6297, 11: 5688, 7: 4813, 1: 4742, 14: 4553, 3: 3899, 9: 2754, 2: 2395, 4: 1842, 12: 1210, 0: 581, 8: 464, 5: 378, 13: 227})


In [12]:
# Export two lookups as JSON:
#   book_idx_topics.json : book index -> list of topics assigned to the book
#   topic_book_idx.json  : topic      -> list of book indices with that topic
# Integer keys are written as strings by json.dump.
book_idx_topics = books['topics'].to_dict()
# Context managers make sure each file is flushed and closed.
with open('../data/book_idx_topics.json', 'w') as out_file:
    json.dump(book_idx_topics, out_file)

topic_book_idx = defaultdict(list)
# The loop variable is deliberately not called `topics`, so the flattened
# global `topics` list built in an earlier cell is not overwritten.
for book_idx, book_topics in book_idx_topics.items():
    for topic in book_topics:
        topic_book_idx[topic].append(book_idx)
with open('../data/topic_book_idx.json', 'w') as out_file:
    json.dump(topic_book_idx, out_file)