## Data source

### Imports

In [18]:
# pandas and numpy
import pandas as pd
import numpy as np

# nlp imports
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# pd.set_option(display.max_columns), None

# other imports
import json
import lxml
import random
import regex as re
import time
import urllib.request
from datetime import datetime
from pprint import pprint

np.random.seed(2020)

# Data acquisition

In [19]:
isbn = pd.read_csv('../data/processed/canadian_processed.csv')
isbn = isbn.applymap(str)
isbn.dtypes

isbn           object
title          object
author         object
Unnamed: 3     object
description    object
dtype: object

In [20]:
isbn

Unnamed: 0,isbn,title,author,Unnamed: 3,description
0,9780773524927,Two Solitudes,Hugh McLennan,,"“[A] powerful saga, [Two Solitudes is the stor..."
1,9781552453056,Fifteen Dogs,Andre Alexis,,“A bet between the gods Hermes and Apollo lead...
2,9780771030130,Bear,Marian Engel,,“A librarian is called to a remote Canadian is...
3,9781554685257,"Green Grass, Running Water",Thomas King,,“Alberta is a university professor who would l...
4,9780771055706,No Great Mischief,Alistair MacLeod,,"“Alexander, orphaned as a child by a horrific ..."
5,9780312054366,Generation X,Douglas Coupland,,"“Andy, Dag and Claire have been handed a socie..."
6,9780676977738,The Birth House,Ami McKay,,“As a child in an isolated village in Nova Sco...
7,9780062468475,Lullabies for Little Criminals,Heather O'Neill,,"“At thirteen, Baby vacillates between childhoo..."
8,9780006393108,Lost Girls,Andrew Pyper,,“Attorney Bartholomew Crane doesn’t belong in ...
9,9781443451352,Birdie,Tracey Lindberg,,"“Bernice Meetoos, a Cree woman, leaves her hom..."


In [21]:
# Susan Li 
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

processed_descriptions = isbn['description'].map(preprocess)
processed_descriptions

0      [power, saga, solitud, stori, athanas, tallard...
1      [god, herm, apollo, lead, grant, human, consci...
2      [librarian, call, remot, canadian, island, inv...
3      [alberta, univers, professor, like, trade, boy...
4      [alexand, orphan, child, horrif, tragedi, gain...
5      [andi, clair, hand, societi, price, mean, brin...
6      [child, isol, villag, nova, scotia, dora, rare...
7      [thirteen, babi, vacil, childhood, comfort, ad...
8      [attorney, bartholomew, crane, belong, small, ...
9      [bernic, meetoo, cree, woman, leav, home, nort...
10     [bear, daisi, stone, goodwil, drift, role, chi...
11     [case, sharpest, data, thief, busi, veng, empl...
12     [chief, inspector, armand, gamach, sûreté, qué...
13     [favorit, redhead, spunki, ann, shirley, begin...
14     [fill, remark, cast, charact, loser, cheater, ...
15     [mile, north, vancouv, kitamaat, indian, reser...
16     [gener, fish, plenti, disappear, inhabit, remo...
17     [grow, suburban, hell, m

In [22]:
dictionary = gensim.corpora.Dictionary(processed_descriptions)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 aristocrat
1 athanas
2 balanc
3 beauti
4 call
5 canadian
6 countri
7 establish
8 french
9 home
10 irish


In [23]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [24]:
bow_corpus = [dictionary.doc2bow(description) for description in processed_descriptions]
bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1)],
 [(1, 1), (10, 1), (11, 2)],
 [(12, 1), (13, 1), (14, 1), (15, 1)],
 [(4, 1), (16, 1), (17, 1), (18, 2), (19, 1)],
 [(20, 1), (21, 1), (22, 2), (23, 1)],
 [(16, 1), (19, 1), (24, 1), (25, 1), (26, 1), (27, 1)],
 [(0, 1),
  (4, 1),
  (21, 1),
  (25, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1)],
 [(12, 1), (15, 2), (30, 1)],
 [(2, 1), (12, 1), (34, 1), (35, 1), (36, 1), (37, 1)],
 [(4, 1),
  (6, 1),
  (10, 1),
  (16, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1)],
 [(3, 1), (43, 1)],
 [(1, 1), (19, 1), (44, 1), (45, 1), (46, 1)],
 [(30, 1), (47, 1), (48, 2), (49, 1)],
 [(0, 1), (38, 1), (42, 1), (50, 1), (51, 1)],
 [(19, 1), (52, 1), (53, 1)],
 [(22, 2), (25, 1), (36, 1), (54, 1), (55, 1)],
 [(13, 1), (14, 1), (30, 1), (31, 1), (39, 1), (52, 2), (56, 1), (57, 3)],
 [(14, 1), (58, 1), (59, 1)],
 [(21, 1), (37, 1), (51, 1)],
 [(8, 1), (18, 1), (27, 1), (48, 1), (60, 1)

In [25]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break


[(0, 0.3307237473794839),
 (1, 0.4164322024456737),
 (2, 0.4164322024456737),
 (3, 0.3485435188148724),
 (4, 0.20381141750281898),
 (5, 0.425898109510867),
 (6, 0.44651916683886056)]


In [26]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.067*"love" + 0.064*"famili" + 0.049*"year" + 0.048*"father" + 0.031*"stori" + 0.028*"live" + 0.028*"girl" + 0.026*"work" + 0.024*"time" + 0.024*"take"
Topic: 1 
Words: 0.069*"book" + 0.048*"stori" + 0.039*"life" + 0.035*"world" + 0.034*"human" + 0.034*"home" + 0.030*"time" + 0.029*"peopl" + 0.027*"cultur" + 0.026*"reader"
Topic: 2 
Words: 0.085*"novel" + 0.060*"work" + 0.036*"publish" + 0.033*"world" + 0.030*"author" + 0.028*"year" + 0.028*"write" + 0.028*"literatur" + 0.027*"consid" + 0.027*"stori"
Topic: 3 
Words: 0.087*"time" + 0.045*"world" + 0.041*"school" + 0.037*"year" + 0.036*"human" + 0.032*"young" + 0.028*"citi" + 0.028*"life" + 0.028*"long" + 0.026*"come"
Topic: 4 
Words: 0.061*"book" + 0.048*"best" + 0.046*"novel" + 0.045*"girl" + 0.039*"time" + 0.037*"death" + 0.035*"tale" + 0.032*"woman" + 0.030*"mother" + 0.027*"town"
Topic: 5 
Words: 0.115*"novel" + 0.045*"charact" + 0.041*"great" + 0.036*"american" + 0.029*"book" + 0.027*"famili" + 0.025*"publish" + 

In [27]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.056*"american" + 0.049*"power" + 0.044*"write" + 0.040*"mysteri" + 0.032*"publish" + 0.032*"adventur" + 0.029*"classic" + 0.029*"citi" + 0.027*"like" + 0.026*"centuri"
Topic: 1 Word: 0.034*"father" + 0.030*"child" + 0.028*"struggl" + 0.028*"discov" + 0.025*"heart" + 0.024*"leav" + 0.024*"girl" + 0.022*"novel" + 0.022*"york" + 0.022*"lead"
Topic: 2 Word: 0.063*"peopl" + 0.040*"hous" + 0.039*"famili" + 0.038*"chang" + 0.031*"lead" + 0.029*"time" + 0.028*"world" + 0.028*"stori" + 0.028*"love" + 0.026*"human"
Topic: 3 Word: 0.037*"mysteri" + 0.031*"second" + 0.030*"husband" + 0.027*"come" + 0.027*"friend" + 0.027*"secret" + 0.026*"writer" + 0.024*"modern" + 0.024*"town" + 0.023*"love"
Topic: 4 Word: 0.051*"societi" + 0.044*"year" + 0.042*"bring" + 0.033*"human" + 0.032*"world" + 0.032*"fall" + 0.031*"famili" + 0.030*"women" + 0.028*"child" + 0.026*"dream"
Topic: 5 Word: 0.061*"wife" + 0.052*"grow" + 0.052*"mother" + 0.043*"secret" + 0.031*"leav" + 0.029*"death" + 0.027*"fa

In [28]:
processed_descriptions[40]

['thien',
 'take',
 'insid',
 'extend',
 'famili',
 'china',
 'show',
 'live',
 'success',
 'gener',
 'live',
 'cultur',
 'revolut',
 'twentieth',
 'centuri',
 'children',
 'survivor',
 'student',
 'protest',
 'tiananmen',
 'squar',
 'import',
 'polit',
 'moment',
 'past',
 'centuri']

## Evaluation

In [30]:
for index, score in sorted(lda_model[bow_corpus[40]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.899978518486023	 
Topic: 0.074*"novel" + 0.046*"stori" + 0.042*"young" + 0.039*"edit" + 0.036*"centuri" + 0.036*"write" + 0.035*"power" + 0.033*"publish" + 0.030*"live" + 0.028*"famili"

Score: 0.01111567486077547	 
Topic: 0.067*"love" + 0.064*"famili" + 0.049*"year" + 0.048*"father" + 0.031*"stori" + 0.028*"live" + 0.028*"girl" + 0.026*"work" + 0.024*"time" + 0.024*"take"

Score: 0.01111426018178463	 
Topic: 0.057*"novel" + 0.052*"world" + 0.049*"bear" + 0.029*"know" + 0.028*"stori" + 0.026*"leav" + 0.026*"grow" + 0.025*"like" + 0.024*"school" + 0.024*"beauti"

Score: 0.011114113964140415	 
Topic: 0.069*"book" + 0.048*"stori" + 0.039*"life" + 0.035*"world" + 0.034*"human" + 0.034*"home" + 0.030*"time" + 0.029*"peopl" + 0.027*"cultur" + 0.026*"reader"

Score: 0.011113728396594524	 
Topic: 0.086*"life" + 0.056*"love" + 0.044*"stori" + 0.041*"live" + 0.036*"novel" + 0.036*"year" + 0.035*"tell" + 0.028*"husband" + 0.028*"young" + 0.023*"hous"

Score: 0.011113516986370087	 
Topic

In [32]:
for index, score in sorted(lda_model_tfidf[bow_corpus[40]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8999820351600647	 
Topic: 0.032*"book" + 0.032*"novel" + 0.029*"gener" + 0.029*"great" + 0.027*"english" + 0.025*"tale" + 0.024*"life" + 0.022*"charact" + 0.021*"centuri" + 0.021*"bring"

Score: 0.011114891618490219	 
Topic: 0.037*"young" + 0.032*"time" + 0.031*"love" + 0.026*"woman" + 0.026*"beauti" + 0.023*"life" + 0.022*"live" + 0.022*"famili" + 0.022*"year" + 0.021*"tell"

Score: 0.011113516986370087	 
Topic: 0.051*"societi" + 0.044*"year" + 0.042*"bring" + 0.033*"human" + 0.032*"world" + 0.032*"fall" + 0.031*"famili" + 0.030*"women" + 0.028*"child" + 0.026*"dream"

Score: 0.01111337635666132	 
Topic: 0.061*"wife" + 0.052*"grow" + 0.052*"mother" + 0.043*"secret" + 0.031*"leav" + 0.029*"death" + 0.027*"famili" + 0.026*"stori" + 0.024*"girl" + 0.024*"cultur"

Score: 0.011113290674984455	 
Topic: 0.063*"peopl" + 0.040*"hous" + 0.039*"famili" + 0.038*"chang" + 0.031*"lead" + 0.029*"time" + 0.028*"world" + 0.028*"stori" + 0.028*"love" + 0.026*"human"

Score: 0.0111130932345986

In [33]:
# Test on unseen document
# Description of Gone with the Wind by Margaret Mitchell

unseen_document = 'Set against the dramatic backdrop of the American Civil War, Margaret Mitchells magnificent historical epic is an unforgettable tale of love and loss, of a nation mortally divided and a people forever changed. Above all, it is the story of beautiful, ruthless Scarlett OHara and the dashing soldier of fortune, Rhett Butler.'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8874626159667969	 Topic: 0.115*"novel" + 0.045*"charact" + 0.041*"great" + 0.036*"american" + 0.029*"book"
Score: 0.012505976483225822	 Topic: 0.069*"book" + 0.048*"stori" + 0.039*"life" + 0.035*"world" + 0.034*"human"
Score: 0.012505325488746166	 Topic: 0.086*"life" + 0.056*"love" + 0.044*"stori" + 0.041*"live" + 0.036*"novel"
Score: 0.01250483188778162	 Topic: 0.057*"novel" + 0.052*"world" + 0.049*"bear" + 0.029*"know" + 0.028*"stori"
Score: 0.012504382990300655	 Topic: 0.067*"love" + 0.064*"famili" + 0.049*"year" + 0.048*"father" + 0.031*"stori"
Score: 0.012504254467785358	 Topic: 0.061*"book" + 0.048*"best" + 0.046*"novel" + 0.045*"girl" + 0.039*"time"
Score: 0.012503734789788723	 Topic: 0.074*"novel" + 0.046*"stori" + 0.042*"young" + 0.039*"edit" + 0.036*"centuri"
Score: 0.012502993457019329	 Topic: 0.087*"time" + 0.045*"world" + 0.041*"school" + 0.037*"year" + 0.036*"human"
Score: 0.012502990663051605	 Topic: 0.085*"novel" + 0.046*"world" + 0.041*"women" + 0.038*"life" +

In [37]:
# https://www.nltk.org/howto/stem.html
# it says the english snowball stemmer is better than the original porter stemmer; try it!

(271, 9)