In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
import re, logging, warnings
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
warnings.filterwarnings("ignore",category=DeprecationWarning)


import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [2]:
text = pd.read_csv("../data/show_text_combined.csv")

In [3]:
text.head()

Unnamed: 0,text,show_names
0,Can it be the breeze that fills the trees Wit...,'Allo 'Allo!
1,I'd like to marry a girl that's skinny. I thin...,'Til Death
2,It's that dream again. Just who is that guy? W...,07 Ghost
3,1 You all might have top-class credentials fro...,"1,000 Places To See Before You Die"
4,1 Welcome to 10 O'Clock Live. It's Wednesday ...,10 O'Clock Live


### Test LDA Run with subset of data
borrowed liberally from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
___

In [4]:
test_df = text.sample(frac=.005, random_state=412)

In [5]:
test_df.head()

Unnamed: 0,text,show_names
2735,[SWITCHBOARD BUZZES.] Paul Drake Detective Age...,Perry Mason
3592,"- Hi, is this Lina Warbler? - Yes. This is Eth...",The Class
2622,"1 We're setting off on a ten-week journey, cyc...",On Hannibal's Trail
4408,[Ringing servant's entrance bell.] Well? Mrs...,"Upstairs, Downstairs"
1176,"NARRATOR: In a few minutes, this woman will be...",Ellery Queen


In [6]:
test_df.shape

(23, 2)

#### Load in Stop Words from SpaCy and nltk

In [7]:
#comparing stop words from spacy and nltk
nlp = spacy.load("en")

In [8]:
spacy_stop_words = nlp.Defaults.stop_words

In [9]:
len(spacy_stop_words)

326

In [10]:
nltk_stop_words = stopwords.words("english")

In [11]:
len(nltk_stop_words)

179

In [12]:
diff_words = []
for word in spacy_stop_words:
    if word not in nltk_stop_words:
        diff_words.append(word)
diff_words

['move',
 'name',
 'towards',
 'else',
 'unless',
 'sometimes',
 'regarding',
 'therefore',
 "'ve",
 'meanwhile',
 'besides',
 'beforehand',
 'nothing',
 'give',
 'often',
 'seem',
 'across',
 'nobody',
 'part',
 'within',
 'hereafter',
 'serious',
 'toward',
 'take',
 '’ve',
 'everything',
 'amount',
 'none',
 'seems',
 'something',
 'since',
 'side',
 "'re",
 'rather',
 '‘ll',
 '’m',
 'per',
 'anywhere',
 'anything',
 'almost',
 'among',
 'seemed',
 'however',
 'fifty',
 'whereafter',
 'whoever',
 'please',
 'indeed',
 'mostly',
 'whatever',
 'hereupon',
 'seeming',
 '’d',
 'wherever',
 'beyond',
 'around',
 'formerly',
 'really',
 'even',
 'may',
 'one',
 'whether',
 'thru',
 'whereas',
 'hence',
 'somehow',
 'perhaps',
 'former',
 'front',
 'along',
 'without',
 'therein',
 'nevertheless',
 '’re',
 'noone',
 'nowhere',
 'everywhere',
 '’ll',
 'whence',
 'cannot',
 'us',
 'though',
 'someone',
 'upon',
 'less',
 'thereafter',
 'could',
 'everyone',
 'twelve',
 'used',
 'behind',
 'n

there are more words in the spacy stop words list will try that one first then will try the nltk list
#### And now time to tokenize the text
setting deacc=True in order to remove punctuation

In [13]:
def tokenizer(text):
    for word in text:
        yield(gensim.utils.simple_preprocess(str(word), deacc=True))

test_df["text_tokenized"] = list(tokenizer(test_df["text"]))
        

In [14]:
test_df.head()

Unnamed: 0,text,show_names,text_tokenized
2735,[SWITCHBOARD BUZZES.] Paul Drake Detective Age...,Perry Mason,"[switchboard, buzzes, paul, drake, detective, ..."
3592,"- Hi, is this Lina Warbler? - Yes. This is Eth...",The Class,"[hi, is, this, lina, warbler, yes, this, is, e..."
2622,"1 We're setting off on a ten-week journey, cyc...",On Hannibal's Trail,"[we, re, setting, off, on, ten, week, journey,..."
4408,[Ringing servant's entrance bell.] Well? Mrs...,"Upstairs, Downstairs","[ringing, servant, entrance, bell, well, mrs, ..."
1176,"NARRATOR: In a few minutes, this woman will be...",Ellery Queen,"[narrator, in, few, minutes, this, woman, will..."


#### Create bigrams and trigram models

In [15]:
bigram = gensim.models.Phrases(test_df["text_tokenized"], min_count=10, threshold=500)
#trigrams are madde by applying the same method to the bigram output that made the bigrams from the te
trigram = gensim.models.Phrases(bigram[test_df["text_tokenized"]], threshold=500)

bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)


#### define functions to remove stopwords, make bigrams, trigrams, then lemmatize

In [16]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in spacy_stop_words] for doc in text]

def make_bigrams(text):
    return [bigram_model[doc] for doc in text]

def make_trigrams(text):
    return [trigram_model[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text_out = []
    for sent in text:
        doc = nlp(" ".join(sent)) 
        text_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return text_out


In [17]:
text_no_stopwords = remove_stopwords(test_df["text_tokenized"])
#test_df["text_no_stopwords"] = remove_stopwords(test_df["text_tokenized"])

In [18]:
text_bigrams = make_bigrams(text_no_stopwords)
#test_df["text_bigrams"] = make_bigrams(test_df["text_no_stopwords"])

In [19]:
nlp = spacy.load('en')
nlp.max_length = 2_000_000

In [20]:
test_df["text_lemmatized"] = lemmatization(text_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
#options for lemmatization , allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

In [21]:
def total_words(col):
    word_count = 0
    for row in col:
        word_count += len(row)
    return word_count

In [22]:
total_words(text_bigrams)

1211705

In [23]:
test_df.head()

Unnamed: 0,text,show_names,text_tokenized,text_lemmatized
2735,[SWITCHBOARD BUZZES.] Paul Drake Detective Age...,Perry Mason,"[switchboard, buzzes, paul, drake, detective, ...","[buzz, want, night, number, sure, reach, wait,..."
3592,"- Hi, is this Lina Warbler? - Yes. This is Eth...",The Class,"[hi, is, this, lina, warbler, yes, this, is, e...","[remember, brown, hair, kind, funny, sort, mid..."
2622,"1 We're setting off on a ten-week journey, cyc...",On Hannibal's Trail,"[we, re, setting, off, on, ten, week, journey,...","[set, week, trail, great, carthaginian, warrio..."
4408,[Ringing servant's entrance bell.] Well? Mrs...,"Upstairs, Downstairs","[ringing, servant, entrance, bell, well, mrs, ...","[ring, servant, send, come, position, parlorma..."
1176,"NARRATOR: In a few minutes, this woman will be...",Ellery Queen,"[narrator, in, few, minutes, this, woman, will...","[woman, dead, question, kill, philander, finan..."


In [24]:
#dictionary
id2word = corpora.Dictionary(test_df["text_lemmatized"])

#the lemmatized text
texts = test_df["text_lemmatized"]

#term doc frequency (corpus)
corpus = [id2word.doc2bow(text) for text in texts]

In [36]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('-PRON-', 3),
  ('aahhh', 1),
  ('ab', 3),
  ('abandon', 10),
  ('abbey', 4),
  ('aberration', 1),
  ('abet', 2),
  ('abeyance', 1),
  ('abhor', 1),
  ('ability', 13),
  ('able', 136),
  ('abnormal', 2),
  ('abnormality', 2),
  ('aboard', 1),
  ('abouthiscar', 1),
  ('abrasion', 2),
  ('abruptly', 2),
  ('abscond', 1),
  ('absence', 4),
  ('absentminde', 1),
  ('absolute', 14),
  ('absolutely', 106),
  ('absurd', 7),
  ('abundantly', 1),
  ('abuse', 2),
  ('abusive', 4),
  ('academic', 1),
  ('accelerate', 1),
  ('acceleration', 1),
  ('accelerator', 1),
  ('accent', 15),
  ('accept', 44),
  ('acceptable', 1),
  ('access', 12),
  ('accessible', 1),
  ('accessory', 9),
  ('accident', 132),
  ('accidental', 8),
  ('accidentally', 13),
  ('accommodate', 6),
  ('accommodating', 1),
  ('accommodation', 2),
  ('accompany', 7),
  ('accomplice', 15),
  ('accomplish', 7),
  ('accomplished', 2),
  ('accomplishment', 1),
  ('accord', 76),
  ('accordance', 1),
  ('accordingly', 5),
  ('accost',

In [37]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=412,
                                           update_every=1,
                                           chunksize=5,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [38]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"know" + 0.020*"go" + 0.017*"get" + 0.013*"want" + 0.012*"come" + '
  '0.012*"think" + 0.011*"look" + 0.011*"right" + 0.010*"good" + 0.009*"tell"'),
 (1,
  '0.014*"soul" + 0.008*"time" + 0.008*"let" + 0.008*"come" + 0.008*"way" + '
  '0.008*"go" + 0.008*"right" + 0.007*"want" + 0.006*"know" + 0.006*"think"'),
 (2,
  '0.015*"know" + 0.012*"tell" + 0.011*"go" + 0.010*"come" + 0.010*"think" + '
  '0.009*"get" + 0.009*"want" + 0.009*"find" + 0.009*"right" + 0.008*"thank"'),
 (3,
  '0.000*"know" + 0.000*"go" + 0.000*"get" + 0.000*"right" + 0.000*"come" + '
  '0.000*"want" + 0.000*"tell" + 0.000*"time" + 0.000*"look" + 0.000*"say"'),
 (4,
  '0.000*"know" + 0.000*"go" + 0.000*"think" + 0.000*"get" + 0.000*"tell" + '
  '0.000*"come" + 0.000*"say" + 0.000*"right" + 0.000*"time" + 0.000*"want"')]


In [39]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -7.3622517770415365


In [41]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=test_df["text_lemmatized"], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.25962260241889445
