In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
import re, logging, warnings
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
warnings.filterwarnings("ignore",category=DeprecationWarning)

import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess, lemmatize
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [2]:
# Load the combined show-text table from the project's data directory.
text = pd.read_csv("../data/show_text_combined.csv")

In [None]:
text.head()

### Test LDA Run with subset of data
Approach borrowed liberally from the Machine Learning Plus tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
___

In [3]:
# Draw a 5% random sample of rows for a quick test run; the fixed random_state makes the draw repeatable.
test_df = text.sample(frac=.05, random_state=231112334)

In [4]:
test_df.head()

Unnamed: 0,text,show_names
4505,1 [siren wailing in distance.] [helicopter wh...,Westside
1346,[collective chatter.] - This is my family An...,Freakshow
293,Miss Sara! Come back here! Slow down! Buying ...,Avonlea
4307,1 - [WATER DRIPPING.] - [BALL THUMPING.] [OM...,Treadstone
4372,1 Here. It's water. I was gonna make tea...,Unbelievable


In [5]:
test_df.shape

(233, 2)

In [60]:
# Length of the longest entry in the full "text" column.
max(len(show) for show in text["text"])

15763371

In [20]:
# Length of every sampled show's text, in the same order as test_df.
show_chars = [len(show) for show in test_df["text"]]


In [21]:
# Lengths of the sampled shows whose text exceeds five million.
[n_chars for n_chars in show_chars if n_chars > 5_000_000]

[6643592,
 5475084,
 6388504,
 7120597,
 6432342,
 5056114,
 6654870,
 5570705,
 6140003,
 10475318,
 5375618,
 5995790,
 9091967,
 6144160,
 14341838,
 5774170,
 13358491,
 8246479,
 5079156,
 7283785,
 8661389,
 7131747]

#### Load in Stop Words from SpaCy and nltk

In [5]:
#comparing stop words from spacy and nltk
# Load the spaCy English pipeline; `nlp` is reused later by lemmatization().
nlp = spacy.load("en")

# Stop words shipped with spaCy's language defaults.
spacy_stop_words = nlp.Defaults.stop_words

# NLTK's English stop-word list.
nltk_stop_words = stopwords.words("english")

In [6]:
# Read the Stanford stop-word file; the context manager closes the handle
# even if reading fails (the original left the file open).
with open("../data/stanford_stopwords.txt", "r") as stanford_file:
    stanford_stopwords = stanford_file.read()

# Treat newlines as separators too, so the file splits into individual entries.
stanford_stopwords = stanford_stopwords.replace("\n", ",").split(",")

stanford_stopwords = set(stanford_stopwords)

# Start the combined stop-word set from the spaCy and Stanford lists.
stop_words = spacy_stop_words.union(stanford_stopwords)

In [7]:
len(stop_words)

457

In [8]:
stop_words = stop_words.union(nltk_stop_words)

In [9]:
len(stop_words)

489

In [10]:
# Read the Ranks NL stop-word file; the context manager closes the handle
# even if reading fails (the original left the file open).
with open("../data/ranks_nl_stopwords.txt", "r") as f:
    ranks_nl_stopwords = f.read()

# Newlines and tabs both act as separators; the first entry is discarded
# (presumably a header line — verify against the file).
ranks_nl_stopwords = ranks_nl_stopwords.replace("\n", ",").replace("\t", ",").split(",")[1:]

ranks_nl_stopwords = set(ranks_nl_stopwords)

stop_words = stop_words.union(ranks_nl_stopwords)

In [11]:
# Read the comma-separated custom stop-word file; the context manager closes
# the handle even if reading fails (the original left the file open).
with open("../data/custom_stopwords.txt", "r") as f:
    custom_stop_words = f.read()

custom_stop_words = set(custom_stop_words.split(","))

stop_words = stop_words.union(custom_stop_words)

In [12]:
len(stop_words)

1821

In [23]:
#diff_words = []
#for word in spacy_stop_words:
#    if word not in nltk_stop_words:
#        diff_words.append(word)
#diff_words

There are more words in the spaCy stop-word list, so I will try that one first and then try the NLTK list.
#### And now time to tokenize the text
Setting `deacc=True` strips accent marks from the tokens; `simple_preprocess` already drops punctuation when it tokenizes.

In [13]:
def tokenizer(text):
    """Lazily tokenize every item of ``text``.

    Each item is cast to ``str`` and run through
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per item, in input order.
    """
    for document in text:
        tokens = gensim.utils.simple_preprocess(str(document), deacc=True)
        yield tokens

# Materialize the tokenizer generator into a list and store one token list per row.
test_df["text_tokenized"] = list(tokenizer(test_df["text"]))
        

In [56]:
test_df.head()

Unnamed: 0,text,show_names,text_tokenized
4372,1 Here. It's water. I was gonna make tea...,Unbelievable,"[here, it, water, was, gonna, make, tea, but, ..."
1183,"It was the empire on which the sun never set, ...",Empire,"[it, was, the, empire, on, which, the, sun, ne..."
3933,"Oh, honey, are you gorgeous! You look just lik...",The Nanny,"[oh, honey, are, you, gorgeous, you, look, jus..."
354,1 [Lively jazz music plays.] Stephen King's...,Bag of Bones,"[lively, jazz, music, plays, stephen, king, ba..."
764,"1 Watch out, Tomoya-kun! Are you all right, T...",Clannad: After Story,"[watch, out, tomoya, kun, are, you, all, right..."


#### Create bigrams and trigram models

In [77]:
# Learn phrase (bigram) statistics from the tokenized documents.
bigram = gensim.models.Phrases(test_df["text_tokenized"], min_count=10, threshold=500)
# Trigrams are made by applying the same method to the bigram output that made the bigrams from the text.
#trigram = gensim.models.Phrases(bigram[test_df["text_tokenized"]], threshold=500)

# Wrap the trained Phrases object in a Phraser; make_bigrams() applies this object to each document.
bigram_model = gensim.models.phrases.Phraser(bigram)
#trigram_model = gensim.models.phrases.Phraser(trigram)


#### define functions to remove stopwords, make bigrams, trigrams, then lemmatize

In [14]:
def remove_stopwords(text):
    """Return one stop-word-free token list per document in ``text``.

    Every document is cast to ``str`` and tokenized with ``simple_preprocess``;
    tokens present in the notebook-level ``stop_words`` collection are dropped.
    """
    # NOTE(review): when a document is already a token list, ``str(doc)`` is the
    # list's repr, which then gets tokenized a second time — confirm this is intended.
    cleaned = []
    for doc in text:
        kept = [token for token in simple_preprocess(str(doc)) if token not in stop_words]
        cleaned.append(kept)
    return cleaned

def make_bigrams(text):
    """Apply the notebook-level ``bigram_model`` to each document in ``text``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for doc in text:
        merged.append(bigram_model[doc])
    return merged

def make_trigrams(text):
    """Apply the bigram model and then the trigram model to each document.

    Returns a list with one transformed document per input document.

    Requires the notebook-level ``trigram_model`` to exist; the cell that
    builds it is currently commented out, so calling this raises ``NameError``
    until that cell is enabled.
    """
    # The bigram phraser is named ``bigram_model``; the original referenced an
    # undefined ``bigram_mod``.
    return [trigram_model[bigram_model[doc]] for doc in text]

In [79]:
def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each tokenized document, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    notebook-level spaCy pipeline ``nlp`` with the "ner" and "parser"
    components disabled. The lemma of every token whose ``pos_`` is in
    ``allowed_postags`` is kept.

    Returns a list with one list of lemmas per input document.
    """
    def _lemmas(tokens):
        # One spaCy pass per document.
        parsed = nlp(" ".join(tokens), disable=["ner", "parser"])
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(sent) for sent in text]

In [15]:
# Filter stop words out of every tokenized document and store the result as a new column.
test_df["text_no_stopwords"] = remove_stopwords(test_df["text_tokenized"])
#test_df["text_no_stopwords"] = remove_stopwords(test_df["text_tokenized"])

In [81]:
# Read the stop-word-filtered documents from the DataFrame column: no standalone
# `text_no_stopwords` variable is defined anywhere in the visible notebook, so the
# original call only worked with leftover kernel state.
text_bigrams = make_bigrams(test_df["text_no_stopwords"])
#test_df["text_bigrams"] = make_bigrams(test_df["text_no_stopwords"])

In [16]:
# Set the spaCy pipeline's max_length to 2,000,000 (presumably so longer show texts are accepted — confirm against the longest joined document).
nlp.max_length = 2_000_000

In [37]:
def multithread_lemmatizing(text, threads=3):
    """Lemmatize the documents in ``text`` in parallel with a worker pool.

    Parameters
    ----------
    text : iterable of token lists
        One token list per document, as accepted by ``lemmatization``.
    threads : int, default 3
        Number of pool workers. Despite the name, ``multiprocessing.Pool``
        starts worker processes, not threads.

    Returns
    -------
    list
        One list of lemmas per input document, in input order — the same
        shape ``lemmatization(text)`` returns.
    """
    # Local import: ``Pool`` is not imported at the top of the notebook.
    from multiprocessing import Pool

    # lemmatization() expects a *collection* of documents. Mapping it over bare
    # documents (as the original did) made it treat every token as a document
    # and join the token's characters with spaces. Wrapping each document in a
    # one-element batch keeps one task per document with the right input shape.
    batches = [[doc] for doc in text]
    if not batches:
        return []

    # The context manager releases the workers even if map() raises.
    with Pool(threads) as pool:
        batch_results = pool.map(lemmatization, batches)

    # Each batch result is a one-element list; unwrap it.
    return [batch_result[0] for batch_result in batch_results]

In [17]:
#text_lemmatized = multithread_lemmatizing(text_bigrams)
#lemmatized_output = []
#for text in text_lemmatized:
#    lemmatized_output.append(text)

In [83]:
# Lemmatize the bigram-merged documents, keeping only tokens with the listed part-of-speech tags.
test_df["text_lemmatized"] = lemmatization(text_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
#options for lemmatization , allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

In [21]:
def total_words(col):
    """Return the combined length of every row in ``col``.

    ``col`` is any iterable of sized items (e.g. token lists); an empty
    iterable gives 0.
    """
    return sum(len(row) for row in col)

In [22]:
total_words(text_bigrams)

1211705

In [59]:
test_df.head()

Unnamed: 0,text,show_names,text_tokenized,text_no_stopwords
4372,1 Here. It's water. I was gonna make tea...,Unbelievable,"[here, it, water, was, gonna, make, tea, but, ...","[water, gonna, tea, honey, sip, help, judith, ..."
1183,"It was the empire on which the sun never set, ...",Empire,"[it, was, the, empire, on, which, the, sun, ne...","[empire, sun, set, blood, dried, height, brita..."
3933,"Oh, honey, are you gorgeous! You look just lik...",The Nanny,"[oh, honey, are, you, gorgeous, you, look, jus...","[honey, gorgeous, virgin, brought, crackers, m..."
354,1 [Lively jazz music plays.] Stephen King's...,Bag of Bones,"[lively, jazz, music, plays, stephen, king, ba...","[lively, jazz, music, plays, stephen, king, ba..."
764,"1 Watch out, Tomoya-kun! Are you all right, T...",Clannad: After Story,"[watch, out, tomoya, kun, are, you, all, right...","[watch, tomoya, kun, tomoya, kun, yeah, spaced..."


In [17]:
#dictionary
id2word = corpora.Dictionary(test_df["text_no_stopwords"])

#the lemmatized text
texts = test_df["text_no_stopwords"]

#term doc frequency (corpus)
corpus = [id2word.doc2bow(text) for text in texts]

In [18]:
# Human-readable view of the first document's bag of words: (token, count) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('aa', 3),
  ('abc', 1),
  ('ability', 2),
  ('abouta', 1),
  ('aboutyour', 1),
  ('absolutely', 7),
  ('abstinent', 1),
  ('accept', 1),
  ('acceptance', 1),
  ('accepted', 1),
  ('accomplish', 3),
  ('accomplishable', 1),
  ('accomplishments', 1),
  ('accountable', 1),
  ('achieve', 2),
  ('acid', 2),
  ('acknowledge', 1),
  ('acoustic', 1),
  ('acting', 13),
  ('action', 1),
  ('actions', 2),
  ('actor', 1),
  ('actress', 2),
  ('actual', 2),
  ('add', 1),
  ('addled', 2),
  ('addressing', 1),
  ('adhd', 1),
  ('admit', 1),
  ('ado', 4),
  ('adores', 1),
  ('advice', 2),
  ('affect', 2),
  ('affection', 2),
  ('afford', 2),
  ('afraid', 7),
  ('age', 4),
  ('agenda', 1),
  ('agree', 8),
  ('ahead', 3),
  ('aim', 1),
  ('aimlessly', 1),
  ('aina', 2),
  ('air', 1),
  ('airport', 3),
  ('album', 1),
  ('alcohol', 2),
  ('alcoholic', 4),
  ('alcoholics', 2),
  ('alexander', 1),
  ('alexandra', 103),
  ('alienate', 1),
  ('alike', 2),
  ('alive', 8),
  ('alley', 1),
  ('allow', 1),
  

In [19]:
# Fit a 15-topic LDA model on the bag-of-words corpus; random_state is fixed so reruns are repeatable.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=12,
                                           update_every=10,
                                           chunksize=5,
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the top terms of each topic, then apply the trained model to the corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.008*"advice" + 0.006*"micki" + 0.006*"lewis" + 0.005*"lord" + '
  '0.004*"dark" + 0.004*"local" + 0.004*"helena" + 0.004*"earthquake" + '
  '0.003*"imagining" + 0.003*"station"'),
 (1,
  '0.003*"ship" + 0.003*"water" + 0.003*"earth" + 0.002*"fire" + 0.002*"lois" '
  '+ 0.002*"dr" + 0.002*"planet" + 0.002*"lost" + 0.002*"black" + '
  '0.002*"continues"'),
 (2,
  '0.004*"school" + 0.003*"money" + 0.003*"pretty" + 0.002*"friends" + '
  '0.002*"game" + 0.002*"crazy" + 0.002*"playing" + 0.002*"party" + '
  '0.002*"eat" + 0.002*"hot"'),
 (3,
  '0.004*"rosemary" + 0.002*"margaux" + 0.001*"money" + 0.001*"ro" + '
  '0.001*"frost" + 0.001*"nena" + 0.001*"julie" + 0.001*"police" + '
  '0.001*"micki" + 0.001*"marcato"'),
 (4,
  '0.004*"blood" + 0.003*"water" + 0.003*"heart" + 0.002*"food" + '
  '0.002*"doctor" + 0.002*"health" + 0.002*"fact" + 0.002*"frost" + '
  '0.002*"brain" + 0.002*"eat"'),
 (5,
  '0.011*"toast" + 0.004*"rock" + 0.002*"bloody" + 0.002*"purchase" + '
  '0.002*"ursula

In [21]:
# NOTE(review): gensim documents log_perplexity() as a per-word likelihood bound rather than
# the perplexity itself (which would be 2 ** -bound) — confirm the label below is what is meant.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -10.163023786135073


In [22]:
# Score the model's topic coherence with the 'c_v' measure, using the stop-word-filtered token lists as reference texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=test_df["text_no_stopwords"], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.34493726356979965


In [22]:
#stop_words