In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
import re, logging, warnings
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
warnings.filterwarnings("ignore",category=DeprecationWarning)


import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [2]:
text = pd.read_csv("../data/show_text_combined.csv")

In [3]:
text.head()

Unnamed: 0,text,show_names
0,Can it be the breeze that fills the trees Wit...,'Allo 'Allo!
1,I'd like to marry a girl that's skinny. I thin...,'Til Death
2,It's that dream again. Just who is that guy? W...,07 Ghost
3,1 You all might have top-class credentials fro...,"1,000 Places To See Before You Die"
4,1 Welcome to 10 O'Clock Live. It's Wednesday ...,10 O'Clock Live


### Test LDA Run with subset of data
borrowed liberally from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
___

In [18]:
test_df = text.sample(frac=.25, random_state=740)

#### Load in Stop Words from SpaCy and nltk

In [4]:
#comparing stop words from spacy and nltk
nlp = spacy.load("en")

In [9]:
spacy_stop_words = nlp.Defaults.stop_words

In [10]:
len(spacy_stop_words)

326

In [13]:
nltk_stop_words = stopwords.words("english")

In [14]:
len(nltk_stop_words)

179

In [15]:
diff_words = []
for word in spacy_stop_words:
    if word not in nltk_stop_words:
        diff_words.append(word)
diff_words

['whether',
 'much',
 'made',
 'perhaps',
 'last',
 'whence',
 'wherein',
 'would',
 'hereby',
 'formerly',
 'moreover',
 'every',
 'since',
 'first',
 'whither',
 'make',
 'somehow',
 'already',
 'sixty',
 'various',
 'latterly',
 'sometime',
 'keep',
 'give',
 'go',
 'together',
 'may',
 'still',
 'ca',
 'across',
 'though',
 'per',
 'many',
 'whoever',
 'towards',
 'n’t',
 'serious',
 'else',
 'eleven',
 'three',
 'next',
 'others',
 'used',
 'among',
 'move',
 'also',
 "'ll",
 'mostly',
 "'ve",
 'twenty',
 'somewhere',
 'eight',
 'fifty',
 'thereafter',
 '‘s',
 'cannot',
 'even',
 'say',
 'empty',
 'take',
 "'d",
 'top',
 'amount',
 '’m',
 'anyone',
 'someone',
 '‘ll',
 'without',
 'hereupon',
 'side',
 '‘ve',
 '’re',
 'otherwise',
 'must',
 'show',
 'afterwards',
 'whenever',
 'bottom',
 'thru',
 'front',
 'get',
 'rather',
 'another',
 'twelve',
 'ten',
 'alone',
 'although',
 "'m",
 'ever',
 'see',
 '‘m',
 'please',
 'unless',
 'everything',
 'neither',
 'enough',
 'five',
 'whe

there are more words in the spacy stop words list will try that one first then will try the nltk list
#### And now time to tokenize the text
setting deacc=True in order to remove punctuation

In [21]:
def tokenizer(text):
    for word in text:
        yield(gensim.utils.simple_preprocess(str(word), deacc=True))

test_df["text_tokenized"] = list(tokenizer(test_df["text"]))
        

In [22]:
test_df.head()

Unnamed: 0,text,show_names,text_tokenized
727,"How about a beer, chief? How about an ID? An I...",Cheers,"[how, about, beer, chief, how, about, an, id, ..."
2160,I am fascinated by last words. Like Oscar Wil...,Looking for Alaska,"[am, fascinated, by, last, words, like, oscar,..."
3216,1 [Machine-gun fire.] Keep your heads down! W...,Six,"[machine, gun, fire, keep, your, heads, down, ..."
1244,"1 HE SPEAKS OWN LANGUAGE BONE CRACKS, HE SCRE...",Famalam,"[he, speaks, own, language, bone, cracks, he, ..."
1708,SORRY NO .idx-.sub FILES IN THE OS There is a...,Hostages,"[sorry, no, idx, sub, files, in, the, os, ther..."


#### Create bigrams and trigram models

In [24]:
bigram = gensim.models.Phrases(test_df["text_tokenized"], min_count=5, threshold=50)
#trigrams are madde by applying the same method to the bigram output that made the bigrams from the te
trigram = gensim.models.Phrases(bigram[test_df["text_tokenized"]], threshold=50)

bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)


#### define functions to remove stopwords, make bigrams, trigrams, then lemmatize

In [25]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in spacy_stop_words] for doc in text]

def make_bigrams(text):
    return [bigram_model[doc] for doc in text]

def make_trigrams(text):
    return [trigram_model[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text_out = []
    for sent in text:
        doc = nlp(" ".join(sent)) 
        text_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return text_out


In [26]:
test_df["text_no_stopwords"] = remove_stopwords(test_df["text_tokenized"])

In [27]:
test_df["text_bigrams"] = make_bigrams(test_df["text_no_stopwords"])

In [None]:
nlp = spacy.load('en')

In [29]:
test_df["text_lemmatized"] = lemmatization(test_df["text_bigrams"], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

ValueError: [E088] Text of length 2094722 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.