## Latent Dirichlet Allocation

In [10]:
import warnings
warnings.filterwarnings('ignore')

  and should_run_async(code)


In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pyLDAvis
import pyLDAvis.gensim_models

In [12]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [13]:
from textblob import TextBlob
from wordcloud import WordCloud
import spacy

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora

In [16]:
df = pd.read_csv("combined_season1-37.tsv.zip", delimiter='\t')

In [17]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,100,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-
1,1,200,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-
2,1,800,yes,LAKES & RIVERS,-,River in this famous song:,the Volga River,1984-09-10,-
3,1,400,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-
4,1,500,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-
...,...,...,...,...,...,...,...,...,...
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-
389443,2,1600,no,FOUNDRY,-,"Once one of the largest of its kind, the Gary ...",U.S. Steel,2021-08-13,-


In [18]:
df['air_date'] = pd.to_datetime(df['air_date'])

In [19]:
#On 11/26/2001, the values for the questions doubled for both rounds of Jeopardy. Need to adjust the earlier episodes to have the same values as post-11/26/2001 shows.
df.loc[df['air_date'] < '2001-11-26', "value"] = df.value * 2

In [20]:
#remove Daily Doubles since the contestants can wager any amounts for those
df = df[df["daily_double"] != 'yes']

In [21]:
#keep only standard values (this will remove Final Jeopardy questions, which do not have a set amount and set are at '0', as well as the handful of non-standard values that are likely typos)
df = df.loc[df['value'].isin([200, 400, 600, 800, 1000, 400, 800, 1200, 1600, 2000])]

In [22]:
#create a new column that contains questions, answers, and category
df["q_a_and_cat"] = df["answer"] + ' ' + df["question"] + ' ' + df["category"]

In [40]:
#tokenize words in "q_and_a" column
df['q_and_a'] = df['q_and_a'].apply(word_tokenize)

#lemmatize "q_and_a" column
lemmatizer = WordNetLemmatizer()
df['q_and_a'] = df['q_and_a'].apply(lambda row:[lemmatizer.lemmatize(word) for word in row])

df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes,q_and_a
0,1,200,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-,"[River, mentioned, most, often, in, the, Bible..."
1,1,400,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-,"[Scottish, word, for, lake, loch]"
3,1,800,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-,"[American, river, only, 33, mile, shorter, tha..."
4,1,1000,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-,"[World, 's, largest, lake, ,, nearly, 5, time,..."
5,1,200,no,INVENTIONS,-,Marconi's wonderful wireless,the radio,1984-09-10,-,"[Marconi, 's, wonderful, wireless, the, radio]"
...,...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,LOST,-,"In ""A Moveable Feast"", Gertrude Stein is quote...",Lost Generation,2021-08-13,-,"[In, ``, A, Moveable, Feast, '', ,, Gertrude, ..."
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-,"[This, hefty, noisemaker, from, Whitechapel, F..."
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-,"[Around, 4,000, year, ago, ,, the, first, foun..."
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-,"[Several, different, foundry, worked, for, 4, ..."


In [76]:
#change the column so it's not a list of lists
df['q_and_a'] = df['q_and_a'].str.join(" ")
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes,q_and_a
0,1,200,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-,R i v e r m e n t i o n e d m o s t o f ...
1,1,400,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-,S c o t t i s h w o r d f o r l a k e ...
3,1,800,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-,A m e r i c a n r i v e r o n l y 3 3 ...
4,1,1000,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-,"W o r l d ' s l a r g e s t l a k e , n ..."
5,1,200,no,INVENTIONS,-,Marconi's wonderful wireless,the radio,1984-09-10,-,M a r c o n i ' s w o n d e r f u l w i r ...
...,...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,LOST,-,"In ""A Moveable Feast"", Gertrude Stein is quote...",Lost Generation,2021-08-13,-,"I n "" A M o v e a b l e F e a s t "" , ..."
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-,T h i s h e f t y n o i s e m a k e r f ...
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-,"A r o u n d 4 , 0 0 0 y e a r s a g o , ..."
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-,S e v e r a l d i f f e r e n t f o u n d ...


In [45]:
#remove stopwords
stop = stopwords.words('english')
df['q_and_a'] = df['q_and_a'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes,q_and_a
0,1,200,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-,River mentioned often Bible Jordan
1,1,400,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-,Scottish word lake loch
3,1,800,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-,American river 33 mile shorter Mississippi Mis...
4,1,1000,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-,"World 's largest lake , nearly 5 time big Supe..."
5,1,200,no,INVENTIONS,-,Marconi's wonderful wireless,the radio,1984-09-10,-,Marconi 's wonderful wireless radio
...,...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,LOST,-,"In ""A Moveable Feast"", Gertrude Stein is quote...",Lost Generation,2021-08-13,-,"In `` A Moveable Feast '' , Gertrude Stein quo..."
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-,This hefty noisemaker Whitechapel Foundry bega...
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-,"Around 4,000 year ago , first foundry Mesopota..."
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-,Several different foundry worked 4 month build...


## NLP Data Cleaning

In [46]:
# import string to remove punctuation

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
def no_punctuation(text):
    nopunct=[words for words in text if words not in string.punctuation]
    words_without_punct=''.join(nopunct)
    return words_without_punct

In [48]:
#remove punctuation and lowercase words in 'category,' 'answer,' and 'question'

df['q_and_a'] = df['q_and_a'].apply(lambda x: no_punctuation(x).lower())

In [49]:
#remove numerals 

df['q_and_a'] = df['q_and_a'].str.replace('\d+', '')

In [1]:
df

NameError: name 'df' is not defined

### LDA with sklearn

In [21]:
#Use CountVectorizer to make a bag of words matrix, excluding stopwords, triwords
vectorizer = CountVectorizer(max_features=1000, stop_words='english', ngram_range= (1, 3))
vectorized_answer = vectorizer.fit_transform(df.answer)

In [22]:
vectorized_answer.shape

(363765, 1000)

In [26]:
#Run LDA
lda = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=54)
lda.fit_transform(vectorized_answer)

array([[0.03333333, 0.03333333, 0.03333333, ..., 0.36666665, 0.03333333,
        0.36666668],
       [0.025     , 0.025     , 0.525     , ..., 0.025     , 0.025     ,
        0.025     ],
       [0.025     , 0.025     , 0.025     , ..., 0.025     , 0.27499606,
        0.52499812],
       ...,
       [0.03333333, 0.03333333, 0.03333333, ..., 0.03333333, 0.36666666,
        0.03333333],
       [0.27499999, 0.025     , 0.025     , ..., 0.025     , 0.025     ,
        0.025     ],
       [0.025     , 0.025     , 0.025     , ..., 0.025     , 0.025     ,
        0.52499765]])

In [29]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))                        

    print()

In [30]:
print("\nTopics in LDA model:")
tf_feature_names = vectorizer.get_feature_names()

print_top_words(lda, tf_feature_names, 40)


Topics in LDA model:
Topic #0:
said letter people play hit dont way family know shows short death god head wife reports led girl leader blue law want includes takes refers thomas told citys books killed baby paul tom central animals hero drama money lord european
Topic #1:
new th war make life century york body took gave team including battle german father countrys general robert kids court military look things april real ship june run female bad daughter royal mans delivers square foot prince united civil yearold
Topic #2:
word clue country book song says company little use red musical number near right large help color car light late ones women building david heart win flag dance region piece does san mary sun western doesnt winner county bridge governor
Topic #3:
called crew years time king won im meaning latin set north museum founded story high feet university second night youll college art dog african oscar bc begins held event stands charles species going original islands field

### LDA with Gensim

In [29]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes,q_a_and_cat
0,1,200,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-,River mentioned most often in the Bible the Jo...
1,1,400,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-,Scottish word for lake loch LAKES & RIVERS
3,1,800,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-,American river only 33 miles shorter than the ...
4,1,1000,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-,"World's largest lake, nearly 5 times as big as..."
5,1,200,no,INVENTIONS,-,Marconi's wonderful wireless,the radio,1984-09-10,-,Marconi's wonderful wireless the radio INVENTIONS
...,...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,LOST,-,"In ""A Moveable Feast"", Gertrude Stein is quote...",Lost Generation,2021-08-13,-,"In ""A Moveable Feast"", Gertrude Stein is quote..."
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-,This hefty noisemaker from Whitechapel Foundry...
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-,"Around 4,000 years ago, the first foundries in..."
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-,Several different foundries worked for 4 month...


In [24]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [25]:
#tokenize text and remove punctuation

def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
data_words = list(sent_to_words(df.q_a_and_cat))
print(data_words[:1])

[['river', 'mentioned', 'most', 'often', 'in', 'the', 'bible', 'the', 'jordan', 'lakes', 'rivers']]


In [26]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['river', 'mentioned', 'most', 'often', 'in', 'the', 'bible', 'the', 'jordan', 'lakes_rivers']


In [27]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [30]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

KeyboardInterrupt: 

In [28]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1)]]


In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=40, 
                                           random_state=100,
                                           update_every=5,
                                           chunksize=1000,
                                           passes=4,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the keyword of topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [41]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [25]:
pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 2.0 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting numpy>=1.20.0
  Downloading numpy-1.22.2-cp38-cp38-macosx_10_14_x86_64.whl (17.6 MB)
[K     |████████████████████████████████| 17.6 MB 5.4 MB/s eta 0:00:01    |▋                               | 337 kB 2.0 MB/s eta 0:00:09     |███▎                            | 1.8 MB 2.0 MB/s eta 0:00:08     |███▊                            | 2.0 MB 2.0 MB/s eta 0:00:08     |████████████████████▏           | 11.1 MB 239 kB/s eta 0:00:28
[?25hCollecting pandas>=1.2.0
  Downloading pandas-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl (11.4 MB)
[K     |████████████████████████████████| 11.4 MB 1.6 MB/s eta 0:00:01    |████████▌                       | 