This program utilizes Naive Bayesian analysis to assess for a relationship between high- or low-value Jeopardy questions and the textual content of questions and answers.  

In [1]:
# The natural language toolkit for tokenizing words and sentences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
#scikit-learn is a key data science module like pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import os
import re
import config5
import pandas as pd
from string import punctuation
from datetime import datetime
from io import StringIO

english_stopwords = set(stopwords.words('english')+list(punctuation)+['..','...','....','``','//n',"''"])

In [2]:
# read the jeopardy data JSON file
with open('jeopardy.json', 'r') as jeopardy:
    data=jeopardy.read()

df = pd.read_json(StringIO(data))
df['value'] = df['value'].str.replace(',', '').str.replace('$', '')
df['value'] = pd.to_numeric(df['value'])

df.head()

  df['value'] = df['value'].str.replace(',', '').str.replace('$', '')


Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",200.0,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,200.0,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,200.0,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",200.0,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",200.0,John Adams,Jeopardy!,4680


In [4]:
# let's check what our value range looks like
value_counts = df['value'].sort_values(axis=0).value_counts(sort=False)
value_counts[:1000]

# we can see some unusual value figures from double jeopardy

5.0           9
20.0          1
22.0          1
50.0          1
100.0      9029
200.0     30455
250.0         5
300.0      8663
350.0         2
367.0         1
400.0     42244
500.0      9016
585.0         1
600.0     20377
601.0         1
700.0       203
750.0         4
796.0         1
800.0     31860
900.0       114
1000.0    21640
Name: value, dtype: int64

In [14]:
# locate the dollar amount that is a good midpoint for splitting the range of entries
half_of_answers = int(len(df)/2)
accumulated_answers=0
dollar_threshold=0
for dollar_amount, value_count in value_counts.items():
    accumulated_answers+=value_count
    if accumulated_answers > half_of_answers:
        dollar_threshold = dollar_amount
        break;

print('Anything over '+str(dollar_threshold)+' will be considered a high value question')
print ('(encompasses '+str(accumulated_answers)+' of '+str(len(df))+' answers)')

Anything over 600.0 will be considered a high value question
(encompasses 119805 of 216930 answers)


In [15]:
# prepare value arrays 
is_high_value = []
q_and_a       = []
dollar_values = []

lemmatizer = WordNetLemmatizer()

for index,row in df.iterrows():
    lemm_text = [lemmatizer.lemmatize(x) for x in word_tokenize((row['question']+' '+row['answer']).lower()) if x not in english_stopwords]
    q_and_a.append(' '.join(lemm_text))
    if row.value <= 600.0:
        is_high_value.append(0)
    else:
        is_high_value.append(1)
    dollar_values.append(row.value)

In [16]:
# make a new dataframe using our prepared value arrays
df_prepared = pd.DataFrame({"dollar_value":dollar_values,"is_high_value":is_high_value,"q_and_a":q_and_a})

In [17]:
# split out our train and test subsets
x_train, x_test, y_train, y_test = train_test_split(df_prepared.q_and_a, df_prepared.is_high_value, random_state=1)

# vectorize the text
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# getting a whole set of vectors with words and their frequency
x_train_tf = tfidf_vectorizer.fit_transform(x_train)

# this is our second half from the split that we are preparing to use for prediction with naive bayes
x_test_tf = tfidf_vectorizer.transform(x_test)

In [19]:
# ask NB to try fitting this with our training data
naive_bayes = MultinomialNB()
naive_bayes.fit(x_train_tf, y_train)

# now try to predict for the second half to see if you can get status right
predictions = naive_bayes.predict(x_test_tf)

# how well did we do?
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:',accuracy)

Accuracy: 0.5972009662013903


We can see that this did not work very well in this case due to the complexity of the relationships involved.  I re-ran it with category included in the textual content out of curiosity and the accuracy was almost identical.

In [25]:
# drop an output file that indicates the result
with open('module 5 basics.txt', 'w') as f:
    f.write('Unsuccessful Naive Bayes analysis of $600 +/- value question relationship with question content (accuracy:'+str(accuracy)+")")