In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')

In [3]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns = [c.strip() for c in jeopardy.columns]
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

# Normalization

In [5]:
import string

def normalize_val(val):
    val = val.lower()
    val = ''.join([i for i in val if i not in frozenset(string.punctuation)])
    return val

print(normalize_val("abcDEF,:'"))

abcdef


In [6]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_val)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_val)

In [7]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [8]:
def normalize_dollar(val):
    val = ''.join([i for i in val if i not in frozenset(string.punctuation)])
    try:
        val = int(val)
    except:
        val = 0
    return val

In [9]:
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_dollar)

In [10]:
jeopardy["clean_value"].dtype

dtype('int64')

In [11]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [12]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
Show Number       19999 non-null int64
Air Date          19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB


# How often is a jeopardy answer deducible from the question?

There is an average of 0.06 words in common between answers and questions in the jeopardy dataset. This implies that the clue to answering the question is not likely to be found in the question itself, at least in terms of an exact word hint.

In [13]:
def calc_overlap(row):
    split_answer = row["clean_answer"].split(' ')
    split_question = row["clean_question"].split(' ')
    
    if "the" in split_answer:
        split_answer.remove("the")
    
    if len(split_answer) == 0:
        return 0
    
    match_count = 0
    for w in split_answer:
        if w in split_question:
            match_count += 1
    
    return match_count / len(split_answer)

In [14]:
jeopardy["answer_in_question"] = jeopardy.apply(calc_overlap, axis=1)

In [15]:
jeopardy["answer_in_question"].mean()

0.060352773854698942

# Recycled Questions

Over the time span of this jeopardy dataset, the average reuse of large (>6 character) words from previous questions is about 68%. This indicates that studying previous questions is a reasonably good strategy.

In [16]:
jeopardy = jeopardy.sort_values("Air Date")

In [17]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0


In [18]:
question_overlap = []
terms_used = set()

for idx, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(' ')
    split_question = [w for w in split_question if len(w) >=6]
    
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
        
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap

In [19]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0,0.5
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0,0.0


In [20]:
jeopardy.mean()

Show Number           4312.730537
clean_value            748.336267
answer_in_question       0.060353
question_overlap         0.688906
dtype: float64

# Chi Squared

According to this analysis, none of the first five terms that have been reused in multiple questions have statistically significant imbalanced distributions between high and low value answers. This is backed by p-values from chi-squared tests -- none of the sampled terms have a p-value <= 0.05.

Therefore a study strategy focused on learning only previously high value questioned would not yield better results (i.e. more winnings).

In [21]:
def is_high_value(row):
    if row["clean_value"] > 800:
        return 1
    return 0

In [22]:
jeopardy["high_value"] = jeopardy.apply(is_high_value, axis=1)

In [23]:
jeopardy["high_value"].value_counts()

0    14265
1     5734
Name: high_value, dtype: int64

In [24]:
def get_high_low_count(word):
    high_count, low_count = 0, 0
    for idx, row in jeopardy.iterrows():
        question_words = row["clean_question"].split(' ')
        for w in question_words:
            if w == word:
                if row["high_value"] == 1:
                    high_count += 1
                else:
                    low_count += 1
    return high_count, low_count
        

In [25]:
observed_expected = []
comparison_terms = list(terms_used)[:20]

for term in comparison_terms:
    observed_expected.append(get_high_low_count(term))

In [26]:
print(comparison_terms)
print(observed_expected)

['reading', 'crucible', 'pearson', 'homegrown', 'chiricahua', 'sherwood', 'members', 'pollution', 'bermuda', 'unpredictably', 'carbohydrates', '2syllable', 'peccary', 'desire', 'disinter', 'conways', 'thrush', 'middlebury', 'alcatraz', 'reykjanes']
[(7, 9), (1, 3), (0, 1), (1, 0), (2, 1), (2, 1), (14, 45), (1, 2), (0, 4), (0, 1), (0, 1), (1, 2), (0, 1), (4, 2), (0, 1), (0, 1), (0, 2), (0, 1), (2, 0), (1, 0)]


In [27]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

In [28]:
high_value_count, low_value_count

(5734, 14265)

In [29]:
from scipy.stats import chisquare
import numpy as np

chi_squared = []

total_row_count = jeopardy.shape[0]

for observed_high_count, observed_low_count in observed_expected:
    
    total = observed_high_count + observed_low_count
    total_prop = total / total_row_count
    
    expected_high_count = high_value_count * total_prop
    expected_low_count = low_value_count * total_prop

    observed = np.array([observed_high_count, observed_low_count])
    expected = np.array([expected_high_count, expected_low_count])

    chi_squared.append(chisquare(observed, expected))

In [30]:
chi_squared

[Power_divergenceResult(statistic=1.7788002674291046, pvalue=0.18229671571722328),
 Power_divergenceResult(statistic=0.026364433084407689, pvalue=0.87101348468892104),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.1177104383031944, pvalue=0.14560406868264344),
 Power_divergenceResult(statistic=2.1177104383031944, pvalue=0.14560406868264344),
 Power_divergenceResult(statistic=0.70477997249648072, pvalue=0.40118220035492103),
 Power_divergenceResult(statistic=0.031881167234403623, pvalue=0.85828871632352932),
 Power_divergenceResult(statistic=1.607851384507536, pvalue=0.20479409439225948),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.031881167234403623, pvalue=0.858288716323529