# Winning Jeopardy

In [1]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')

In [2]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [6]:
# Write a function to normalize questions and answers.

import re

def norm_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

In [7]:
# Normalize the Question column. 
jeopardy['clean_question'] = jeopardy['Question'].apply(norm_text)

In [8]:
# Normalize the Answer column. 
jeopardy['clean_answer'] = jeopardy['Answer'].apply(norm_text)

In [9]:
# Write a function to normalize dollar values:
def norm_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text) 
    except Exception:
        text = 0
    return text

In [10]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norm_values)

In [11]:
# convert the Air Date column to a datetime column.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [12]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [26]:
# How often the answer is deducible from the question.


def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)
# Pass the axis=1 argument to apply the function across each row.


In [27]:
jeopardy['answer_in_question'].mean()

0.06049325706933587

The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

In [31]:
# How often new questions are repeats of older ones

question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values("Air Date")

for index,row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    match_count = 0
    split_question = [q for q in split_question if len(q) > 5]

    for word in split_question:
        if word in terms_used:
            match_count += 1
        else:
            terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap

jeopardy['question_overlap'].mean()
    

0.6894492274698669

In [34]:
def value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

# Determine which questions are high and low value
jeopardy['high_value'] = jeopardy.apply(value, axis=1)
jeopardy['high_value'].value_counts()

0    14265
1     5734
Name: high_value, dtype: int64

In [45]:
def word_value(word):
    low_count = 0
    high_count = 0
    for i,row in jeopardy.iterrows():
        split_question = row['clean_question'].split()
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [46]:
observed_expected = []

terms_used = list(terms_used)

comparison_terms = terms_used[:5]

In [47]:
for word in comparison_terms:
    high_low = word_value(word)
    observed_expected.append(high_low)

In [48]:
print(observed_expected)

[(1, 0), (0, 1), (0, 3), (1, 2), (0, 4)]


In [50]:
high_value_count = len(jeopardy[jeopardy['high_value'] == 1])
print(high_value_count)

5734


In [51]:
low_value_count = len(jeopardy[jeopardy['high_value'] == 0])
print(low_value_count)

14265


In [58]:
from scipy.stats import chisquare
import numpy as np

chi_squared = []

for pair in observed_expected:
    total = sum(pair)
    total_prop = total / jeopardy.shape[0]
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    observed = np.array([pair[0], pair[1]])
    expected = np.array([expected_high, expected_low])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=1.205888538380652, pvalue=0.27214791766902047),
 Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293),
 Power_divergenceResult(statistic=1.607851384507536, pvalue=0.20479409439225948)]