In [2]:
import numpy as np
import pandas as pd
from scipy.stats import chisquare,chi2_contingency
import matplotlib.pyplot as plt

In [3]:
jeopardy=pd.read_csv('jeopardy.csv')

In [4]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
jeopardy.tail()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
19994,3582,2000-03-14,Jeopardy!,U.S. GEOGRAPHY,$200,"Of 8, 12 or 18, the number of U.S. states that...",18
19995,3582,2000-03-14,Jeopardy!,POP MUSIC PAIRINGS,$200,...& the New Power Generation,Prince
19996,3582,2000-03-14,Jeopardy!,HISTORIC PEOPLE,$200,In 1589 he was appointed professor of mathemat...,Galileo
19997,3582,2000-03-14,Jeopardy!,1998 QUOTATIONS,$200,"Before the grand jury she said, ""I'm really so...",Monica Lewinsky
19998,3582,2000-03-14,Jeopardy!,LLAMA-RAMA,$200,Llamas are the heftiest South American members...,Camels


In [6]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [7]:
jeopardy.index

RangeIndex(start=0, stop=19999, step=1)

In [9]:
jeopardy.columns=jeopardy.columns.str.replace(" ","")

In [10]:
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [11]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
ShowNumber    19999 non-null int64
AirDate       19999 non-null object
Round         19999 non-null object
Category      19999 non-null object
Value         19999 non-null object
Question      19999 non-null object
Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


## Normalizing Columns

In [12]:
import string
import re
def normalize_text(s):
    mytable=s.maketrans("","",string.punctuation) #creates a mapping table;replaces param1 with param2 and removes param3
    s=s.translate(mytable) #translates a string using a dictionary mapping or a mapping table. we have used a mapping table here.
    s = re.sub("[^A-Za-z0-9\s]", "", s)
    s=s.lower()
    return s
jeopardy['clean_question']=jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer']=jeopardy['Answer'].apply(normalize_text)
    

In [13]:
jeopardy['clean_answer']

0             copernicus
1             jim thorpe
2                arizona
3              mcdonalds
4             john adams
              ...       
19994                 18
19995             prince
19996            galileo
19997    monica lewinsky
19998             camels
Name: clean_answer, Length: 19999, dtype: object

In [14]:
jeopardy['clean_question']

0        for the last 8 years of his life galileo was u...
1        no 2 1912 olympian football star at carlisle i...
2        the city of yuma in this state has a record av...
3        in 1963 live on the art linkletter show this c...
4        signer of the dec of indep framer of the const...
                               ...                        
19994    of 8 12 or 18 the number of us states that tou...
19995                             the new power generation
19996    in 1589 he was appointed professor of mathemat...
19997    before the grand jury she said im really sorry...
19998    llamas are the heftiest south american members...
Name: clean_question, Length: 19999, dtype: object

In [16]:
def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [17]:
import datetime
jeopardy['clean_date']=pd.to_datetime(jeopardy['AirDate'])

In [18]:
jeopardy['AirDate']

0        2004-12-31
1        2004-12-31
2        2004-12-31
3        2004-12-31
4        2004-12-31
            ...    
19994    2000-03-14
19995    2000-03-14
19996    2000-03-14
19997    2000-03-14
19998    2000-03-14
Name: AirDate, Length: 19999, dtype: object

## Answers in Questions

In [19]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 11 columns):
ShowNumber        19999 non-null int64
AirDate           19999 non-null object
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
clean_date        19999 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 1.7+ MB


In [26]:
def answers_in_questions(row):
    split_answer=row['clean_answer'].split()
    split_question=row['clean_question'].split()
    if "the" in split_answer:
        split_answer.remove("the")
    match_count=0
    if len(split_answer)==0:
        return 0
    else:
        for i in split_answer:
            if i in split_question:
                match_count+=1
        return float(match_count/len(split_answer))
jeopardy['answer_in_question']=jeopardy.apply(answers_in_questions,axis=1)
 
# def count_matches(row):
#     split_answer = row["clean_answer"].split()
#     split_question = row["clean_question"].split()
#     if "the" in split_answer:
#         split_answer.remove("the")
#     if len(split_answer) == 0:
#         return 0
#     match_count = 0
#     for item in split_answer:
#         if item in split_question:
#             match_count += 1
#     return match_count / len(split_answer)

# jeopardy["answer_in_question"] = jeopardy[0:3].apply(count_matches, axis=1)
    


In [31]:
jeopardy['answer_in_question'].mean()

0.059001965249777744

On average, the answer only makes up for about 6% of the question. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

## Recycled Questions

In [60]:
question_overlap=[]
terms_used=set()
jeopardy=jeopardy.sort_values(by='clean_date')
for i,row in jeopardy.iterrows():
    split_question=row['clean_question'].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count=0
    for word in split_question:
        if word in terms_used:
            match_count+=1
    for word in split_question:
            terms_used.add(word)
    if len(split_question)>0:
        match_count=match_count/len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap']=question_overlap

            

In [61]:
jeopardy['question_overlap'].mean()

0.6876928874002114

There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

## Low Value vs High Value Questions

In [66]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [67]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [68]:
from random import choice

terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1),
 (2, 5),
 (0, 1),
 (0, 1),
 (0, 1),
 (20, 36),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1)]

In [69]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=3.423170782846152e-05, pvalue=0.9953317740648371),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=1.3582267046198502, pvalue=0.24384504569745835),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]

## Chi-squared Results

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.