In [2]:
import pandas as pd
import re
import numpy as np

In [3]:
df = pd.read_csv('jeopardy.csv')

In [4]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
df.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
df.columns = df.columns.str.strip()

In [7]:
df.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [8]:
def normalize(st):
    st = st.lower()
    return re.sub(r'[^\w\s]','',st)

In [9]:
df['Question'].head()

0    For the last 8 years of his life, Galileo was ...
1    No. 2: 1912 Olympian; football star at Carlisl...
2    The city of Yuma in this state has a record av...
3    In 1963, live on "The Art Linkletter Show", th...
4    Signer of the Dec. of Indep., framer of the Co...
Name: Question, dtype: object

In [10]:
df['clean_question'] = df['Question'].apply(normalize)

In [11]:
df.clean_question.head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [12]:
df.Answer.head()

0    Copernicus
1    Jim Thorpe
2       Arizona
3    McDonald's
4    John Adams
Name: Answer, dtype: object

In [13]:
df['clean_answer'] = df.Answer.apply(normalize)

In [14]:
df.clean_answer.head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [15]:
df.Value.head()

0    $200
1    $200
2    $200
3    $200
4    $200
Name: Value, dtype: object

In [16]:
def normalize_dollars(st):
    try:
        return int(re.sub(r'[^\w\s]','',st))
    except:
        return 0    

In [17]:
df['clean_value'] = df.Value.apply(normalize_dollars)

In [18]:
df.clean_value.head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [19]:
df['Air Date'].head()

0    2004-12-31
1    2004-12-31
2    2004-12-31
3    2004-12-31
4    2004-12-31
Name: Air Date, dtype: object

In [20]:
df['Air Date'] = pd.to_datetime(df['Air Date'])

In [21]:
df['Air Date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [22]:
'a' in 'ab'

True

In [23]:
['ab','a'].remove('ab')

In [24]:
df.iloc[0].clean_answer.split('r')

['cope', 'nicus']

In [25]:
def aiq(row):
    split_answer = row.clean_answer.split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in row.clean_question:
            match_count += 1
    return match_count / len(split_answer)


In [26]:
df.apply(aiq, axis=1).mean()

0.08865522363697204

it appears that, on average, only 8% of the words in the answer are present in the question

In [27]:
df['answer_in_question'] = df.apply(aiq, axis=1)

In [28]:
df['answer_in_question'].value_counts().head()

0.000000    16181
0.500000     2017
0.333333      868
0.250000      247
1.000000      190
Name: answer_in_question, dtype: int64

# recycled questions

In [29]:
question_overlap = []
terms_used = set()

In [30]:
for index, series in df.iterrows():
    split_question = series.clean_question.split(' ')
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

In [31]:
question_overlap[:5]

[0.0, 0.08333333333333333, 0.1, 0.1111111111111111, 0.3333333333333333]

In [32]:
df['question_overlap'] = question_overlap

In [33]:
df.question_overlap.head()

0    0.000000
1    0.083333
2    0.100000
3    0.111111
4    0.333333
Name: question_overlap, dtype: float64

In [34]:
df.question_overlap.mean()

0.8040316262530017

it appears that around 80% of the meaningful words in the questions have been used in previous questions, so it seems that studying past jeopardy questions is a great way to prepare oneself for a competition.

# observed counts

In [38]:
def high(row):
    if row.clean_value > 800:
        return 1
    else:
        return 0
df['high_value'] = df.apply(high, axis=1)

In [40]:
df.high_value.value_counts()

0    14265
1     5734
Name: high_value, dtype: int64

In [50]:
def hclc(word):
    low_count = 0
    high_count = 0
    for i, row in df.iterrows():
        split_q = row.clean_question.split(' ')
        if word in split_q:
            if row.high_value:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [65]:
observed_expected = []

In [62]:
comparison_terms = list(terms_used)[1:6]

In [69]:
comparison_terms

['lotto', 'excitement', 'punish', 'rapper', 'blackie']

In [67]:
for term in comparison_terms:
    observed_expected.append(hclc(term))

In [68]:
observed_expected

[(0, 2), (1, 0), (0, 3), (1, 4), (0, 1)]

# expected counts and chi-squared

In [79]:
from scipy.stats import chisquare

In [73]:
high_value_count = df.high_value.value_counts()[1]

In [74]:
low_value_count = df.high_value.value_counts()[0]

In [90]:
chi_squared = []

In [91]:
for tup in observed_expected:
    total = tup[0] + tup[1]
    total_prop = total / df.shape[0]
    exp_h = total_prop * high_value_count
    exp_l = total_prop * low_value_count
    chi_squared.append(chisquare(f_obs = [tup[0], tup[1]], f_exp = [exp_h, exp_l]))
                                          

In [92]:
chi_squared[:5]

[Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=1.205888538380652, pvalue=0.27214791766901714),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.6680941623250602),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]

# chi-squared results

None of the terms that we examined had a statistically significant different in appearance between high and low value questions. Also, with low frequencies, the validity of our chi-squared conclusion is questionable. A topic for further exploration could inclusion a chi-squared analysis only on words with high frequencies.