In [1]:
import pandas as pd
print(pd.__version__)

0.19.2


### Import Data

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


### Clean Column Names

In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [5]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

### Normalize questions, answers, values, and dates

In [6]:
import string

def cleanString(st):
    st = st.translate(str.maketrans("","", string.punctuation)).lower()
    return st
    


In [7]:
jeopardy['clean_question'] = jeopardy.Question.apply(cleanString)
jeopardy['clean_answer'] = jeopardy.Answer.apply(cleanString)


In [8]:
jeopardy[['Question', 'clean_question', 'Answer', 'clean_answer']].sample(5)

Unnamed: 0,Question,clean_question,Answer,clean_answer
19043,"One of these may be cinch, money or tooled lea...",one of these may be cinch money or tooled leather,a belt,a belt
15593,"(<a href=""http://www.j-archive.com/media/2007-...",a hrefhttpwwwjarchivecommedia20070330j15jpg ta...,AmeriCorps,americorps
15888,In the 1840s Esther Howland made these cards f...,in the 1840s esther howland made these cards f...,Valentines,valentines
14909,"Ms. Gentileschi, the painter, rape victim, & m...",ms gentileschi the painter rape victim movie ...,Artemisia,artemisia
13002,It's a person who loves or collects books,its a person who loves or collects books,a bibliophile,a bibliophile


In [9]:
def normDolvals(st):
    st = st.translate(str.maketrans("","", string.punctuation))
    try:
        intst = int(st)
    except: 
        intst = 0
    return intst

print(normDolvals('$350'))
print(normDolvals('$noint'))


350
0


In [10]:
jeopardy['clean_value'] = jeopardy.Value.apply(normDolvals)

jeopardy[['Value', 'clean_value']].sample(5)

Unnamed: 0,Value,clean_value
19532,$400,400
14659,$1600,1600
11923,$400,400
5333,$400,400
13341,$400,400


In [11]:
jeopardy['clean_airdate'] = pd.to_datetime(jeopardy['Air Date'])

jeopardy[['Air Date', 'clean_airdate']].sample(5)

Unnamed: 0,Air Date,clean_airdate
7379,1998-10-30,1998-10-30
16527,2009-12-21,2009-12-21
1448,2006-03-17,2006-03-17
6177,2006-03-29,2006-03-29
18994,2009-07-02,2009-07-02


### Explore Whether the Answer is Contained in the Question

In [13]:
def countAnsInQ(row):
    split_answer = row.clean_answer.split(' ')
    split_question = row.clean_question.split(' ')
    
    match_count = 0
    
    try:
        split_answer.remove('the')
    except:
        pass
    
    if len(split_answer) == 0:
        return 0
    
    for each in split_answer:
        if each in split_question:
            match_count +=1
        else:
            pass
        
    return match_count/len(split_answer)
    
jeopardy['answer_in_question'] = jeopardy.apply(countAnsInQ, axis = 1)

print(jeopardy['answer_in_question'].mean())


0.0603527738547


The takeaway seems to be that answers are very infrequently in the questions themselves; this happens only about 6% of the time. 

### Explore whether Questions are Reused

In [20]:
question_overlap = []
terms_used = set()

jeopardy.sort_values(by='clean_airdate', inplace=True)

for index,row in jeopardy.iterrows():
    split_question = row.clean_question.split(' ')
    split_question = [i for i in split_question if len(i)>5]
    
    match_count = 0
    match_pct = 0
    for word in split_question:
        if word in terms_used:
            match_count +=1
        else:
            terms_used.add(word)
    
    if len(split_question) > 0:
        match_pct = match_count/len(split_question)
    else:
        pass
    
    question_overlap.append(match_pct)

jeopardy['question_overlap'] = question_overlap

print(jeopardy['question_overlap'].mean())

0.688905531662


It seems that about 70% of words in questions have appeared in previous jeopardy questions. Suggests that it might be well worth while to study the existing set of questions to prepare for future ones. 

### Explore High Value Questions 

In [21]:
def highLow(row):
    if row.clean_value > 800:
        value = 1
    else:
        value = 0
        
    return value

jeopardy['high_value'] = jeopardy.apply(highLow, axis = 1)

In [23]:
def countHighLow(word):
    low_count = 0
    high_count = 0
     
    for index, row in jeopardy.iterrows():
        split_question = row.clean_question.split(' ')
        if word in split_question:
            if row.high_value == 1:
                high_count +=1
            else:
                low_count += 1
        else:
            pass

    return high_count, low_count
        

In [27]:
observed_expected = []

comparison_terms = list(terms_used)[0:5]

print(comparison_terms)

for term in comparison_terms:
    observed_expected.append(countHighLow(term))
    
print(observed_expected)

['fictional', 'aerosmith', 'hanami', 'spared', 'trafficking']
[(5, 5), (0, 2), (0, 1), (1, 0), (0, 1)]


In [26]:
high_value_count = jeopardy[jeopardy.high_value == 1].shape[0]
low_value_count = jeopardy[jeopardy.high_value == 0].shape[0]
print(high_value_count)
print(low_value_count)


5734
14265


In [30]:
from scipy.stats import chisquare

chi_squared = []

for obs in observed_expected:
    total = obs[0]+obs[1]
    total_prop = total/jeopardy.shape[0]
    expHighValcount = total_prop * high_value_count
    expLowValcount = total_prop * low_value_count
    exp = [expHighValcount, expLowValcount]
    chsqrstat, pval = chisquare(obs,exp)
    chi_squared.append([chsqrstat, pval])

print(chi_squared)

[[2.2243874083063973, 0.13584652879916373], [0.80392569225376798, 0.36992223780795708], [0.40196284612688399, 0.52607729857054686], [2.4877921171956752, 0.11473257634454047], [0.40196284612688399, 0.52607729857054686]]


These comparison terms could have appeared in the high and low columns entirely by chance. This is because all the pvalues are much above 5%. 