In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the Jeopardy question dataset from a CSV file in the working directory.
jeopardy = pd.read_csv('JEOPARDY_CSV.csv')
print(jeopardy.shape)
# Strip surrounding whitespace from the column headers so later cells can use
# plain names such as 'Question' and 'Answer'.
jeopardy.columns=jeopardy.columns.str.strip()
print(jeopardy.columns)

(216930, 7)
Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [3]:
# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of on every call, because remove_punctuation is applied to
# each row of several columns.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(x):
    """Return ``x`` with every character in ``string.punctuation`` removed.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        A copy of ``x`` without ASCII punctuation; all other characters
        (letters, digits, whitespace) are left untouched.
    """
    return x.translate(_PUNCTUATION_TABLE)
def str_norm(s):
    """Normalise text for comparison: lower-case it, then drop punctuation.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The lower-cased text with punctuation removed by ``remove_punctuation``.
    """
    return remove_punctuation(s.lower())
# Add a lower-cased, punctuation-free copy of the question text.
jeopardy['clean_question']=jeopardy['Question'].apply(str_norm)
# Cast answers to str first so that str_norm can call .lower() on every value.
# Note that this overwrites the original 'Answer' column in place.
jeopardy['Answer']=jeopardy['Answer'].astype(str)
jeopardy['clean_answer']=jeopardy['Answer'].apply(str_norm)
print(jeopardy.head())

   Show Number    Air Date      Round                         Category Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's   
4  Signer of the Dec. of Indep., framer of the Co...  John Adams   

                                      clean_question clean_answer  
0  for the last 8 years of his life galile

In [4]:
def dollar_norm(s):
    """Convert a dollar-value cell such as ``'$2,000'`` to an ``int``.

    Parameters
    ----------
    s : str or number or None
        Raw cell value. Strings have their punctuation removed before
        conversion; any other type is passed to ``int`` unchanged.

    Returns
    -------
    int
        The parsed amount, or ``0`` when the value is missing or is not a
        whole number (for example ``'None'``, ``NaN`` or ``None``).
    """
    # Only strings carry punctuation. Missing cells can arrive as float NaN or
    # None, which have no .translate() and must not reach remove_punctuation.
    if isinstance(s, str):
        s = remove_punctuation(s)
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: non-numeric text or NaN; OverflowError: inf.
        return 0
# Numeric version of the dollar value (dollar_norm yields 0 where parsing fails).
jeopardy['clean_value']=jeopardy['Value'].apply(dollar_norm)
# Parse the air dates into datetimes, replacing the original string column.
jeopardy['Air Date']=pd.to_datetime(jeopardy['Air Date'])
print(jeopardy.head())

   Show Number   Air Date      Round                         Category Value  \
0         4680 2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680 2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2         4680 2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3         4680 2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4         4680 2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's   
4  Signer of the Dec. of Indep., framer of the Co...  John Adams   

                                      clean_question clean_answer  clean_value  
0  for the last 8 years of his life

The next analysis looks into the possibility of finding answers in the questions.

In [5]:
def answer_in_question(row):
    """Return the fraction of answer words that also occur in the question.

    Parameters
    ----------
    row : mapping
        Must provide ``'clean_answer'`` and ``'clean_question'`` strings.

    Returns
    -------
    int or float
        ``0`` when the answer has no words left after dropping ``'the'``;
        otherwise the number of answer words found among the question's words
        divided by the number of answer words.
    """
    answer_words = [word for word in row['clean_answer'].split() if word != 'the']
    question_words = set(row['clean_question'].split())
    # Guard against division by zero for empty (or 'the'-only) answers.
    if not answer_words:
        return 0
    hits = sum(1 for word in answer_words if word in question_words)
    return hits / len(answer_words)
# Score every row, then report the mean share of answer words found in the question.
jeopardy['answer_in_question']=jeopardy.apply(answer_in_question, axis=1)
print(jeopardy['answer_in_question'].mean())
    

0.05729672120956456


On average, only about 5.7% of the words in an answer (ignoring "the") also appear in its question, so the answer can rarely be deduced from the question text alone.

The next analysis examines the probability of repeated terms in the questions. 

In [6]:
# Every non-stopword term seen so far; filled in by question_overlap() as it
# walks the rows. Re-created here so that re-running the cell starts clean.
terms_used = set()
# Process questions in chronological order so "already used" means "aired earlier".
jeopardy = jeopardy.sort_values('Air Date')

# English stop words are excluded from the overlap calculation.
stop_words = set(stopwords.words('english'))

def question_overlap(row):
    """Return the share of a question's terms that were already seen.

    Stop words are ignored. Each remaining term is checked against the global
    ``terms_used`` set and then added to it, so the result depends on the
    order in which rows are processed, and a term repeated within the same
    question counts as "seen" from its second occurrence onward.

    Parameters
    ----------
    row : mapping
        Must provide a ``'clean_question'`` string.

    Returns
    -------
    int or float
        ``0`` when no terms remain after stop-word removal; otherwise the
        number of previously seen terms divided by the number of terms.
    """
    tokens = [w for w in row['clean_question'].split() if w not in stop_words]
    if not tokens:
        return 0
    seen_before = 0
    for token in tokens:
        if token in terms_used:
            seen_before += 1
        # Record the term only after checking it, so first sightings do not count.
        terms_used.add(token)
    return seen_before / len(tokens)
# Rows are visited in the frame's current order, which determines what counts
# as "previously seen" for each question.
jeopardy['question_overlap'] = jeopardy.apply(question_overlap,axis=1)
print(jeopardy['question_overlap'].mean())
    

0.9281511234778309


On average, about 93% of the non-stopword terms in a question have already appeared in an earlier question. However, single words do not capture the context of a question well; a more meaningful analysis would compare phrases rather than single words when looking for overlap between questions.

The next analysis investigates whether particular terms appear more often in high-value questions (worth more than $800) than in low-value questions.

In [7]:
def cate_value(row):
    """Flag a question as high value.

    Parameters
    ----------
    row : mapping
        Must provide a numeric ``'clean_value'``.

    Returns
    -------
    int
        ``1`` when ``clean_value`` is greater than 800, otherwise ``0``.
    """
    return 1 if row['clean_value'] > 800 else 0

# 1 for questions worth more than $800, 0 otherwise (see cate_value).
jeopardy['high_value'] = jeopardy.apply(cate_value,axis=1)

In [8]:
def cate_word(row, word):
    """Classify one question with respect to a single term.

    Parameters
    ----------
    row : mapping
        Must provide ``'clean_question'`` (str) and ``'high_value'`` (0 or 1).
    word : str
        Term to look for; it must match a whole whitespace-separated word.

    Returns
    -------
    int or float
        ``nan`` when the question does not contain ``word``; otherwise ``1``
        for a high-value question and ``0`` for a low-value one.
    """
    if word not in row['clean_question'].split():
        return float('nan')
    return 1 if row['high_value'] == 1 else 0
# Collects one [high_count, low_count] pair per comparison term.
observed_expected =[]
# NOTE(review): terms_used is a set, so which five terms this slice picks is
# not guaranteed to be the same from one kernel session to the next.
comparison_terms = list(terms_used)[0:5]
print(comparison_terms)


['nandi', 'punahou', 'toinsult', 'clancys', 'cb']


In [9]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every [high_count, low_count] pair.
observed_expected = []
for term in comparison_terms:
    # Per row: 1 = term in a high-value question, 0 = term in a low-value
    # question, NaN = term absent (value_counts ignores NaN by default).
    jeopardy['high_count'] = jeopardy.apply(lambda row: cate_word(row, term), axis=1)
    count_series = jeopardy['high_count'].value_counts()
    # Series.get returns the default when a label is missing, which replaces
    # the previous bare try/except around .loc.
    high_count = count_series.get(1.0, 0)
    low_count = count_series.get(0.0, 0)
    observed_expected.append([high_count, low_count])

In [10]:
from scipy.stats import chisquare

# Number of questions in each value bucket across the whole dataset.
high_value_count = int((jeopardy['high_value'] == 1).sum())
low_value_count = int((jeopardy['high_value'] == 0).sum())
total_rows = jeopardy.shape[0]

chi_squared = []
print(observed_expected)
for observed in observed_expected:
    # Share of all questions that contain this term.
    total = observed[0] + observed[1]
    total_prop = total / total_rows
    # Expected split if the term were distributed across the two buckets in
    # the same proportion as the questions overall.
    expected = [total_prop * high_value_count, total_prop * low_value_count]
    chisquare_val, p_val = chisquare(observed, expected)
    print(chisquare_val, p_val)
    chi_squared.append(chisquare_val)

[[0, 1], [0, 1], [0, 1], [0, 3], [1, 8]]
0.394976464233 0.529695091249
0.394976464233 0.529695091249
0.394976464233 0.529695091249
1.1849293927 0.276354749133
1.31225445952 0.251986878548


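A chi-squared test is used to check whether particular terms are associated with high-value questions. The null hypothesis is that a term's occurrences are split between high-value and low-value questions in the same proportion as the questions overall. Five terms were tested, and the null hypothesis could not be rejected for any of them (every p-value is above 0.25). Note that each of these terms occurs only a handful of times, so the expected counts are very small and the chi-squared approximation is not reliable for them; terms with higher frequencies would give a more meaningful test.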

In [14]:
# Show the five categories with the most questions.
print(jeopardy['Category'].value_counts().head())

BEFORE & AFTER      547
SCIENCE             519
LITERATURE          496
AMERICAN HISTORY    418
POTPOURRI           401
Name: Category, dtype: int64


The above are the five most common categories in the Jeopardy! question dataset.