In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head(5)
 


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [3]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [4]:
import re

def normalize(sent):
    sent = sent.lower()
    sent = re.sub("[^A-Za-z0-9\s]","",sent)
    return sent


jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)
    

In [5]:
def normalizeDollar(val):
    val = re.sub("[^A-Za-z0-9\s]","",val)
    try:
        res = int(val)   
    except Exception:
        res = 0   
    return res


jeopardy["clean_value"] = jeopardy["Value"].apply(normalizeDollar)   

In [6]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [7]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [8]:
def  findSimilarity(ser):
    split_answer = ser["clean_answer"].split(" ")
    split_question = ser["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
        
    if len(split_answer)==0:
        return 0
    for val in split_answer:
        if val in split_question:
            match_count  = match_count + 1
     
    return match_count/len(split_answer)
    

jeopardy["answer_in_question"] = jeopardy.apply(findSimilarity,axis = 1)

meanAnswerInQuestion =  jeopardy["answer_in_question"].mean()

In [9]:
print(meanAnswerInQuestion)

0.0604932570693


This means only 6 % of the times, answer appears in the question . This a very low figure and thus the technique cannot be relied on.

In [10]:
question_overlap = []
terms_used = set()

In [11]:
for i,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [val for val in split_question if len(val)>=6]
    match_count= 0
    
    for word in split_question:
        if word in terms_used:
            match_count = match_count + 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    
    question_overlap.append(match_count)
    
jeopardy["question_overlap"] = question_overlap

meanval = jeopardy["question_overlap"].mean()          

In [12]:
print(meanval)

0.690873731567


This shows that there is around 69% of the terms in new questions appear in old questions also. This requires more analysis.

In [13]:
def seperateHighValQues(row):
    if row["clean_value"] > 800 :
        value = 1
    else:
        value = 0
    
    return value


jeopardy["high_value"] = jeopardy.apply(seperateHighValQues,axis=1)


In [14]:
def findCountInQuestions(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        words = row["clean_question"].split(" ")
        if word in words:
            if row["high_value"]==1:
                high_count += 1
            else:
                low_count += 1
     
    return high_count,low_count

observed_expected = []

comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    valueCount = findCountInQuestions(term)
    observed_expected.append(valueCount)
    
observed_expected

[(0, 2), (1, 2), (0, 1), (0, 2), (0, 1)]

In [16]:
from scipy.stats import chisquare 

import numpy as np


high_value_count = jeopardy[jeopardy["high_value"]==1].shape[0]

low_value_count = jeopardy[jeopardy["high_value"]==0].shape[0]

#total = high_value_count + low_value_count

chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total/jeopardy.shape[0]
    expectedCountHighVal = total_prop * high_value_count
    expectedCountLowVal = total_prop * low_value_count
    
    observed = np.array([obs[0],obs[1]])
    expected = np.array([expectedCountHighVal,expectedCountLowVal])
    
    chi = chisquare(observed,expected)
    chi_squared.append(chi)
    
chi_squared 
    
    
    





[Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.031881167234403623, pvalue=0.85828871632352932),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]