##  Python probability and statistics using chi-squared tests

We will be working on jeopardy dataset to find out patterns in the questions that could help to win. Each row in the dataset represents a single question on a single episode of Jeopardy. Here are explanations of each column:

Show Number -- the Jeopardy episode number of the show this question was in.

Air Date -- the date the episode aired.

Round -- the round of Jeopardy that the question was asked in. Jeopardy has several rounds as each episode progresses.

Category -- the category of the question.

Value -- the number of dollars answering the question correctly is worth.

Question -- the text of the question.

Answer -- the text of the answer.




In [2]:
import pandas as pd

jeopardy = pd.read_csv("Probability_and_STatistics_in_Python_intermediate\\jeopardy.csv")


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
# strip spaces from the front of the columns
cols = jeopardy.columns.str.strip()
jeopardy.columns = cols
jeopardy.columns
jeopardy[['Question','Value','Air Date']].head(20)


Unnamed: 0,Question,Value,Air Date
0,"For the last 8 years of his life, Galileo was ...",$200,2004-12-31
1,No. 2: 1912 Olympian; football star at Carlisl...,$200,2004-12-31
2,The city of Yuma in this state has a record av...,$200,2004-12-31
3,"In 1963, live on ""The Art Linkletter Show"", th...",$200,2004-12-31
4,"Signer of the Dec. of Indep., framer of the Co...",$200,2004-12-31
5,"In the title of an Aesop fable, this insect sh...",$200,2004-12-31
6,Built in 312 B.C. to link Rome & the South of ...,$400,2004-12-31
7,"No. 8: 30 steals for the Birmingham Barons; 2,...",$400,2004-12-31
8,"In the winter of 1971-72, a record 1,122 inche...",$400,2004-12-31
9,This housewares store was named for the packag...,$400,2004-12-31


In [5]:
# Normalize the columns
def normalize(series):
    import string
    translation = str.maketrans('','',string.punctuation)
    return series.translate(translation).lower()

jeopardy['clean_question']= jeopardy['Question'].apply(normalize)
jeopardy['clean_answer']= jeopardy['Answer'].apply(normalize)


In [6]:
# Normalize and remove dollar sign
def normalize_dollar(series):
    import string
    translation = str.maketrans('','',string.punctuation)
    lower_str =  series.translate(translation).lower()
    replace_str = lower_str.replace('$','')
    try:
        final_result = int(replace_str)

    except:
        final_result = 0
    return final_result    

jeopardy['clean_value']= jeopardy['Value'].apply(normalize_dollar)
jeopardy['clean_value'].head(10)


0    200
1    200
2    200
3    200
4    200
5    200
6    400
7    400
8    400
9    400
Name: clean_value, dtype: int64

In [7]:
# convert string to datetime


In [8]:
# convert string to datetime
from datetime import datetime
jeopardy['date'] = jeopardy['Air Date'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))
jeopardy['date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: date, dtype: datetime64[ns]

In [9]:
# How often answer is deductible from question

def prob_answer(row):
    split_question = row["clean_question"].split(" ")
    split_answer = row["clean_answer"].split(" ")
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count += 1
    return match_count/len(split_answer)   


jeopardy['answer_in_question'] = jeopardy.apply(prob_answer,axis=1)
mean = jeopardy['answer_in_question'].mean()
mean
    


0.060352773854699004

In [26]:
# How often new questions are repeats of older questions.

question_overlap = []
terms_used = set()

for i,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
     
    split_question = [i for i in split_question if len(i) > 5]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count += 1
    for word in split_question:
            terms_used.add(word) 
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
        
jeopardy["question_overlap"] = question_overlap        
print(list(terms_used)[:5])



['combined', 'salmond', 'sweaty', 'superstar', 'tourism']


In [12]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,date,answer_in_question,question_overlap
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,2004-12-31,0.0,0.0
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,2004-12-31,0.0,0.0
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,2004-12-31,0.0,0.0
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200,2004-12-31,0.0,0.0
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200,2004-12-31,0.0,0.0


In [15]:
high_value = jeopardy[jeopardy.clean_value >= 800]

In [32]:
# identify low value and high value words
def find_value(row):
    if row["clean_value"] > 800:
        return 1
    else:
        return 0
jeopardy["high_value"] =jeopardy.apply(find_value,axis=1)

def find_word(word):
    low_count = 0
    high_count = 0
    for i,row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count +=1
            else:
                low_count +=1
                
    return high_count,low_count

observed_expected =[]
comparision_terms = list(terms_used)[:5]

for term in comparision_terms:
    count =  find_word(term)
    observed_expected.append(count)
    
    
print(observed_expected)


[(2, 7), (0, 1), (0, 1), (1, 9), (1, 0)]


In [39]:
high_rows = jeopardy[jeopardy["high_value"]==1].shape[0]
low_rows = jeopardy[jeopardy["high_value"]==0].shape[0]
print(low_rows)

14265


In [45]:
from scipy.stats import chisquare
import numpy as np

chi_squared = []
for i in observed_expected:
    total_count = i[0]+i[1]
    total_prop = total_count/jeopardy.shape[0]
    high_value_exp = total_prop * high_rows
    low_value_exp = total_prop * low_rows
    
    observed = np.array([i[0],i[1]])
    expected = np.array([high_value_exp,low_value_exp])
    chi_squared.append(chisquare(observed, expected))
    
    
print(chi_squared)
    



    

[Power_divergenceResult(statistic=0.18303865877777942, pvalue=0.66877476612797593), Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686), Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686), Power_divergenceResult(statistic=1.7046782653473278, pvalue=0.19167729675916576), Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]
