# Jeopardy Questions
    

    
    

In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

## Cleaning datas

In [3]:
import re

new_name = []
for string in jeopardy.columns:
    new = re.sub("^\s+|\s+$", "", string)
    new_name.append(new)

jeopardy.columns = new_name
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

### Normalize text 

In [4]:
import string

def normalize_text(text) :
    text = text.lower()
    text = ' '.join(word.strip(string.punctuation) for word in text.split())
    text = re.sub("^\s+|\s+$|\s\s+", "", text)
    return text

jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

In [5]:
jeopardy['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [6]:
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3    mcdonald's
4    john adams
Name: clean_answer, dtype: object

### Convert into numerical values 

In [7]:
import re

def normlalize_value(text):
        text = ' '.join(word.strip(string.punctuation) for word in text.split())
        text = re.sub("[$€]","",text)
        try:
            value = int(text)
        except Exception:
            value = 0
        return value 

In [8]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normlalize_value)
jeopardy['clean_value'].head(20)

0     200
1     200
2     200
3     200
4     200
5     200
6     400
7     400
8     400
9     400
10    400
11    400
12    600
13    600
14    600
15    600
16    600
17    600
18    800
19    800
Name: clean_value, dtype: int64

In [None]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [None]:
jeopardy.head(7)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 b.c to link romethe south of ital...,the appian way,400


In [None]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

# Questions
* How often the answer is deducible from the question ?
* How often new questions are repeats of older questions ?

**Checking if a word in the question is in the answer**  
*except "the, a, what, where"*

In [None]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    extra_word = ["the", "a", "what", "where"]
    for term in extra_word :
        if term in split_answer:
            split_answer.remove(term)
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return (match_count / len(split_answer))

jeopardy["answer_in_question"]  = jeopardy.apply(count_matches, axis=1)

### Average words corresponding amout all

In [None]:
jeopardy["answer_in_question"].mean()

### Average of questions having a correspondance

In [None]:
len(jeopardy[jeopardy["answer_in_question"] != 0])/len(jeopardy)

In [None]:
len(jeopardy[jeopardy["answer_in_question"] >= 0.1])/len(jeopardy)

In [None]:
len(jeopardy[jeopardy["answer_in_question"] >= 0.5])/len(jeopardy)

In [None]:
jeopardy.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

percents = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
results = []
for perc in percents:
    min = jeopardy["answer_in_question"] > (perc - 0.1)
    max = jeopardy["answer_in_question"] <= perc
    results.append(len(jeopardy[max & min])/len(jeopardy))

x_label = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.bar(percents, results, width=0.1, align='edge', tick_label=x_label )
plt.show()

We can observe that there is 5% of responses holding 0.4% of commun words. Wich is a bordeline result regarding players help to find answers.

# Questions overlap

In [None]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

In [None]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()