In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
plt.rcParams['figure.figsize'] = (8.0, 5.5)

In [2]:
!ls -la

total 3168
drwxr-xr-x  3 quang quang    4096 Dec 19 21:16 .
drwxr-xr-x 16 quang quang    4096 Dec 19 19:13 ..
-rwxrwxr-x  1 quang quang   13287 Dec 19 21:16 Basics.ipynb
drwxr-xr-x  2 quang quang    4096 Dec 19 19:08 .ipynb_checkpoints
-rwxr-xr-x  1 quang quang 3213140 Dec 14 18:10 jeopardy.csv


In [3]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jeopardy.columns = [item.strip() for item in jeopardy.columns]

In [6]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


Write a function to normalize questions and answers. It should:
- Take in a string.
- Convert the string to lowercase.
- Remove all punctuation in the string.
- Return the string.

In [8]:
def normalize_string(string):    
    string = string.lower()
    string = re.sub("[\~\!\@\#\$\%\^\&\*\(\)\_\+\{\}\[\]\:\;\"\|\<\>\,\.\/\/\?\\\+\-]", "", string)
    return string

In [9]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_string)

In [10]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_string)

Write a function to normalize dollar values. It should:
- Take in a string.
- Remove any punctuation in the string.
- Convert the string to an integer.
- If the conversion has an error, assign 0 instead.
- Return the integer.

In [11]:
def normalize_dollar(string):
    string = normalize_string(string)
    try:
        return int(string)
    except:
        return 0

In [12]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_dollar)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [13]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
Show Number       19999 non-null int64
Air Date          19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB


In [14]:
jeopardy['Air Date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [15]:
def count_answer_in_question(row):
    match_count = sum([1 if (item in row['clean_question']) & (item != 'the') else 0 \
                        for item in row['clean_answer'].split()])
    if len(row['clean_answer'].split()) == 0:
        return 0
    else:
        return match_count / len(row['clean_answer'].split())

In [16]:
jeopardy['answer_in_question'] = jeopardy.apply(count_answer_in_question, axis=1)

In [17]:
jeopardy['answer_in_question'].mean()

0.07994510517511635

In [18]:
jeopardy = jeopardy.sort_values(by='Air Date')
question_overlap = []
terms_used = set([])
for _, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    split_question = [item for item in split_question if (len(item) >= 6)]
    match_count = 0
    for item in split_question:
        if item in terms_used:
            match_count += 1
        terms_used.add(item)
    if len(split_question) > 0:
        question_overlap.append(match_count / len(split_question)) 
    else:
        question_overlap.append(0)
jeopardy['question_overlap'] = question_overlap

In [19]:
jeopardy['question_overlap'].mean()

0.6848223476273113