1. Load libraries needed for this module's work

In [22]:
import json
data=json.load(open('jeopardy.json'))

In [51]:
import pandas as pd
import os
import numpy as np
import nltk 
import sklearn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [65]:
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

2. Looking at the Data

In [42]:
with open('jeopardy.json') as f:
    data = json.load(f)

# Convert the data to a Pandas DataFrame
df = pd.DataFrame(data)


In [43]:
df

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680
...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999


3. Prepping the Data

Drop category, air_date, question, and show_number from df

In [44]:
columns_to_drop = ['category', 'air_date', 'answer', 'show_number']
df = df.drop(columns_to_drop, axis=1)


In [45]:
df

Unnamed: 0,question,value,round
0,"'For the last 8 years of his life, Galileo was...",$200,Jeopardy!
1,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jeopardy!
2,'The city of Yuma in this state has a record a...,$200,Jeopardy!
3,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,Jeopardy!
4,"'Signer of the Dec. of Indep., framer of the C...",$200,Jeopardy!
...,...,...,...
216925,'This Puccini opera turns on the solution to 3...,$2000,Double Jeopardy!
216926,'In North America this term is properly applie...,$2000,Double Jeopardy!
216927,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Double Jeopardy!
216928,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Double Jeopardy!


Convert to lower case.

In [46]:
columns_to_lower = ['question', 'value', 'round']
for col in columns_to_lower:
    df[col] = df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [57]:
df

Unnamed: 0,question,value,round,question_tokens,value_tokens,round_tokens
0,"'for the last 8 years of his life, galileo was...",$200,jeopardy!,"['for, the, last, 8, years, of, his, life, ,, ...","[$, 200]","[jeopardy, !]"
1,'no. 2: 1912 olympian; football star at carlis...,$200,jeopardy!,"['no, ., 2, :, 1912, olympian, ;, football, st...","[$, 200]","[jeopardy, !]"
2,'the city of yuma in this state has a record a...,$200,jeopardy!,"['the, city, of, yuma, in, this, state, has, a...","[$, 200]","[jeopardy, !]"
3,"'in 1963, live on ""the art linkletter show"", t...",$200,jeopardy!,"['in, 1963, ,, live, on, ``, the, art, linklet...","[$, 200]","[jeopardy, !]"
4,"'signer of the dec. of indep., framer of the c...",$200,jeopardy!,"['signer, of, the, dec., of, indep., ,, framer...","[$, 200]","[jeopardy, !]"
...,...,...,...,...,...,...
216925,'this puccini opera turns on the solution to 3...,$2000,double jeopardy!,"['this, puccini, opera, turns, on, the, soluti...","[$, 2000]","[double, jeopardy, !]"
216926,'in north america this term is properly applie...,$2000,double jeopardy!,"['in, north, america, this, term, is, properly...","[$, 2000]","[double, jeopardy, !]"
216927,"'in penny lane, where this ""hellraiser"" grew u...",$2000,double jeopardy!,"['in, penny, lane, ,, where, this, ``, hellrai...","[$, 2000]","[double, jeopardy, !]"
216928,"'from ft. sill, okla. he made the plea, arizon...",$2000,double jeopardy!,"['from, ft., sill, ,, okla., he, made, the, pl...","[$, 2000]","[double, jeopardy, !]"


Clean the missing values.

In [54]:
df = df.fillna('Unknown')


Tokenize the words.

In [55]:
def tokenize_text(text):
    return nltk.word_tokenize(text)


df['question_tokens'] = df['question'].apply(tokenize_text)
df['value_tokens'] = df['value'].apply(tokenize_text)
df['round_tokens'] = df['round'].apply(tokenize_text)


In [58]:
df

Unnamed: 0,question,value,round,question_tokens,value_tokens,round_tokens
0,"'for the last 8 years of his life, galileo was...",$200,jeopardy!,"['for, the, last, 8, years, of, his, life, ,, ...","[$, 200]","[jeopardy, !]"
1,'no. 2: 1912 olympian; football star at carlis...,$200,jeopardy!,"['no, ., 2, :, 1912, olympian, ;, football, st...","[$, 200]","[jeopardy, !]"
2,'the city of yuma in this state has a record a...,$200,jeopardy!,"['the, city, of, yuma, in, this, state, has, a...","[$, 200]","[jeopardy, !]"
3,"'in 1963, live on ""the art linkletter show"", t...",$200,jeopardy!,"['in, 1963, ,, live, on, ``, the, art, linklet...","[$, 200]","[jeopardy, !]"
4,"'signer of the dec. of indep., framer of the c...",$200,jeopardy!,"['signer, of, the, dec., of, indep., ,, framer...","[$, 200]","[jeopardy, !]"
...,...,...,...,...,...,...
216925,'this puccini opera turns on the solution to 3...,$2000,double jeopardy!,"['this, puccini, opera, turns, on, the, soluti...","[$, 2000]","[double, jeopardy, !]"
216926,'in north america this term is properly applie...,$2000,double jeopardy!,"['in, north, america, this, term, is, properly...","[$, 2000]","[double, jeopardy, !]"
216927,"'in penny lane, where this ""hellraiser"" grew u...",$2000,double jeopardy!,"['in, penny, lane, ,, where, this, ``, hellrai...","[$, 2000]","[double, jeopardy, !]"
216928,"'from ft. sill, okla. he made the plea, arizon...",$2000,double jeopardy!,"['from, ft., sill, ,, okla., he, made, the, pl...","[$, 2000]","[double, jeopardy, !]"


Lemmatization

In [60]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize the text
def lemmatize_text(text):
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

df['question_lemmatized'] = df['question'].apply(lemmatize_text)
df['value_lemmatized'] = df['value'].apply(lemmatize_text)  
df['round_lemmatized'] = df['round'].apply(lemmatize_text)

Remove punctuation and special characters.

In [62]:
df['question'] = df['question'].str.replace(r'[^\w\s]', '')
df['value'] = df['value'].str.replace(r'[^\w\s]', '')
df['round'] = df['round'].str.replace(r'[^\w\s]', '')

Remove stopwords. - also add unknown since above

In [73]:
english_stopwords = set(stopwords.words('english') + list('punctuation') + ['..', '...', '....', "''", '//n'])

stop_words = english_stopwords
stop_words.add('unknown')  # Add 'unknown' to the stopwords

# Define a function to remove stopwords
def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    return ' '.join(tokens)

# Apply the function to the question column
df['clean_question'] = df['question'].apply(remove_stopwords)
df['clean_value'] = df['value'].apply(remove_stopwords)
df['clean_round'] = df['round'].apply(remove_stopwords)

4. Extracting Features

Create a dictionary to correlate value levels to difficulty levels. (E.g., $200 =1)

In [92]:
import re

# Normalize 'value' to an integer (e.g. "$200" -> 200, "unknown" -> None)
def normalize_value(v):
    if v is None:
        return None
    s = str(v)
    digits = re.sub(r'\D', '', s)   # remove non-digits
    return int(digits) if digits else None

# Map integer dollar values to difficulty levels (use integers as keys)
dollar_value_map = {
    200: 1,
    400: 2,
    600: 3,
    800: 4,
    1000: 5,
    1200: 6,
    1600: 7,
    2000: 8,
}

# Map cleaned numeric values to 1-9 difficulty levels (fallback to 9)
df['DiffLevelVal'] = df['value'].apply(normalize_value).map(dollar_value_map).fillna(9).astype(int)


Creates a dictionary to map rounds to higher and lower difficulty levels

In [108]:

round_type_map = { 
    'jeopardy!': 1, 
    'double jeopardy!': 2, 
    'final jeopardy!': 3 
}

df['DiffLevelRound'] = df['round'].apply(lambda x: round_type_map.get(x) if x else None)


In [99]:
# Create a dictionary to map round types to numerical values
round_type_map = {
    'Jeopardy!': 1,
    'Double Jeopardy!': 2,
    'Final Jeopardy!': 3
}

df['DiffLevelRound'] = df['round'].map(round_type_map)

In [109]:
df

Unnamed: 0,question,value,round,question_tokens,value_tokens,round_tokens,question_lemmatized,value_lemmatized,round_lemmatized,clean_question,clean_value,clean_round,DiffLevelVal,DiffLevelRound
0,"'for the last 8 years of his life, galileo was...",$200,jeopardy!,"['for, the, last, 8, years, of, his, life, ,, ...","[$, 200]","[jeopardy, !]","'for the last 8 year of his life , galileo wa ...",$ 200,jeopardy !,"'for last 8 years life , galileo house arrest ...",$ 200,jeopardy !,1,1.0
1,'no. 2: 1912 olympian; football star at carlis...,$200,jeopardy!,"['no, ., 2, :, 1912, olympian, ;, football, st...","[$, 200]","[jeopardy, !]",'no . 2 : 1912 olympian ; football star at car...,$ 200,jeopardy !,'no . 2 : 1912 olympian ; football star carlis...,$ 200,jeopardy !,1,1.0
2,'the city of yuma in this state has a record a...,$200,jeopardy!,"['the, city, of, yuma, in, this, state, has, a...","[$, 200]","[jeopardy, !]",'the city of yuma in this state ha a record av...,$ 200,jeopardy !,"'the city yuma state record average 4,055 hour...",$ 200,jeopardy !,1,1.0
3,"'in 1963, live on ""the art linkletter show"", t...",$200,jeopardy!,"['in, 1963, ,, live, on, ``, the, art, linklet...","[$, 200]","[jeopardy, !]","'in 1963 , live on `` the art linkletter show ...",$ 200,jeopardy !,"'in 1963 , live `` art linkletter show , compa...",$ 200,jeopardy !,1,1.0
4,"'signer of the dec. of indep., framer of the c...",$200,jeopardy!,"['signer, of, the, dec., of, indep., ,, framer...","[$, 200]","[jeopardy, !]","'signer of the dec. of indep. , framer of the ...",$ 200,jeopardy !,"'signer dec. indep. , framer constitution mass...",$ 200,jeopardy !,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216925,'this puccini opera turns on the solution to 3...,$2000,double jeopardy!,"['this, puccini, opera, turns, on, the, soluti...","[$, 2000]","[double, jeopardy, !]",'this puccini opera turn on the solution to 3 ...,$ 2000,double jeopardy !,'this puccini opera turns solution 3 riddles p...,$ 2000,double jeopardy !,8,2.0
216926,'in north america this term is properly applie...,$2000,double jeopardy!,"['in, north, america, this, term, is, properly...","[$, 2000]","[double, jeopardy, !]",'in north america this term is properly applie...,$ 2000,double jeopardy !,'in north america term properly applied 4 spec...,$ 2000,double jeopardy !,8,2.0
216927,"'in penny lane, where this ""hellraiser"" grew u...",$2000,double jeopardy!,"['in, penny, lane, ,, where, this, ``, hellrai...","[$, 2000]","[double, jeopardy, !]","'in penny lane , where this `` hellraiser '' g...",$ 2000,double jeopardy !,"'in penny lane , `` hellraiser grew , barber s...",$ 2000,double jeopardy !,8,2.0
216928,"'from ft. sill, okla. he made the plea, arizon...",$2000,double jeopardy!,"['from, ft., sill, ,, okla., he, made, the, pl...","[$, 2000]","[double, jeopardy, !]","'from ft. sill , okla. he made the plea , ariz...",$ 2000,double jeopardy !,"'from ft. sill , okla. made plea , arizona lan...",$ 2000,double jeopardy !,8,2.0


Add these to the data frame together

In [120]:
# Create a new column that combines the value and round type
df['combined_value'] = df['DiffLevelVal'] + (df['DiffLevelRound'])

In [121]:
df

Unnamed: 0,question,value,round,question_tokens,value_tokens,round_tokens,question_lemmatized,value_lemmatized,round_lemmatized,clean_question,clean_value,clean_round,DiffLevelVal,DiffLevelRound,combined_value
0,"'for the last 8 years of his life, galileo was...",$200,jeopardy!,"['for, the, last, 8, years, of, his, life, ,, ...","[$, 200]","[jeopardy, !]","'for the last 8 year of his life , galileo wa ...",$ 200,jeopardy !,"'for last 8 years life , galileo house arrest ...",$ 200,jeopardy !,1,1.0,2.0
1,'no. 2: 1912 olympian; football star at carlis...,$200,jeopardy!,"['no, ., 2, :, 1912, olympian, ;, football, st...","[$, 200]","[jeopardy, !]",'no . 2 : 1912 olympian ; football star at car...,$ 200,jeopardy !,'no . 2 : 1912 olympian ; football star carlis...,$ 200,jeopardy !,1,1.0,2.0
2,'the city of yuma in this state has a record a...,$200,jeopardy!,"['the, city, of, yuma, in, this, state, has, a...","[$, 200]","[jeopardy, !]",'the city of yuma in this state ha a record av...,$ 200,jeopardy !,"'the city yuma state record average 4,055 hour...",$ 200,jeopardy !,1,1.0,2.0
3,"'in 1963, live on ""the art linkletter show"", t...",$200,jeopardy!,"['in, 1963, ,, live, on, ``, the, art, linklet...","[$, 200]","[jeopardy, !]","'in 1963 , live on `` the art linkletter show ...",$ 200,jeopardy !,"'in 1963 , live `` art linkletter show , compa...",$ 200,jeopardy !,1,1.0,2.0
4,"'signer of the dec. of indep., framer of the c...",$200,jeopardy!,"['signer, of, the, dec., of, indep., ,, framer...","[$, 200]","[jeopardy, !]","'signer of the dec. of indep. , framer of the ...",$ 200,jeopardy !,"'signer dec. indep. , framer constitution mass...",$ 200,jeopardy !,1,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216925,'this puccini opera turns on the solution to 3...,$2000,double jeopardy!,"['this, puccini, opera, turns, on, the, soluti...","[$, 2000]","[double, jeopardy, !]",'this puccini opera turn on the solution to 3 ...,$ 2000,double jeopardy !,'this puccini opera turns solution 3 riddles p...,$ 2000,double jeopardy !,8,2.0,10.0
216926,'in north america this term is properly applie...,$2000,double jeopardy!,"['in, north, america, this, term, is, properly...","[$, 2000]","[double, jeopardy, !]",'in north america this term is properly applie...,$ 2000,double jeopardy !,'in north america term properly applied 4 spec...,$ 2000,double jeopardy !,8,2.0,10.0
216927,"'in penny lane, where this ""hellraiser"" grew u...",$2000,double jeopardy!,"['in, penny, lane, ,, where, this, ``, hellrai...","[$, 2000]","[double, jeopardy, !]","'in penny lane , where this `` hellraiser '' g...",$ 2000,double jeopardy !,"'in penny lane , `` hellraiser grew , barber s...",$ 2000,double jeopardy !,8,2.0,10.0
216928,"'from ft. sill, okla. he made the plea, arizon...",$2000,double jeopardy!,"['from, ft., sill, ,, okla., he, made, the, pl...","[$, 2000]","[double, jeopardy, !]","'from ft. sill , okla. he made the plea , ariz...",$ 2000,double jeopardy !,"'from ft. sill , okla. made plea , arizona lan...",$ 2000,double jeopardy !,8,2.0,10.0


5. Splitting the Data

In [114]:
# Split the data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(df['DiffLevelVal'], df['DiffLevelRound'], test_size=0.2, random_state=42)


6. Convert to numerical format use TF-IDF to extract features

In [116]:
# choose a text column as features and the numeric difficulty as labels
X = df['clean_question']            # or 'question_lemmatized' / 'question'
y = df['DiffLevelVal']              # or DiffLevelRound if that's the target

# ensure no None values
X = X.fillna('')
y = y.fillna(9).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train.astype(str))
X_test_vectors  = vectorizer.transform(X_test.astype(str))


6. Training the Module

Use MultinomialNB to training the model with the training data

In [117]:
# Train a Multinomial Naive Bayesian classifier
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


7. Evaluating the Module

Look at Accuracy

In [118]:
# Evaluate the model
y_pred = clf.predict(X_test_vectors)
print("Accuracy:", clf.score(X_test_vectors, y_test))

Accuracy: 0.19517816807265018


Finishing Up

In [123]:
# Save DataFrame to CSV
df.to_csv('data.csv', index=False)


In [127]:
# Create a DataFrame with the predictions and actual labels
predictions_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)


