# Naive Bayesian classifier for Jeopardy! question data

Import the necessary modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import string

Convert the json file of Jeopardy! questions into a dataframe

In [2]:
df = pd.read_json('jeopardy.json')

Print the head of df

In [36]:
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number,target
0,HISTORY,2004-12-31,for the last year of his life galileo wa unde...,200,Copernicus,Jeopardy!,4680,low
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,no olympian footbal star at carlisl indian s...,200,Jim Thorpe,Jeopardy!,4680,low
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,the citi of yuma in this state ha a record ave...,200,Arizona,Jeopardy!,4680,low
3,THE COMPANY LINE,2004-12-31,in live on the art linklett show this compani...,200,McDonald\'s,Jeopardy!,4680,low
4,EPITAPHS & TRIBUTES,2004-12-31,signer of the dec of indep framer of the const...,200,John Adams,Jeopardy!,4680,low


print the number of rows and columns in the dataframe

In [4]:
df.shape

(216930, 7)

look for rows with missing values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     216930 non-null  object
 1   air_date     216930 non-null  object
 2   question     216930 non-null  object
 3   value        213296 non-null  object
 4   answer       216930 non-null  object
 5   round        216930 non-null  object
 6   show_number  216930 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


drop the rows with missing values

In [6]:
df= df.dropna()

confirm the rows with missing data were dropped

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213296 entries, 0 to 216928
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     213296 non-null  object
 1   air_date     213296 non-null  object
 2   question     213296 non-null  object
 3   value        213296 non-null  object
 4   answer       213296 non-null  object
 5   round        213296 non-null  object
 6   show_number  213296 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 13.0+ MB


convert currency values into integers

In [8]:
df["value"] = df["value"].replace("[$,]", "", regex=True).astype(int)

find the median value between high and low

In [9]:
df.describe()

Unnamed: 0,value,show_number
count,213296.0,213296.0
mean,752.595923,4264.415943
std,637.855303,1386.153625
min,5.0,1.0
25%,400.0,3349.0
50%,600.0,4490.0
75%,1000.0,5393.0
max,18000.0,6300.0


create the binary labels for the target

In [10]:
df['target'] = np.where(df['value']>600,'high','low')

combine the textual fields into one column

In [37]:
df["text"] = df["category"] + df["question"] + df["answer"] + df["round"]

make the text lower case

In [38]:
df["text"] = df["text"].str.lower()

remove punctuation

In [39]:
df["text"] = df["text"].str.replace('[{}]'.format(string.punctuation), '')

remove numbers

In [40]:
df["text"] = df["text"].str.replace('[{}]'.format(string.digits), '')

create a stemming object

In [41]:
stemmer = SnowballStemmer('english')

create a stemming function

In [42]:
def stem_func(cell):
    stemmed = ' '.join([stemmer.stem(word) for word in cell.split(' ')])
    return stemmed

apply the stemming function to the question data

In [43]:
df["text"] = df["text"].apply(stem_func)

create a lemmatization object

In [44]:
lemmer = WordNetLemmatizer()

create a lemmatization function

In [45]:
def lemm_func(cell):
    lemmed = ' '.join([lemmer.lemmatize(word) for word in cell.split(' ')])
    return lemmed

apply the lemmatization function to the question data

In [46]:
df["text"] = df["text"].apply(lemm_func)

# Split for Validation

Create a series to store the labels: y

In [47]:
y = df.target

Create training and test sets

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df["text"],y,test_size=0.33,random_state=53)

# TfidfVectorizer

The tf stands for term frequency, which counts the number of times that each word occurs in each row. The idf is an abbreviation for inverse document frequency, which means it gives less weight to common terms and more weight to rare terms. The column for each word becomes a vector in a matrix, with the documents forming its rows.

Initialize a TfidfVectorizer object: tfidf_vectorizer

In [75]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=21000)

Transform the training data: tfidf_train 

In [76]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

Transform the test data: tfidf_test 

In [77]:
tfidf_test = tfidf_vectorizer.transform(X_test)

Print the first 10 features

In [78]:
print(tfidf_vectorizer.get_feature_names()[:10])

['aa', 'aaron', 'aarondoubl', 'aaronjeopardi', 'ab', 'aba', 'abandon', 'abba', 'abbey', 'abbeydoubl']


In [79]:
tfidffeat = tfidf_vectorizer.get_feature_names()
print("There are",len(tfidffeat),"features in the tfidf vectorizer object for the train data")

There are 21000 features in the tfidf vectorizer object for the train data


Create the TfidfVectorizer DataFrame: tfidf_df

In [81]:
tfidf_df = pd.DataFrame(tfidf_train.A,
                        columns=tfidf_vectorizer.get_feature_names())

Print a sample of the rows in tfidf_df

In [82]:
tfidf_df.sample(frac=0.00008)

Unnamed: 0,aa,aaron,aarondoubl,aaronjeopardi,ab,aba,abandon,abba,abbey,abbeydoubl,...,zoologyin,zoologyth,zoologythi,zoom,zooth,zorba,zorro,zsa,zsit,zulu
41703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Classifier for TfidfVectorizer

This classifier uses a multinomial distribution, which calculates the probability of success given a certain number of categories and trials. The tfidf vector values give the classifier's naive bayes formula a better start at predicting which target label each row class belongs to.

Create a Multinomial Naive Bayes classifier: nb_classifier

In [83]:
nb_classifier = MultinomialNB()

Fit the classifier to the training data

In [84]:
nb_classifier.fit(tfidf_train, y_train)

MultinomialNB()

Create the predicted tags: pred

In [85]:
pred = nb_classifier.predict(tfidf_test)

Compute accuracy score

In [86]:
print ("accuracy score:",metrics.accuracy_score(y_test, pred))
print()

accuracy score: 0.6157867818378132



Calculate the confusion matrix and report

In [87]:
print ("confusion matrix:")
print (metrics.confusion_matrix(y_test, pred, labels=['high','low']))
print()
print ("classification report:")
print (metrics.classification_report(y_test, pred))

confusion matrix:
[[11095 19562]
 [ 7482 32249]]

classification report:
              precision    recall  f1-score   support

        high       0.60      0.36      0.45     30657
         low       0.62      0.81      0.70     39731

    accuracy                           0.62     70388
   macro avg       0.61      0.59      0.58     70388
weighted avg       0.61      0.62      0.59     70388

