# Naive Bayesian classifier for Jeopardy! question data

Import the necessary modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import string



Convert the json file of Jeopardy! questions into a dataframe

In [2]:
df = pd.read_json('jeopardy.json')

Print the head of df

In [3]:
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680


print the number of rows and columns in the dataframe

In [4]:
df.shape

(216930, 7)

look for rows with missing values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     216930 non-null  object
 1   air_date     216930 non-null  object
 2   question     216930 non-null  object
 3   value        213296 non-null  object
 4   answer       216930 non-null  object
 5   round        216930 non-null  object
 6   show_number  216930 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


drop the rows with missing values

In [6]:
df= df.dropna()

confirm the rows with missing data were dropped

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213296 entries, 0 to 216928
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     213296 non-null  object
 1   air_date     213296 non-null  object
 2   question     213296 non-null  object
 3   value        213296 non-null  object
 4   answer       213296 non-null  object
 5   round        213296 non-null  object
 6   show_number  213296 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 13.0+ MB


convert currency values into integers

In [8]:
df["value"] = df["value"].replace("[$,]", "", regex=True).astype(int)

find the median value between high and low

In [9]:
df.describe()

Unnamed: 0,value,show_number
count,213296.0,213296.0
mean,752.595923,4264.415943
std,637.855303,1386.153625
min,5.0,1.0
25%,400.0,3349.0
50%,600.0,4490.0
75%,1000.0,5393.0
max,18000.0,6300.0


create the binary labels for the target

In [10]:
df['target'] = np.where(df['value']>600,'high','low')

combine the textual fields into one column

In [11]:
df["text"] = df["category"] + df["question"] + df["answer"] + df["round"]

make the text lower case

In [12]:
df["text"] = df["text"].str.lower()

remove punctuation

In [13]:
df["text"] = df["text"].str.replace('[{}]'.format(string.punctuation), '')

remove numbers

In [14]:
df["text"] = df["text"].str.replace('[{}]'.format(string.digits), '')

create a stemming object

In [15]:
stemmer = SnowballStemmer('english')

create a stemming function

In [16]:
def stem_func(cell):
    stemmed = ' '.join([stemmer.stem(word) for word in cell.split(' ')])
    return stemmed

apply the stemming function to the question data

In [17]:
df["text"] = df["text"].apply(stem_func)

create a lemmatization object

In [18]:
lemmer = WordNetLemmatizer()

create a lemmatization function

In [19]:
def lemm_func(cell):
    lemmed = ' '.join([lemmer.lemmatize(word) for word in cell.split(' ')])
    return lemmed

apply the lemmatization function to the question data

In [20]:
df["text"] = df["text"].apply(lemm_func)

# Split for Validation

Create a series to store the labels: y

In [21]:
y = df.target

Create training and test sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df["text"],y,test_size=0.33,random_state=53)

# CountVectorizer

This method puts together a list of all the words in the train data. This list is called a vocabulary. Each row in the train data is called a document. This method next creates a dataframe that has the same number of rows as the train data, but each word will get its own column. At the intersection of each column and row in the dataframe is a count of the number of times the word for that column occured in the document for that row.

Initialize a CountVectorizer object: count_vectorizer

In [23]:
count_vectorizer = CountVectorizer(max_features=20000)

Transform the training data using only the 'text' column values: count_train 

In [24]:
count_train = count_vectorizer.fit_transform(X_train)

Transform the test data using only the 'text' column values: count_test 

In [25]:
count_test = count_vectorizer.transform(X_test)

Print the first 10 features of the count_vectorizer

In [26]:
print(count_vectorizer.get_feature_names()[:10])

['aa', 'aaron', 'aarondoubl', 'aaronjeopardi', 'ab', 'aba', 'abandon', 'abba', 'abbey', 'abbeydoubl']


In [27]:
countfeat = count_vectorizer.get_feature_names()
print("There are",len(countfeat),"features in the count vectorizer object for the train data")

There are 20000 features in the count vectorizer object for the train data


Create the CountVectorizer DataFrame: count_df

In [28]:
count_df = pd.DataFrame(count_train.A,
                        columns=count_vectorizer.get_feature_names())

Print a sample of the rows in count_df

In [29]:
count_df.sample(frac=0.00008)

Unnamed: 0,aa,aaron,aarondoubl,aaronjeopardi,ab,aba,abandon,abba,abbey,abbeydoubl,...,zoologya,zoologyin,zoologyth,zoologythi,zoom,zooth,zorba,zorro,zsa,zulu
63329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14652,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11057,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Classifier for CountVectorizer

In Naive Bayes, the rows become classes and the word vector columns become features. The algorithm assumes, naively, that the probability of each feature happening is independent of all other features. However, it uses the Bayes formula from statistics to calculate, quite reliably, the probability that a class belong to a target label (fake or real news), based on a series of probabilities that are already known about the features and classes.

Create a Multinomial Naive Bayes classifier: nb_classifier

In [30]:
nb_classifier = MultinomialNB()

Fit the classifier to the training data

In [31]:
nb_classifier.fit(count_train, y_train)

MultinomialNB()

Create the predicted tags: pred

In [32]:
pred = nb_classifier.predict(count_test)

Compute accuracy score

In [33]:
print ("accuracy score:",metrics.accuracy_score(y_test, pred))
print()

accuracy score: 0.6401943513098823



Calculate the confusion matrix and report

In [34]:
print ("confusion matrix:")
print (metrics.confusion_matrix(y_test, pred, labels=['high','low']))
print()
print ("classification report:")
print (metrics.classification_report(y_test, pred))

confusion matrix:
[[17060 13597]
 [11729 28002]]

classification report:
              precision    recall  f1-score   support

        high       0.59      0.56      0.57     30657
         low       0.67      0.70      0.69     39731

    accuracy                           0.64     70388
   macro avg       0.63      0.63      0.63     70388
weighted avg       0.64      0.64      0.64     70388



# Deep Learning Approach

In [35]:
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Activation, Dropout
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras import utils
import re

In [36]:
def preprocess_text(sen):
    # Remove numbers
    sentence = re.sub(r'\d+', ' ', sen)

    # Remove most punctuation
    punc = '''!()[]{};:"\,<>./?@#$%^&*_~'''
    for ele in sentence:  
        if ele in punc:  
            sentence = sentence.replace(ele, " ") 
    
    # make the characters lowercase
    sentence = sentence.lower()

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Remove stop words
    stop_words = set(stopwords.words("english"))
    sentence = ' '.join([word for word in sentence.split(' ') if word not in stop_words])
    
    # Use stemming
    sentence = ' '.join([stemmer.stem(word) for word in sentence.split(' ')])

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    return sentence

In [37]:
df["preprocessed"] = df["question"].map(preprocess_text)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df["preprocessed"], df["target"], test_size=0.10, random_state=42)

In [39]:
X_train.head()

203110                         caraca br bueno air br quito
99391     richard mulligan pass muster cast general cust...
104129            on june churchil said battl franc begin '
109670    this former minist agricultur rural develop be...
63543                                           knut hamsun
Name: preprocessed, dtype: object

In [40]:
train_posts = X_train
train_tags = y_train

test_posts = X_test
test_tags = y_test

In [41]:
max_words = 10000
tokenize = Tokenizer(num_words=max_words, char_level=False)

In [42]:
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

In [43]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [44]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [45]:
# Inspect the dimenstions of our training and test data
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (191966, 10000)
x_test shape: (21330, 10000)
y_train shape: (191966, 2)
y_test shape: (21330, 2)


In [46]:
# set the batch size for training and the number of epochs in which we will cycle through the training data 
batch_size = 32
epochs = 8

In [47]:
# Build the model
model = Sequential()
model.add(Dense(128, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.60))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [48]:
# train the model
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [49]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.5844350457191467
