In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalMaxPooling1D, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
import torch
from tqdm.notebook import tqdm
import random

We start by importing the entire dataset and removing the unlabeled values. Next, we reindex our subset as removing the unlabeled values would've broken the sequence

In [3]:
data_df = pd.read_csv('JEOPARDY_CSV.csv')
data_df = data_df[data_df[' Value'] != 'None']
data_df = data_df.set_index(pd.Series(range(len(data_df))))

print(data_df.shape)
data_df.head()

(213296, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


Next, we breakup the value into bins, so we can solve this as a classification problem

In [4]:
data_df['ValueNum'] = data_df[' Value'].apply(
    lambda value: int(value.replace(',', '').replace('$', '')))

def binning(value):
    if value < 1000:
        return np.round(value, -2)
    elif value < 10000:
        return np.round(value, -3)
    else:
        return np.round(value, -4)

data_df['ValueBins'] = data_df['ValueNum'].apply(binning)

print("Total number of categories:", data_df[' Value'].unique().shape[0])
print("Number of categories after binning:", data_df['ValueBins'].unique().shape[0])
print("\nBinned Categories:", data_df['ValueBins'].unique())

Total number of categories: 149
Number of categories after binning: 21

Binned Categories: [  200   400   600   800  2000  1000  3000  5000   100   300   500  4000
  7000   700  8000  6000 10000   900  9000     0 20000]


There are a huge number of categories and it they could be helpful so we add them as even more text to our Question column
Since there is class imbalance in the data, we use stratified split into training and validation set.

In [44]:
from sklearn.model_selection import StratifiedShuffleSplit
possible_labels = data_df['ValueBins'].unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
print(label_dict)

data_df[' Question'] = data_df[' Question'] + " " + data_df[' Category']
data_df['label'] = data_df['ValueBins'].replace(label_dict)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.7, random_state=0)
for train_index, test_index in sss.split(data_df[' Question'][:], data_df['label'][:]):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = data_df.loc[train_index.tolist(), ' Question'], data_df.loc[test_index.tolist(), ' Question']
    y_train, y_test = data_df.loc[train_index.tolist(), 'label'], data_df.loc[test_index.tolist(), 'label']
    
    print("X TRAIN:", len(X_train), "X TEST:", len(X_test))
    print("Y TRAIN:", len(y_train), "Y TEST:", len(y_test))

{200: 0, 400: 1, 600: 2, 800: 3, 2000: 4, 1000: 5, 3000: 6, 5000: 7, 100: 8, 300: 9, 500: 10, 4000: 11, 7000: 12, 700: 13, 8000: 14, 6000: 15, 10000: 16, 900: 17, 9000: 18, 0: 19, 20000: 20}
TRAIN: [179001 150382 197021 ... 106340 107061 147312] TEST: [ 86035  25027 140381 ... 175773 210558 105916]
X TRAIN: 63988 X TEST: 149308
Y TRAIN: 63988 Y TEST: 149308


FOR LOGISTIC REGRESSION

Next, we use countvectorizer for 2000 features and prepare our data

In [26]:
bow = CountVectorizer(stop_words='english', max_features=2000)
bow.fit(data_df[' Question'])

X_train = bow.transform(X_train)
X_test = bow.transform(X_test)

# y_train = train_labels
# y_test = test_labels

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (170704, 2000)
Shape of X_test: (42592, 2000)
Shape of y_train: (170704,)
Shape of y_test: (42592,)


In [33]:
%%time
lr = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=200)
lr.fit(X_train, y_train)


Wall time: 2min 40s




LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga')

In [11]:
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred))


#     accuracy                           0.19     42592
#    macro avg       0.06      0.06      0.05     42592
# weighted avg       0.16      0.19      0.15     42592

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
         100       0.05      0.00      0.01      1863
         200       0.17      0.14      0.15      6132
         300       0.06      0.00      0.01      1801
         400       0.21      0.57      0.30      8425
         500       0.10      0.01      0.02      1827
         600       0.11      0.01      0.02      4099
         700       0.00      0.00      0.00        41
         800       0.15      0.10      0.12      6279
         900       0.00      0.00      0.00        28
        1000       0.19      0.20      0.20      6720
        2000       0.19      0.10      0.13      4938
        3000       0.00      0.00      0.00       198
        4000       0.00      0.00      0.00       121
        5000       0.00      0.00      0.00        61
        6000       0.00      0.00      0.00        21
        7000       0.00      0.00      0.00         9
        8000       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


The results for logistic regression shows that the less frequent classes do not get classified at all.

FOR LSTM BASED NETWORK

We tokenize and then pad our input to 100 length. 

In [7]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(data_df[' Question'])

train_sequence = tokenizer.texts_to_sequences(data_df.loc[train_index.tolist(), ' Question'])
test_sequence = tokenizer.texts_to_sequences(data_df.loc[test_index.tolist(), ' Question'])

print("Original text:", data_df.loc[0,' Question'])
print("Converted sequence:", train_sequence[0])

Original text: For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory HISTORY
Converted sequence: [2428, 3183, 236, 85, 1189, 4, 2, 640, 1228, 653, 45, 3534, 16, 3, 32696, 7, 681, 16032, 1, 41, 36, 1410]


In [8]:
train_sequence = pad_sequences(train_sequence, maxlen=100)
test_sequence = pad_sequences(test_sequence, maxlen=100)

# le = LabelEncoder()
# le.fit(data_df['ValueBins'])

# y_train = le.transform(train_labels)
# y_test = le.transform(test_labels)

print(train_sequence.shape, test_sequence.shape)
print(y_train.shape, y_test.shape)

(63988, 100) (149308, 100)
(63988,) (149308,)


In [31]:
len(label_dict.keys())

21

In [9]:
num_words = tokenizer.num_words
output_size = len(label_dict.keys())

model = Sequential([
    Embedding(input_dim=num_words, output_dim=200, mask_zero=True, input_length=100),
    Bidirectional(LSTM(150, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(300, activation='relu'),
    Dropout(0.5), #with dropout, accuracy: 0.7942 - val_loss: 5.9379 - val_accuracy: 0.1675
    Dense(output_size, activation='softmax')
    
])

model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          10000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 300)          421200    
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               90300     
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                6321      
Total params: 10,517,821
Trainable params: 10,517,821
Non-trainable params: 0
____________________________________________

In [None]:
%%time
import torch
device = torch.device('cpu')

model.fit(train_sequence, y_train, epochs=10, batch_size=1024, validation_split=0.1)

# Epoch 10/10
# 151/151 [==============================] - 89s 587ms/step - loss: 0.5908 - accuracy: 0.7942 - val_loss: 5.9379 - val_accuracy: 0.1675
# Wall time: 14min 53s

In [19]:
y_pred = model.predict(X_test, batch_size=1024).argmax(axis=1)
print(classification_report(y_test, y_pred))


#     accuracy                           0.16     42592
#    macro avg       0.06      0.06      0.06     42592
# weighted avg       0.16      0.16      0.16     42592

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.08      0.05      0.06      1863
           2       0.17      0.17      0.17      6132
           3       0.05      0.04      0.04      1801
           4       0.21      0.25      0.23      8425
           5       0.06      0.05      0.05      1827
           6       0.10      0.11      0.11      4099
           7       0.00      0.00      0.00        41
           8       0.16      0.18      0.17      6279
           9       0.00      0.00      0.00        28
          10       0.18      0.18      0.18      6720
          11       0.18      0.16      0.17      4938
          12       0.01      0.01      0.01       198
          13       0.00      0.00      0.00       121
          14       0.00      0.00      0.00        61
          15       0.00      0.00      0.00        21
          16       0.00      0.00      0.00         9
          17       0.00    

The results of the model reveal that it is terribly overfitted and could do well if we could use a bigger number of features.

For BERT based CLassifier

We begin by using a BertTokenizer for our input data, pad it to a length of 128 and make batches of size 10 to let our GPU handle them in parallel. The attention mask helps the model identify the padded values.

We also sample the training data randomly while the testing data is sampled sequentially. An adam optimizer is used on the model's parameter.

In [46]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)                                          
encoded_data = tokenizer.batch_encode_plus(X_train.values, add_special_tokens=True, return_attention_mask=True, 
                                           padding=True, truncation=True, max_length=128, return_tensors='pt') #padding='max_length'

encoded_data_val = tokenizer.batch_encode_plus(X_test.values, add_special_tokens=True, return_attention_mask=True, 
                                               padding=True, truncation=True, max_length=128, return_tensors='pt')

dataset_train = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], torch.tensor(y_train.values))
dataset_val = TensorDataset(encoded_data_val['input_ids'], encoded_data_val['attention_mask'], torch.tensor(y_test.values))

print(dataset_train,dataset_val)

<torch.utils.data.dataset.TensorDataset object at 0x00000236C9967EE0> <torch.utils.data.dataset.TensorDataset object at 0x00000236C9967B50>


In [47]:
modelbert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
batch_size = 10
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

optimizer = AdamW(modelbert.parameters(), lr=1e-5, eps=1e-8)
                  
epochs = 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
modelbert.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

The f1 score function and accuracy_per class function are also defined here. Due to time constraints, I was able to run the model for the entire dataset only for 7 epochs.

In [49]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]} Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')

In [48]:
seed_val = 40
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    modelbert.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = modelbert(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    modelbert.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        modelbert.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = modelbert(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(modelbert.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(modelbert.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=6399.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 2.115700426483959



NameError: name 'f1_score_func' is not defined

In [50]:
val_loss, predictions, true_vals = evaluate(dataloader_validation)
val_f1 = f1_score_func(predictions, true_vals)
tqdm.write(f'Validation loss: {val_loss}')
tqdm.write(f'F1 Score (Weighted): {val_f1}')

# Epoch 1
# Training loss: 2.115700426483959

Validation loss: 2.0693240484352384
F1 Score (Weighted): 0.15329579842498575


In [51]:
for epoch in tqdm(range(2, epochs+1)):
    
    modelbert.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        modelbert.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = modelbert(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(modelbert.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(modelbert.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=6399.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 2.0115084042678046
Validation loss: 2.05364848626737
F1 Score (Weighted): 0.1617063270002009


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=6399.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 1.8589922248395017
Validation loss: 2.120268721157672
F1 Score (Weighted): 0.18427438596736023


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=6399.0, style=ProgressStyle(description_wid…


Epoch 4
Training loss: 1.6594503120922226
Validation loss: 2.3032506704250606
F1 Score (Weighted): 0.1887793071984264


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=6399.0, style=ProgressStyle(description_wid…


Epoch 5
Training loss: 1.457618186149323
Validation loss: 2.574431133177649
F1 Score (Weighted): 0.18866741343343718


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=6399.0, style=ProgressStyle(description_wid…


Epoch 6
Training loss: 1.2632711795898766
Validation loss: 2.8256378553907036
F1 Score (Weighted): 0.19158388762383735


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=6399.0, style=ProgressStyle(description_wid…


Epoch 7
Training loss: 1.0964879373457865
Validation loss: 3.1199781913965228
F1 Score (Weighted): 0.1899309377312185


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=6399.0, style=ProgressStyle(description_wid…




KeyboardInterrupt: 

In [52]:
accuracy_per_class(predictions, true_vals)

Class: 200 Accuracy: 5434/21322
Class: 400 Accuracy: 9800/29573
Class: 600 Accuracy: 1288/14265
Class: 800 Accuracy: 3885/22306
Class: 2000 Accuracy: 3141/17663
Class: 1000 Accuracy: 4371/23652
Class: 3000 Accuracy: 0/754
Class: 5000 Accuracy: 0/225
Class: 100 Accuracy: 638/6320
Class: 300 Accuracy: 326/6064
Class: 500 Accuracy: 652/6311
Class: 4000 Accuracy: 0/410
Class: 7000 Accuracy: 0/42
Class: 700 Accuracy: 0/142
Class: 8000 Accuracy: 0/30
Class: 6000 Accuracy: 0/97
Class: 10000 Accuracy: 0/32
Class: 900 Accuracy: 0/80
Class: 9000 Accuracy: 0/11
Class: 0 Accuracy: 0/8
Class: 20000 Accuracy: 0/1
