In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
import collections
import math
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf

# Single seed shared by every source of randomness in this notebook.
seed = 54321

# Defining the value alone changes nothing: Python, NumPy and TensorFlow each
# keep their own random state, so seed all three explicitly. This makes weight
# initialisation and batch shuffling repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

2023-09-05 02:05:08.375837: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Local folder the dataset files are stored in.
dir_name = 'data'
# Remote folder the files are fetched from. The file name is appended to this
# string as-is, so the trailing '/' is required.
url = 'http://cogcomp.org/Data/QA/QC/'

def download_data(dir_name, filename, expected_bytes):
    """Download ``filename`` into ``dir_name`` if it is missing, then verify its size.

    The remote location is the module-level ``url`` with ``filename`` appended.
    A file that already exists locally is not downloaded again.

    Args:
        dir_name: Directory the file is stored in; created if it does not exist.
        filename: Name of the file, used both remotely and locally.
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size of the local file differs from ``expected_bytes``.
            The message carries both the expected and the actual size.
    """
    os.makedirs(dir_name, exist_ok=True)
    filepath = os.path.join(dir_name, filename)
    if not os.path.exists(filepath):
        # urlretrieve returns (local_path, headers); only the path is needed.
        filepath, _ = urlretrieve(url + filename, filepath)

    actual_bytes = os.stat(filepath).st_size
    if actual_bytes != expected_bytes:
        # Put both sizes into the error itself instead of printing a bare
        # number, so the failure is understandable from the traceback alone.
        raise Exception(
            'Failed to verify %s. Can you get to it with a browser? '
            '(expected %s bytes, found %s bytes)'
            % (filepath, expected_bytes, actual_bytes))

    print('Found and verified %s' % filepath)
    return filepath

# Fetch (or reuse) both dataset files; each call checks the file against the
# byte count given here.
train_filename = download_data(
    dir_name, filename='train_5500.label', expected_bytes=335858)
test_filename = download_data(
    dir_name, filename='TREC_10.label', expected_bytes=23354)

Found and verified data/train_5500.label
Found and verified data/TREC_10.label


In [3]:
def read_data(filename):
    """Read a question-classification file into three parallel lists.

    Each non-blank line is expected to look like
    ``<category>:<sub_category> <question text>``.

    Args:
        filename: Path of the file to read; decoded as latin-1.

    Returns:
        A tuple ``(questions, categories, sub_categories)`` of equally long
        lists. Questions are lower-cased and stripped of surrounding
        whitespace; categories and sub-categories are kept as written.

    Raises:
        IndexError: If a non-blank line contains no ':' separator.
    """
    questions, categories, sub_categories = [], [], []
    with open(filename, 'r', encoding='latin-1') as f:
        for row in f:
            # Tolerate blank lines (e.g. an empty line at the end of the file).
            if not row.strip():
                continue
            # Split on the FIRST ':' only, so a question that itself contains
            # a colon (e.g. "3:1") is not cut short.
            row_str = row.split(':', 1)
            cat, sub_cat_and_question = row_str[0], row_str[1]
            # The first space-separated word is the sub category, the rest
            # is the question.
            sub_cat, _, question = sub_cat_and_question.partition(' ')

            questions.append(question.lower().strip())
            categories.append(cat)
            sub_categories.append(sub_cat)
    return questions, categories, sub_categories

In [4]:
# Parse both files into parallel lists of questions, categories and
# sub-categories.
(train_questions,
 train_categories,
 train_sub_categories) = read_data(train_filename)
(test_questions,
 test_categories,
 test_sub_categories) = read_data(test_filename)

In [5]:
# One DataFrame per split: each row holds a question together with its
# category and sub-category.
train_df = pd.DataFrame(dict(
    question=train_questions,
    category=train_categories,
    sub_category=train_sub_categories,
))
test_df = pd.DataFrame(dict(
    question=test_questions,
    category=test_categories,
    sub_category=test_sub_categories,
))

In [6]:
# Shuffle the training rows: frac=1.0 draws every row exactly once, and
# random_state=seed makes the resulting order repeatable.
train_df = train_df.sample(frac=1.0, random_state=seed)

In [7]:
# Distinct category names found in the training data.
unique_cats = train_df['category'].unique()
# Pair every category name with a consecutive integer ID: 0, 1, ..., len - 1.
labels_map = dict(zip(unique_cats, np.arange(len(unique_cats))))
n_classes = len(labels_map)
print(f'Label->ID mapping: {labels_map}')

# Swap the string labels for their integer IDs in both splits.
test_df['category'] = test_df['category'].map(labels_map)
train_df['category'] = train_df['category'].map(labels_map)

Label->ID mapping: {'DESC': 0, 'ENTY': 1, 'LOC': 2, 'NUM': 3, 'HUM': 4, 'ABBR': 5}


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training questions for validation. Passing the shared
# seed makes the split identical on every run; without it each run would
# train and validate on different rows.
# NOTE: this rebinds train_df, so re-running this cell on its own splits the
# already reduced training set again.
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=seed)
print(f'train size:{train_df.shape}')
print(f'valid size:{valid_df.shape}')

train size:(4906, 3)
valid size:(546, 3)


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
# Build the word -> ID vocabulary from the training questions only; the
# validation and test questions are not shown to the tokenizer here.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['question'].tolist())

In [11]:
# Vocabulary size = number of words the tokenizer knows, plus one.
# NOTE(review): the extra slot is presumably for ID 0, which is not assigned
# to any word and is used for padding - confirm against the Tokenizer docs.
n_vocab = len(tokenizer.index_word) + 1
print(f"Vocabluary size: {n_vocab}")

Vocabluary size: 7917


In [12]:
# Turn every question of each split into a list of word IDs.
train_sequences, valid_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['question'].tolist())
    for split in (train_df, valid_df, test_df)
)

# Integer class IDs of each split, as plain arrays.
train_labels, valid_labels, test_labels = (
    split['category'].values
    for split in (train_df, valid_df, test_df)
)

In [13]:
from functools import partial

# Every question is brought to exactly this many word IDs.
max_seq_length = 22

# pad_sequences with its settings fixed once: shorter sequences are padded at
# the end, longer ones are cut at the end.
preprocessed_res = partial(
    tf.keras.preprocessing.sequence.pad_sequences,
    padding='post', truncating='post', maxlen=max_seq_length)

(preprocessed_train_sequences,
 preprocessed_valid_sequences,
 preprocessed_test_sequences) = (
    preprocessed_res(sequences)
    for sequences in (train_sequences, valid_sequences, test_sequences)
)

In [14]:
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model

In [15]:
# Reset Keras' global state before the model is built, so that re-running the
# cells below starts from a clean slate.
K.clear_session()

In [16]:
# Input layer: one row of max_seq_length integer word IDs per question.
word_id_inputs = layers.Input(shape=(max_seq_length,), dtype='int32')
# Look up a 64-dimensional embedding vector for every word ID.
# in [batch_size, sent_length] / out [batch_size, sent_length, 64]
embedding_out = layers.Embedding(input_dim=n_vocab, output_dim=64)(word_id_inputs)

2023-09-05 02:05:11.909873: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-05 02:05:12.015486: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-05 02:05:12.015862: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [17]:
# Three parallel convolutions over the same embeddings; they differ only in
# the window width (3, 4 and 5 words) and are created in that order.
# Each one: in [batch_size, sent_length, emb_size] /
#           out [batch_size, sent_length, 100]
conv1_1, conv1_2, conv1_3 = (
    layers.Conv1D(
        100, kernel_size=width, strides=1,
        padding='same', activation='relu',
    )(embedding_out)
    for width in (3, 4, 5)
)


In [18]:
# Stack the three convolution outputs along the last (feature) axis.
# in 3 x [batch_size, sent_length, 100] / out [batch_size, sent_length, 300]
conv_out = layers.Concatenate(axis=-1)([conv1_1, conv1_2, conv1_3])

In [19]:
# Pooling over time operation.
# The pooling window spans the whole sequence length, so the maximum is taken
# over all positions; in other words, each feature map results in a single
# output.
# in [batch_size, sent_length, 300] / out [batch_size, 1, 300]

pool_over_time_out = layers.MaxPool1D(pool_size=max_seq_length, 
                                      padding='valid')(conv_out)
# Flatten simply collapses all the dimensions (except the batch dimension)
# to a single dimension
flatten_out = layers.Flatten()(pool_over_time_out)

# Classification layer: one softmax probability per class, with an L2 penalty
# (weight 0.001) on the layer's kernel.
out = layers.Dense(n_classes, activation='softmax', 
                   kernel_regularizer=regularizers.l2(0.001))(flatten_out)

In [20]:
# Wrap the graph from the word-ID input to the softmax output into a model.
cnn_model = Model(inputs=word_id_inputs, outputs=out)

# The labels are integer class IDs rather than one-hot vectors, hence the
# "sparse" variant of categorical cross-entropy.
cnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Print the layers with their output shapes and parameter counts.
cnn_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 22, 64)       506688      ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 22, 100)      19300       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 22, 100)      25700       ['embedding[0][0]']              
                                                                                              

• monitor (str) – Which metric to monitor in order to decay the learning rate. We will
    monitor the validation loss <br>
• factor (float) – The factor the learning rate is multiplied by when it is reduced. For example, a factor of 0.1
    means that the learning rate will be divided by 10 (e.g. 0.01 will be stepped down
    to 0.001) <br>
• patience (int) – How many epochs to wait without an improvement, before reducing
    the learning rate <br>
• mode (str) – Whether an improvement means a decrease (‘min’) or an increase (‘max’) of the metric; ‘auto’ means
    that the direction will be determined by looking at the metric name <br>
• min_delta (float) – How much of an increase/decrease to consider as an improvement <br>
• min_lr (float) – Minimum learning rate (floor)<br>

In [21]:
# Multiply the learning rate by 0.1 whenever the validation loss has not
# improved by at least min_delta for 3 consecutive epochs; never go below
# min_lr. verbose=1 logs every reduction.
lr_reduce_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=3, verbose=1,
    mode='auto', min_delta=0.0001, min_lr=0.000001
)

# Train on the padded training sequences and evaluate on the validation split
# after every epoch, which also provides the val_loss the callback monitors.
cnn_model.fit(
    preprocessed_train_sequences, train_labels,
    validation_data=(preprocessed_valid_sequences, valid_labels),
    batch_size=128,
    epochs=25,
    callbacks=[lr_reduce_callback]
)

Epoch 1/25


2023-09-05 02:05:15.324893: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-09-05 02:05:16.390382: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x21962d50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-05 02:05:16.390429: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce GTX 1650, Compute Capability 7.5
2023-09-05 02:05:16.421708: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-05 02:05:16.700346: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 16: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 19: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 22: ReduceLROnPlateau reducing learning rate to 1e-06.
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f6b62af0910>

In [22]:
# Score the trained model on the held-out test set; return_dict=True yields
# the results keyed by name (loss and accuracy) instead of a bare list.
cnn_model.evaluate(preprocessed_test_sequences, test_labels, return_dict=True)



{'loss': 0.36357253789901733, 'accuracy': 0.8880000114440918}