# Hacker News Title Classification


The original blog is on google

https://cloud.google.com/blog/products/ai-machine-learning/ai-in-depth-creating-preprocessing-model-serving-affinity-with-custom-online-prediction-on-ai-platform-serving

* We are going to train and build a title text classification model using kera and the sequence model method. 
* We will then embed the trained model in a fastapi prediction server so that we can send classification jobs using plain old http requests


In [1]:
# Download the dataset to local directory using gsutil
#!gsutil cp -r gs://cloud-training-demos/blogs/CMLE_custom_prediction data

In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
import tensorflow as tf
import pickle

In [2]:
train = pd.read_csv('data/train.tsv', sep='\t')
print(train.shape)
train = shuffle(train, random_state=22)
train.head()

(72161, 2)


Unnamed: 0,github,feminist-software-foundation complains about reddit on a github pull request
70983,techcrunch,nintendo 37 million wii consoles sold in the ...
39988,nytimes,nsa leaks revive russian push for internet reg...
66705,techcrunch,lg to develop youtube phone - keeping up with ...
40833,nytimes,venezuela s inflated vision of beauty video
28520,nytimes,stubborn skills gap in america s work force


### Load train and eval data and one-hot encode the labels

In [3]:
def load_data(train_file, eval_file):
    encoder = OneHotEncoder()
    
    train_set = pd.read_csv('data/{}'.format(train_file), sep='\t')
    train_set.columns = ['label', 'title']
    train_set = shuffle(train_set, random_state=22)
    train_hot = encoder.fit_transform(train_set[['label']]).toarray()
    
    eval_set = pd.read_csv('data/{}'.format(eval_file), sep='\t')
    eval_set.columns = ['label', 'title']
    eval_hot = encoder.transform(eval_set[['label']]).toarray()
    
    return (train_set['title'].values, train_hot), (eval_set['title'].values, eval_hot)

(train_texts, train_labels), (eval_texts, eval_labels) = load_data('train.tsv', 'eval.tsv')
print('text: %s' % train_texts[0])
print('label: %s' % train_labels[0])
print(len(train_texts), len(train_labels))

print('text: %s' % eval_texts[0])
print('label: %s' % eval_labels[0])
print(len(eval_texts), len(eval_labels))

text: nintendo  37 million wii consoles sold in the u.s. to date
label: [0. 0. 1.]
72161 72161
text: geoip module on nodejs now is a c   addon
label: [1. 0. 0.]
24040 24040


In [4]:
class TextPreprocessor(object):
 def __init__(self, vocab_size, max_sequence_length):
   self._vocab_size = vocab_size
   self._max_sequence_length = max_sequence_length
   self._tokenizer = None

 def fit(self, text_list):       
   # Create vocabulary from input corpus.
   tokenizer = text.Tokenizer(num_words=self._vocab_size)
   tokenizer.fit_on_texts(text_list)
   self._tokenizer = tokenizer

 def transform(self, text_list):       
   # Transform text to sequence of integers
   text_sequence = self._tokenizer.texts_to_sequences(text_list)

   # Fix sequence length to max value. Sequences shorter than the length are
   # padded in the beginning and sequences longer are truncated
   # at the beginning.
   padded_text_sequence = sequence.pad_sequences(text_sequence, maxlen=self._max_sequence_length)
   return padded_text_sequence

In [5]:
VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 50

((train_texts, train_labels), (eval_texts, eval_labels)) = load_data( 'train.tsv', 'eval.tsv')

# Create vocabulary from training corpus.
processor = TextPreprocessor(VOCAB_SIZE, MAX_SEQUENCE_LENGTH)
processor.fit(train_texts)

# Preprocess the data
train_texts_vectorized = processor.transform(train_texts)
eval_texts_vectorized = processor.transform(eval_texts)

In [6]:
print(train_texts_vectorized[5])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   34 3300  680 1727    4  543   64]


In [7]:
train_labels.shape

(72161, 3)

### Save the text processor state

In [10]:
with open('./processor_state.pkl', 'wb') as f:
 pickle.dump(processor, f, protocol=4)

### Create some models

In [10]:
def create_model_1(vocab_size, embedding_dim, filters, kernel_size, dropout_rate, pool_size, classes):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                        output_dim=embedding_dim,
                                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    model.add(tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size,  activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=pool_size))
    model.add(tf.keras.layers.Conv1D(filters=filters * 2, kernel_size=kernel_size, activation='relu'))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    model.add(tf.keras.layers.Dense(len(CLASSES), activation='softmax'))
    return model

In [11]:
def create_model_2(vocab_size, embedding_dim, filters, kernel_size, dropout_rate, pool_size, classes):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size, 
                                        output_dim=embedding_dim, 
                                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(3, activation='sigmoid'))
    return model

In [12]:
def create_model_3(vocab_size, embedding_dim, filters, kernel_size, dropout_rate, pool_size, classes):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=vocab_size, 
                                        output_dim=embedding_dim, 
                                        input_length=MAX_SEQUENCE_LENGTH))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(3, activation='softmax'))
    return model

### Train and save the model

In [13]:
LEARNING_RATE = .001
EMBEDDING_DIM = 200
FILTERS = 64
KERNEL_SIZE = 3
DROPOUT_RATE = 0.2
POOL_SIZE = 3
CLASSES = ['github', 'nytimes', 'techcrunch']

model = create_model_1(VOCAB_SIZE, EMBEDDING_DIM, FILTERS, KERNEL_SIZE, DROPOUT_RATE,POOL_SIZE, CLASSES)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 200)           4000000   
_________________________________________________________________
dropout (Dropout)            (None, 50, 200)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 48, 64)            38464     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 16, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 14, 128)           24704     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

2022-04-28 10:04:47.661853: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-28 10:04:47.661936: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-28 10:04:47.661969: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a58774662d82): /proc/driver/nvidia/version does not exist
2022-04-28 10:04:47.662373: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, 
              loss='categorical_crossentropy',
              metrics=['acc'])

In [15]:
print(train_labels.shape)
print(train_texts_vectorized.shape)
print(train_labels[0:5])

(72161, 3)
(72161, 50)
[[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [16]:
NUM_EPOCH=1
BATCH_SIZE=128

model.fit(train_texts_vectorized, train_labels, epochs=NUM_EPOCH, batch_size=BATCH_SIZE)
print('Eval loss/accuracy:{}'.format(model.evaluate(eval_texts_vectorized, eval_labels, batch_size=BATCH_SIZE)))

2022-04-28 10:04:47.852544: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Eval loss/accuracy:[0.39317235350608826, 0.8423044681549072]


In [17]:
model.save('saved_model')

2022-04-28 10:05:22.521166: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: saved_model/assets
