# Step 4 - Text classification with Transformer

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Job Type

In [2]:
def str_to_num_lst(x):
    x = x.replace("'", "").replace("[", "").replace("]", "").split(", ")
    return list(map(int, x))

In [3]:
train_df = pd.read_csv("data/tmp_type_train.csv")
train_df['job_description'] = train_df['job_description'].apply(lambda x: str_to_num_lst(x))
X_train = train_df["job_description"]
Y_train = train_df["y"]
y_train = np.asarray(Y_train).astype("float32")

test_df = pd.read_csv("data/tmp_type_test.csv")
test_df['job_description'] = test_df['job_description'].apply(lambda x: str_to_num_lst(x))
X_test = test_df["job_description"]
Y_test = test_df["y"]
y_test = np.asarray(Y_test).astype("float32")

In [4]:
max_seq_len = 80
max_features = 30000 

x_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen = max_seq_len)
x_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen = max_seq_len)

In [5]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [6]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [7]:
my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]

embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_seq_len,))
embedding_layer = TokenAndPositionEmbedding(max_seq_len, max_features, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model_type = keras.Model(inputs=inputs, outputs=outputs)
model_type.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"])
history = model_type.fit(
    x_train, 
    y_train, 
    batch_size=32, 
    epochs=3, 
    validation_data=(x_test, y_test),
    callbacks = my_callbacks
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
history_dict = history.history
history_dict.keys()
history_dict = history.history
loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, "bo", label="Training loss")
plt.plot(epochs, val_loss_values, "b", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [8]:
score = model_type.evaluate(x_test, y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Test score: 0.3587202727794647
Test accuracy: 0.08408833295106888


### Job Category

In [9]:
train_df = pd.read_csv("data/tmp_category_train.csv")
train_df['job_description'] = train_df['job_description'].apply(lambda x: str_to_num_lst(x))
X_train = train_df["job_description"]
Y_train = train_df["y"]
y_train = np.asarray(Y_train).astype("float32")

test_df = pd.read_csv("data/tmp_category_test.csv")
test_df['job_description'] = test_df['job_description'].apply(lambda x: str_to_num_lst(x))
X_test = test_df["job_description"]
Y_test = test_df["y"]
y_test = np.asarray(Y_test).astype("float32")

In [10]:
max_seq_len = 80
max_features = 30000 # 15000

x_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen = max_seq_len)
x_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen = max_seq_len)

In [11]:
my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]

embed_dim = 32  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_seq_len,))
embedding_layer = TokenAndPositionEmbedding(max_seq_len, max_features, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model_category = keras.Model(inputs=inputs, outputs=outputs)
#model_category.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"])
#model_category.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model_category.fit(
    x_train, 
    y_train, 
    batch_size=32, 
    epochs=3, 
    validation_data=(x_test, y_test),
    callbacks = my_callbacks
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
history_dict = history.history
history_dict.keys()
history_dict = history.history
loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, "bo", label="Training loss")
plt.plot(epochs, val_loss_values, "b", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [12]:
score = model_category.evaluate(x_test, y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Test score: 12.088051795959473
Test accuracy: 0.25226500630378723
