# Step 3 - Classification of text data using recurrent neural networks (LSTM)

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Job Type

In [None]:
def str_to_num_lst(x):
    """Decode a stringified list of integers back into a list of ints.

    Quotes and square brackets are stripped, the remainder is split on
    commas, and every non-blank token is converted with ``int``.

    Args:
        x: Text such as ``"[1, 2, 3]"`` or ``"['1', '2']"``.

    Returns:
        The decoded integers, or an empty list when ``x`` holds no tokens
        (e.g. ``"[]"``).

    Raises:
        ValueError: If a non-blank token is not a valid integer literal.
    """
    stripped = x.replace("'", "").replace("[", "").replace("]", "")
    # Split on the bare comma (int() tolerates surrounding whitespace) and skip
    # blank tokens so that "[]" yields [] instead of failing on int("").
    return [int(token) for token in stripped.split(",") if token.strip()]

In [4]:
# Job-type training split: decode the stringified token lists in
# `job_description`, then pull out the sequences and the `y` column.
train_df = pd.read_csv("data/tmp_type_train.csv")
train_df['job_description'] = train_df['job_description'].apply(str_to_num_lst)
X_train, Y_train = train_df["job_description"], train_df["y"]
y_train = Y_train.to_numpy().astype("float32")  # labels as a float32 array

# Job-type test split, processed exactly like the training split.
test_df = pd.read_csv("data/tmp_type_test.csv")
test_df['job_description'] = test_df['job_description'].apply(str_to_num_lst)
X_test, Y_test = test_df["job_description"], test_df["y"]
y_test = Y_test.to_numpy().astype("float32")

### Train a recurrent neural network

In [5]:
max_seq_len = 80      # every sequence is padded/truncated to this many tokens
max_features = 30000  # first argument of layers.Embedding in the model cell

# Only `maxlen` is given, so pad_sequences uses its default padding and
# truncation behaviour for both splits.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_seq_len)
    for sequences in (X_train, X_test)
)

In [6]:
emb_dim = 128  # width of each embedding vector

# Embedding -> LSTM(128) -> single sigmoid unit, declared as one layer list.
model_type = keras.Sequential([
    layers.Embedding(max_features, emb_dim),
    layers.LSTM(128),
    layers.Dense(1, activation='sigmoid'),
])

In [7]:
# NOTE(review): MSE on a single sigmoid output is kept as-is. If `y` is strictly
# 0/1, "binary_crossentropy" is the usual loss for this head — confirm the label
# values before switching.
model_type.compile(optimizer = "rmsprop",
              loss = "mse",
              metrics = ["accuracy"])

# EarlyStopping watches val_loss by default. restore_best_weights=True makes the
# model keep the weights of the best validation epoch rather than whatever the
# last epoch produced before training stopped.
my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2,
                                                 restore_best_weights=True)]

# 20% of the training data is held out as the validation set that drives
# early stopping.
history = model_type.fit(x_train,
                    y_train,
                    epochs = 5,
                    batch_size = 128,
                    validation_split = 0.2,
                    shuffle = True,
                    verbose = 1,
                    callbacks = my_callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
# summary() prints the layer table itself and returns None; wrapping it in
# print() only added a stray "None" line after the table.
model_type.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         3840000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 3,971,713
Trainable params: 3,971,713
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# evaluate() returns [loss, accuracy], matching the loss and the single
# "accuracy" metric the model was compiled with.
score = model_type.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 0.12206569314002991
Test accuracy: 0.9283692240715027


### Job Category

In [13]:
# Read both job-category splits. This rebinds the train/test names that the
# job-type section used earlier.
train_df = pd.read_csv("data/tmp_category_train.csv")
test_df = pd.read_csv("data/tmp_category_test.csv")

# `job_description` holds stringified token lists; decode them to lists of ints.
train_df['job_description'] = train_df['job_description'].apply(str_to_num_lst)
test_df['job_description'] = test_df['job_description'].apply(str_to_num_lst)

# Token sequences (inputs).
X_train = train_df["job_description"]
X_test = test_df["job_description"]

# Targets from the `y` column, plus float32 array copies of them.
Y_train = train_df["y"]
Y_test = test_df["y"]
y_train = np.asarray(Y_train).astype("float32")
y_test = np.asarray(Y_test).astype("float32")

In [21]:
max_seq_len = 80      # fixed sequence length after padding/truncation
max_features = 20000  # first argument of layers.Embedding in the model cell

# Apply the same pad_sequences call (defaults apart from maxlen) to both splits.
x_train, x_test = map(
    lambda sequences: keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_seq_len),
    (X_train, X_test),
)

In [22]:
emb_dim = 128

model_category = keras.Sequential()
model_category.add(layers.Embedding(max_features, emb_dim))
model_category.add(layers.LSTM(128))
model_category.add(layers.Dense(1, activation = 'sigmoid'))

In [23]:
model_category.compile(optimizer = "rmsprop",
              loss = "mse",
              metrics = ["accuracy"])


my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]

history = model_category.fit(x_train,
                    y_train,
                    epochs = 5,
                    batch_size = 128,
                    validation_split = 0.2,
                    shuffle = True,
                    verbose = 1,
                    callbacks = my_callbacks)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [24]:
# summary() prints the layer table itself and returns None; wrapping it in
# print() only added a stray "None" line after the table.
model_category.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 128)         2560000   
                                                                 
 lstm_4 (LSTM)               (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
# evaluate() returns the compiled loss followed by the "accuracy" metric.
score = model_category.evaluate(x=x_test, y=y_test, verbose=0)
loss_value, accuracy_value = score[0], score[1]
print('Test score:', loss_value)
print('Test accuracy:', accuracy_value)

Test score: 10.039920806884766
Test accuracy: 0.18261608481407166
