# Step 3 - Classification of text data using recurrent neural networks (LSTM)

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Get the Data

In [2]:
# https://www.kaggle.com/adarshsng/predicting-job-type-category-by-job-description
def str_to_lst(x):
    """Turn the string form of a token list back into a list of strings.

    Every single quote and square bracket in `x` is removed, and what
    remains is split on the ", " separator.
    """
    without_markup = x.translate(str.maketrans("", "", "'[]"))
    return without_markup.split(", ")

In [3]:
# Read the preprocessed training set and run every job_description value
# through str_to_lst so the column holds lists of tokens instead of strings.
df = pd.read_csv("preprocessed_train.csv").assign(
    job_description=lambda frame: frame['job_description'].apply(str_to_lst)
)
df.head()

Unnamed: 0,job_no,job_description,job_type,category
0,Id-12765,"[zest, scientif, search, accomplish, scientif,...",Permanent,"Pharmaceutical, Healthcare and Medical Sales"
1,Id-1321,"[asha, mistri, umbil, life, recruit, experienc...",Contract/Interim,Pharmaceutical Marketing
2,Id-9473,"[sale, repres, laser, der, medizintechnik, mun...",Permanent,"Pharmaceutical, Healthcare and Medical Sales"
3,Id-14952,"[field, servic, engin, life, scienc, cambridg,...",Permanent,Manufacturing & Operations
4,Id-16692,"[due, increas, busi, demand, award, win, world...",Permanent,"Pharmaceutical, Healthcare and Medical Sales"


### Job Type

In [4]:
# Job-type name -> integer class id; ids follow the order of the names below.
Job_type = {
    name: class_id
    for class_id, name in enumerate((
        'Permanent',
        'Contract/Interim',
        'Contract/Temp',
        'Temporary/Seasonal',
        'Any',
        'Part-Time',
    ))
}


def set_label(x):
    """Return the integer class id registered in `Job_type` for job type `x`.

    Raises KeyError when `x` is not one of the known job-type names.
    """
    return Job_type[x]


# Encode every job_type string as its integer class id in a new `labels` column.
df['labels'] = df['job_type'].apply(set_label)
df.head()

Unnamed: 0,job_no,job_description,job_type,category,labels
0,Id-12765,"[zest, scientif, search, accomplish, scientif,...",Permanent,"Pharmaceutical, Healthcare and Medical Sales",0
1,Id-1321,"[asha, mistri, umbil, life, recruit, experienc...",Contract/Interim,Pharmaceutical Marketing,1
2,Id-9473,"[sale, repres, laser, der, medizintechnik, mun...",Permanent,"Pharmaceutical, Healthcare and Medical Sales",0
3,Id-14952,"[field, servic, engin, life, scienc, cambridg,...",Permanent,Manufacturing & Operations,0
4,Id-16692,"[due, increas, busi, demand, award, win, world...",Permanent,"Pharmaceutical, Healthcare and Medical Sales",0


### Vectorization

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['job_description'],
    df['labels'],
    test_size=0.20,
    random_state=42,
)

In [6]:
# Flat list of every token occurrence in the corpus.
tokens = [token for lst in df["job_description"] for token in lst]

# Vocabulary of distinct tokens. Sorting pins the token -> id assignment:
# the iteration order of a set of strings changes between interpreter runs
# (string hashes are randomized), which made the ids irreproducible.
dictionary = sorted(set(tokens))

# Reverse lookup so encoding one token is a dict access instead of a linear
# scan through `dictionary`.
token_index = {token: idx for idx, token in enumerate(dictionary)}


def tokens_to_num(lst):
    """Map a list of tokens to their integer ids (positions in `dictionary`).

    Parameters
    ----------
    lst : iterable of str
        Tokens of one document.

    Returns
    -------
    list of int
        One id per token, in the same order as `lst`.

    Raises
    ------
    ValueError
        If a token is not part of the vocabulary (same exception type that
        `list.index` raised before the lookup table was introduced).
    """
    try:
        return [token_index[token] for token in lst]
    except KeyError as err:
        raise ValueError(f"{err.args[0]!r} is not in list") from err

In [7]:
# Replace each token list by the corresponding list of integer token ids.
# Note: this overwrites X_train / X_test, so the cell must run exactly once
# after the train/test split.
X_train, X_test = X_train.apply(tokens_to_num), X_test.apply(tokens_to_num)

In [8]:
def vectorize_sequences(sequences, dimension = 15000):
    """Multi-hot encode sequences of integer indices.

    Returns a float array of shape (len(sequences), dimension) in which
    row i holds 1.0 at every index that occurs in sequences[i] and 0.0
    everywhere else. An index outside the `dimension` range raises
    IndexError.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, indices in zip(encoded, sequences):
        # `row` is a view into `encoded`, so this writes into the result.
        row[list(indices)] = 1.
    return encoded

In [9]:
# An Embedding + LSTM model consumes *sequences of integer token ids*, one id
# per timestep. A multi-hot bag-of-words matrix is the wrong input for it: the
# Embedding layer would only ever look up ids 0 and 1, and the LSTM would have
# to unroll over one timestep per vocabulary entry. So the id sequences are
# padded / truncated to a common length instead.
MAX_SEQUENCE_LENGTH = 200  # timesteps kept per description; tune to the corpus


def _to_padded_ids(sequences, maxlen=MAX_SEQUENCE_LENGTH):
    """Shift ids by +1 (0 is reserved for padding) and pad to `maxlen`.

    Uses the pad_sequences defaults: short sequences are padded at the front,
    long ones are truncated at the front.
    """
    shifted = [[idx + 1 for idx in seq] for seq in sequences]
    return keras.preprocessing.sequence.pad_sequences(shifted, maxlen=maxlen)


x_train = _to_padded_ids(X_train)
x_test = _to_padded_ids(X_test)
y_train = np.asarray(Y_train).astype("float32")
y_test = np.asarray(Y_test).astype("float32")

### Train a recurrent neural network

In [10]:
# The target is one of len(Job_type) mutually exclusive classes (ids 0..5),
# so the network needs one softmax output per class and a categorical loss.
# A single sigmoid unit trained with MSE can only output values in [0, 1] and
# therefore can never predict class ids 2..5.
NUM_CLASSES = len(Job_type)

# One embedding row per vocabulary entry, plus one spare row so that id 0 can
# be reserved for padding.
VOCAB_SIZE = len(dictionary) + 1
EMBEDDING_DIM = 128

model = keras.Sequential()
model.add(layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(layers.LSTM(32))
model.add(layers.Dense(NUM_CLASSES, activation = 'softmax'))

# Labels are plain integer class ids (not one-hot), hence the *sparse* loss.
model.compile(optimizer = "rmsprop",
              loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"])

In [None]:
# Stop training once val_loss has failed to improve by at least 1e-4 for
# three epochs in a row.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

# 10% of the training data is held back for validation during fitting.
history = model.fit(
    x_train,
    y_train,
    batch_size = 128,
    epochs = 5,
    validation_split = 0.1,
    shuffle = True,
    verbose = 1,
    callbacks = [early_stopping],
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(x_test, y_test, verbose=0)
for caption, value in zip(('Test score:', 'Test accuracy:'), score):
    print(caption, value)