# Library

In [None]:
pip install pyvi



In [None]:
pip install tensorflow-gpu==1.15.0



In [None]:
pip install keras==2.2.4



In [None]:
pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-rx1mpx6s
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-rx1mpx6s


In [None]:
pip install vncorenlp



In [None]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2021-11-15 04:08:09--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2021-11-15 04:08:09 (165 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2021-11-15 04:08:09--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

# Hyperparameters

In [None]:
# Intended training settings.
# NOTE(review): confirm the training cell actually reads EPOCH and BATCH_SIZE.
EPOCH = 10
# Padded sequence length; copied into `max_len` for padding and the model input shape.
MAX_LEN = 80
BATCH_SIZE = 64
# Highest `word_to_index` id that gets a row filled in the embedding matrix.
MAX_FEATURE = 10000
# Width of each embedding vector (copied into `embedding_dim`).
EMBEDING_DIM = 300
# presumably the number of job-type labels — verify against the label file
NUM_LABEL = 68
# Segmenter selected inside custom_tokenizer: 'vncorenlp', 'none' (no
# segmentation), or any other value for pyvi.
TOKENIZER = 'vncorenlp'

# Text file of pre-trained word vectors (one "word v1 v2 ..." entry per line).
# NOTE(review): this path and MODEL_PATH use 'My Drive' while the dataset paths
# below use 'MyDrive' — confirm both spellings resolve on the mounted drive.
EMBEDDING = 'drive/My Drive/CODE/JobPrediction/embedding/word2vec_vi_words_300dims.txt'

# CSV files for the train / dev / test splits.
TRAIN = 'drive/MyDrive/CODE/JobPrediction/dataset/raw_data_new/train.csv'
DEV = 'drive/MyDrive/CODE/JobPrediction/dataset/raw_data_new/dev.csv'
TEST = 'drive/MyDrive/CODE/JobPrediction/dataset/raw_data_new/test.csv'

# CSV whose 'job_type' column lists the label names.
LABEL = 'drive/MyDrive/CODE/JobPrediction/dataset/raw_data_new/labels.csv'

# Where the trained model is saved.
MODEL_PATH = 'drive/My Drive/CODE/JobPrediction/model_new/single/PhoW2V_300.h5'

# Name of the dataframe column used as the model's input text.
TASK = 'job_description'

In [None]:
# Mount Google Drive at /content/drive. The relative 'drive/...' paths in the
# configuration presumably resolve from the Colab working directory /content.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Evaluation metric

In [None]:
# Evaluation metric
import sys
import os
import os.path
from scipy.stats import sem
import numpy as np
from ast import literal_eval
import tensorflow as tf

def em_score(y_true, y_pred):
    """Exact-match ratio for multi-label predictions.

    A sample counts as correct only when every label position of its
    prediction equals the ground truth.

    Args:
        y_true: 2-D array-like (samples x labels) of ground-truth indicators.
        y_pred: 2-D array-like of the same shape with predicted indicators.

    Returns:
        Fraction of samples whose whole label row matches.
    """
    # Coerce to arrays first: comparing plain Python lists with `==` yields a
    # single bool (or raises when the lists hold arrays), which makes the
    # axis=1 reduction below fail.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    MR = np.all(y_pred == y_true, axis=1).mean()
    return MR

def accuracy_score(y_true, y_pred):
    """Example-based multi-label accuracy (mean per-sample Jaccard index).

    For each sample the score is |true AND pred| / |true OR pred|; the result
    is the mean of these scores over all samples.

    Args:
        y_true: 2-D array-like (samples x labels) of ground-truth indicators.
        y_pred: 2-D array-like of the same shape with predicted indicators.

    Returns:
        Mean score as a float. A sample whose truth and prediction are both
        empty contributes 0 (the same convention f1_score uses) instead of
        turning the whole result into NaN through 0/0. Empty input returns 0.0.
    """
    if len(y_true) == 0:
        return 0.0

    truth = np.asarray(y_true).astype(bool)
    pred = np.asarray(y_pred).astype(bool)

    intersection = np.logical_and(truth, pred).sum(axis=1)
    union = np.logical_or(truth, pred).sum(axis=1)

    # Divide only where the union is non-empty; the other rows keep their 0.
    scores = np.zeros(len(truth), dtype=float)
    np.divide(intersection, union, out=scores, where=union > 0)
    return float(scores.mean())

 
def f1_score(y_true, y_pred):
    """Example-based F1 for multi-label predictions.

    Per sample: 2 * |true AND pred| / (|true| + |pred|). Samples where both
    the truth and the prediction are empty add nothing to the total but are
    still counted in the denominator.

    Args:
        y_true: sequence of per-sample ground-truth indicator rows.
        y_pred: sequence of per-sample predicted indicator rows.

    Returns:
        Sum of per-sample F1 values divided by the number of samples.
    """
    n_samples = len(y_true)
    running_total = 0
    for idx in range(n_samples):
        truth_row, pred_row = y_true[idx], y_pred[idx]
        n_true, n_pred = sum(truth_row), sum(pred_row)
        if n_true == 0 and n_pred == 0:
            continue
        overlap = sum(np.logical_and(truth_row, pred_row))
        running_total += (2 * overlap) / (n_true + n_pred)
    return running_total / n_samples

# Data loader

In [None]:
import pandas as pd

# Read the three dataset splits, in train / dev / test order.
train, dev, test = (pd.read_csv(csv_path) for csv_path in (TRAIN, DEV, TEST))

# Table that lists the label names.
label = pd.read_csv(LABEL)

In [None]:
# Label names from the 'job_type' column. Their order defines the position of
# each label in the vectors built by make_label and read by return_label.
job_types = label['job_type'].values

In [None]:
import numpy as np

def make_label(data):
    """Turn the 'job' column of `data` into multi-hot label vectors.

    Position i of a vector is 1 when `job_types[i] in <job cell>` holds and 0
    otherwise. NOTE(review): if the 'job' cells are strings this is a
    substring test, so a label name contained in another label name would
    also match — confirm the label set makes that impossible.

    Args:
        data: object supporting data['job'].values (e.g. a DataFrame).

    Returns:
        List with one float array of length len(job_types) per row.
    """
    return [
        np.array([1.0 if job_name in job_field else 0.0 for job_name in job_types])
        for job_field in data['job'].values
    ]

def return_label(y):
    """Map a multi-hot vector back to label names.

    Args:
        y: sequence of indicators, positionally aligned with `job_types`.

    Returns:
        List of the `job_types` entries whose indicator equals 1.
    """
    return [job_types[pos] for pos, flag in enumerate(y) if flag == 1]

In [None]:
# Number of rows in the training split.
len(train)

20234

In [None]:
# Number of rows in the dev split.
len(dev)

1760

In [None]:
# Number of rows in the test split.
len(test)

3933

# Word embedding 

In [None]:
# Read embedding
# Fills two structures from the text embedding file:
#   word_dict        - list of words, in file order
#   embeddings_index - word -> float32 vector
# Only lines that carry a complete `embedding_dim`-sized vector are kept, so
# both structures stay in sync. That excludes a "<vocab_size> <dim>" header
# line, if the file has one, and any malformed row.
word_dict = []
embeddings_index = {}
embedding_dim = EMBEDING_DIM
max_feature = MAX_FEATURE

# Counted and reported below instead of being swallowed silently.
skipped_lines = 0

# `with` closes the file even if parsing fails part-way. The explicit encoding
# avoids depending on the platform default for non-ASCII words.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(EMBEDDING, encoding='utf-8') as f:
    for line in f:
        # rstrip() drops the trailing newline so the last number parses cleanly.
        values = line.rstrip().split(' ')
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            skipped_lines += 1
            continue
        if coefs.shape[0] != embedding_dim:
            skipped_lines += 1
            continue
        word_dict.append(word)
        embeddings_index[word] = coefs

print('Embedding data loaded')
if skipped_lines:
    print('Skipped %d line(s) without a %d-dimensional vector' % (skipped_lines, embedding_dim))

Embedding data loaded


In [None]:
words = word_dict
num_words = len(words)

# word -> integer id. Ids 0 and 1 are reserved for the padding and
# unknown-word markers, so vocabulary words are numbered from 2 upwards.
word_to_index = dict(zip(words, range(2, num_words + 2)))
word_to_index.update({"UNK": 1, "PAD": 0})

# Reverse lookup: integer id -> word
idx2word = dict((index, word) for word, index in word_to_index.items())

In [None]:
# Embedding matrix indexed by the ids of `word_to_index`.
# Those ids run from 0 (PAD) and 1 (UNK) up to num_words + 1, so the matrix
# needs num_words + 2 rows. This is also the `input_dim` the Embedding layer
# is built with; allocating only num_words rows left the last two ids without
# a row. float32 halves the memory of this large matrix.
embedding_matrix = np.zeros((num_words + 2, embedding_dim), dtype='float32')

# Fill the rows of the first `max_feature` ids; rows beyond that stay zero.
for word, i in word_to_index.items():
    if i > max_feature:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
        # pre-trained vector available - copy it into the word's row
        embedding_matrix[i] = embedding_vector
    else:
        # no (complete) pre-trained vector, e.g. PAD / UNK: random initialisation
        embedding_matrix[i] = np.random.randn(embedding_dim)

# Pre-process

In [None]:
# Multi-hot targets for each split, in train / dev / test order.
y_train, y_dev, y_test = (make_label(split) for split in (train, dev, test))

# Input texts: the column selected by TASK from each split.
X_train, X_dev, X_test = (split[TASK] for split in (train, dev, test))

In [None]:
from pyvi import ViTokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical
from vncorenlp import VnCoreNLP
import re

# Start VnCoreNLP from the local jar with only the word-segmentation ("wseg")
# annotator; '-Xmx500m' is passed as the maximum JVM heap size.
vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

# Sequence length used when padding the encoded texts.
max_len = MAX_LEN

def custom_tokenizer(text_data, tokenizer='pyvi'):
    """Word-segment `text_data` with the selected tokenizer.

    Args:
        text_data: text to segment; coerced with str() for the two real
            segmenters.
        tokenizer: 'vncorenlp' to use the module-level VnCoreNLP instance,
            'none' to return `text_data` untouched, anything else for pyvi.

    Returns:
        The segmented text as one space-separated string (or `text_data`
        itself for 'none').
    """
    if tokenizer == 'vncorenlp':
        # tokenize() yields one token list per sentence. Join the tokens of
        # each sentence AND the sentences themselves with a space, so that
        # the last token of one sentence is not glued to the first token of
        # the next.
        sentences = vncorenlp.tokenize(str(text_data))
        return " ".join(" ".join(tokens) for tokens in sentences)
    if tokenizer == 'none':
        return text_data
    return ViTokenizer.tokenize(str(text_data))

def encoding(X, y, tokenizer = True):
    """Clean, segment and integer-encode texts, padded to `max_len`.

    Args:
        X: iterable of raw texts. Entries are coerced with str(), so
            non-string cells (e.g. NaN from a dataframe) do not crash the
            regex clean-up.
        y: labels; returned unchanged.
        tokenizer: fitted object exposing `texts_to_sequences` (a Keras
            Tokenizer). The default `True` is kept only for signature
            compatibility; a real tokenizer must be passed.

    Returns:
        (X, y) where X is the output of `pad_sequences(..., maxlen=max_len)`.
    """
    sentences = []

    for t in X:
        # Strip punctuation before segmentation.
        t = re.sub(r"[-()\"#/@;:<>{}`+=~|!?,]", "", str(t))
        sentences.append(custom_tokenizer(t, tokenizer=TOKENIZER))

    # NOTE(review): the ids produced here follow the numbering of the
    # tokenizer passed in — confirm that the embedding weights used by the
    # model follow the same numbering.
    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X, maxlen=max_len)

    return (X,y)


def decoding(text_data, encoding_text, prediction):
    """Map flagged tokens back to character offsets in the original texts.

    Args:
        text_data: sequence of original strings.
        encoding_text: per-text sequences of ids (keys of `idx2word`).
        prediction: per-text sequences aligned with `encoding_text`; a token
            is selected when its entry equals 1.

    Returns:
        One list per text holding the character offsets covered by every
        selected token. Offsets come from `str.find`, i.e. the FIRST
        occurrence of the token. Tokens that do not occur in the text are
        skipped, rather than contributing the bogus offsets -1, 0, 1, ...
        that a failed find would otherwise generate.
    """
    tokens_per_text = [[idx2word[i] for i in row] for row in encoding_text]

    # Keep only the tokens whose prediction flag is 1.
    lst_token = []
    for t, row_tokens in enumerate(tokens_per_text):
        lst_token.append(
            [token for pos, token in enumerate(row_tokens) if prediction[t][pos] == 1]
        )

    lis_idx = []
    for i in range(len(text_data)):
        idx = []
        for token in lst_token[i]:
            start = text_data[i].find(token)
            if start == -1:
                # token not present verbatim in the raw text
                continue
            # max(..., 1) keeps the single offset previously emitted for an
            # empty token.
            idx.extend(range(start, start + max(len(token), 1)))
        lis_idx.append(idx)

    return lis_idx

Using TensorFlow backend.


In [None]:
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the SAME text form that encoding() later feeds to
# texts_to_sequences: punctuation stripped, then word-segmented with the
# configured TOKENIZER. Fitting on the raw strings instead would leave the
# segmenter's multi-syllable tokens (joined with "_", which the filter below
# deliberately keeps) out of the vocabulary, and texts_to_sequences silently
# drops unknown tokens.
# Cost: the training texts are segmented once here and once more in encoding().
tokenizer = Tokenizer(lower=False, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(
    custom_tokenizer(re.sub(r"[-()\"#/@;:<>{}`+=~|!?,]", "", str(text)), tokenizer=TOKENIZER)
    for text in X_train
)

In [None]:
# Encode every split with the tokenizer fitted on the training texts:
# (X1, y1) = train, (X2, y2) = dev, (X3, y3) = test.
X1, y1 = encoding(X=X_train, y=y_train, tokenizer=tokenizer)
X2, y2 = encoding(X=X_dev, y=y_dev, tokenizer=tokenizer)
X3, y3 = encoding(X=X_test, y=y_test, tokenizer=tokenizer)

# Model

In [None]:
from keras.layers import LSTM, Dense, Concatenate, Embedding, Bidirectional, GlobalMaxPooling1D, Dropout, Reshape, GRU, SpatialDropout1D, Conv1D, GlobalAveragePooling1D
from keras.models import Model, Input
from keras.utils import plot_model
from tensorflow.python.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant

from tensorflow.keras.losses import BinaryCrossentropy

import warnings
warnings.filterwarnings("ignore")

units = 100

# --- Embedding weights aligned with the Keras tokenizer ----------------------
# The network consumes the ids produced by `tokenizer.texts_to_sequences`, so
# row i of the embedding weights must hold the vector of the word whose
# tokenizer id is i. `embedding_matrix` is ordered by `word_to_index` (the
# order of the embedding file), which is a different numbering, so the weights
# are rebuilt here from `embeddings_index` using the tokenizer's own ids.
vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding id
tokenizer_embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
for token, token_id in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is not None and vector.shape == (embedding_dim,):
        tokenizer_embedding_matrix[token_id] = vector
    else:
        # no pre-trained vector: random initialisation, as for the original matrix
        tokenizer_embedding_matrix[token_id] = np.random.randn(embedding_dim)

# `inputs` rather than `input`, so the builtin input() is not shadowed.
inputs = Input(shape = (max_len,))
emb = Embedding(input_dim=vocab_size,
                output_dim=embedding_dim,
                embeddings_initializer=Constant(tokenizer_embedding_matrix),
                input_length=max_len,
                trainable=True)(inputs)
x1 = SpatialDropout1D(0.2)(emb)

# Two parallel recurrent branches (GRU and LSTM) over the same embeddings,
# each followed by a width-2 convolution.
x = Bidirectional(GRU(units, return_sequences = True))(x1)
x = Conv1D(int(units/2), kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)

y = Bidirectional(LSTM(units, return_sequences = True))(x1)
y = Conv1D(int(units/2), kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(y)

# Average- and max-pool each branch over time, then concatenate the 4 summaries.
avg_pool1 = GlobalAveragePooling1D()(x)
max_pool1 = GlobalMaxPooling1D()(x)

avg_pool2 = GlobalAveragePooling1D()(y)
max_pool2 = GlobalMaxPooling1D()(y)

x = Concatenate(axis=-1)([avg_pool1, max_pool1, avg_pool2, max_pool2])
x = Dropout(0.5)(x)
# One sigmoid unit per label (multi-label output); width taken from the config
# constant instead of a hard-coded 68.
out = Dense(NUM_LABEL, activation = "sigmoid")(x)

model = Model(inputs, out)

model.compile(optimizer='adam', loss=BinaryCrossentropy(from_logits=False), metrics=['acc'])

model.summary()

plot_model(model ,show_shapes=True,show_layer_names=True)




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     476253000   input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
_____________________________________________________________________________________________

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
import warnings
warnings.filterwarnings("ignore")

# Stop training once the validation loss has not improved for one epoch.
callback = EarlyStopping(monitor='val_loss', patience=1)

# The callback only takes effect when it is handed to fit(); previously it was
# created but never used, so training always ran the full 30 epochs.
model.fit(X1, np.array(y1), validation_data=(X2, np.array(y2)), batch_size=256, epochs=30,
          callbacks=[callback])
model.save(MODEL_PATH)




Train on 20234 samples, validate on 1760 samples
Epoch 1/30







## Evaluate

In [None]:
# Per-label scores for the encoded dev (X2) and test (X3) inputs.
y_dev_pred, y_test_pred = (model.predict(encoded) for encoded in (X2, X3))

In [None]:
def _binarize(probabilities, threshold=0.5):
    """Turn per-label scores into 0/1 decisions.

    Args:
        probabilities: 2-D array-like of scores (samples x labels).
        threshold: a score >= threshold becomes 1, anything lower becomes 0.

    Returns:
        List of lists of Python ints, one inner list per sample.
    """
    return (np.asarray(probabilities) >= threshold).astype(int).tolist()


# Same thresholding for both splits. One helper replaces the two copy-pasted
# loops, which also left the scratch names `y`, `lb` and `i` behind at
# notebook level.
y_dev_pred_new = _binarize(y_dev_pred)
y_test_pred_new = _binarize(y_test_pred)

In [None]:
# Dev split: (per-sample-averaged F1, per-sample-averaged accuracy), both scaled to percent.
f1_score(y_dev, y_dev_pred_new)*100, accuracy_score(y_dev, y_dev_pred_new)*100

In [None]:
# Test split: (per-sample-averaged F1, per-sample-averaged accuracy), both scaled to percent.
f1_score(y_test, y_test_pred_new)*100, accuracy_score(y_test, y_test_pred_new)*100

# Error analysis

In [None]:
# Translate the multi-hot rows (ground truth and thresholded predictions) back
# into lists of label names, for the dev split ...
label_dev = [return_label(y_dev[row]) for row in range(len(y_dev))]
label_dev_pred = [return_label(y_dev_pred_new[row]) for row in range(len(y_dev))]

# ... and for the test split.
label_test = [return_label(y_test[row]) for row in range(len(y_test))]
label_test_pred = [return_label(y_test_pred_new[row]) for row in range(len(y_test))]

In [None]:
# One row per example: input text, true label names, predicted label names.
# The three lists are stacked as rows and transposed so each becomes a column.
dev_result = pd.DataFrame([list(X_dev), label_dev, label_dev_pred]).T
test_result = pd.DataFrame([list(X_test), label_test, label_test_pred]).T

In [None]:
# Name the three columns of both comparison tables.
header = ['description', 'true_label', 'predicted_label']
dev_result.columns = header
test_result.columns = header

# Preview the first rows of the dev-split table.
dev_result.head()