In [2]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time

import random

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

import csv
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# English stopwords as a set; used further down to strip filler words from the texts.
STOPWORDS = set(stopwords.words('english'))

# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.1.0


[nltk_data] Downloading package stopwords to /home/robin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Function to scrape job openings from Indeed

In [5]:
def scraper(job_titles):
    """Scrape job-ad texts from Indeed for each of the given job titles.

    For every title the Indeed search result pages are requested 10 results
    at a time (``start=0, 10, ..., 1990``). Every link on a result page that
    contains ``rc/clk?jk=`` is treated as a job ad: the ad page is fetched and
    the text of each ``div`` with class ``jobsearch-jobDescriptionText`` is
    collected.

    Paging for a title stops early when a result page cannot be fetched or
    when it yields fewer than 5 job links (taken as the end of the results).
    When a single ad page cannot be fetched, the remaining ads of that result
    page are skipped.

    Args:
        job_titles: iterable of job title strings, e.g. ``["data analyst"]``.

    Returns:
        Tuple ``(job_openings, labels)`` of two equally long lists: the job
        description texts and, for each text, the job title it was found under.
    """
    job_openings = []  # job description texts
    labels = []  # for every text, the job title it was found under

    main = "https://www.indeed.com"  # Indeed base url
    timeout = 30  # seconds; without a timeout a stalled connection blocks the scrape forever

    for item in job_titles:

        # Exact-phrase query: spaces become "+", the whole title is wrapped in quotes.
        job = '"' + item.replace(" ", "+") + '"'

        print(job)

        for number in range(0, 2000, 10):  # up to 200 result pages of 10 ads each

            print(f"starting first {number} of {item}")

            url = f"{main}/jobs?q={job}&start={number}"  # compile job specific url

            print(url)

            try:
                page = requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException as error:
                # Report first, then stop paging for this title. Only request
                # errors are caught, so Ctrl-C still interrupts the scrape.
                print(f"An error occurred: {error}")
                break

            time.sleep(2)  # pause between requests to not overload the server

            soup = BeautifulSoup(page.content, "html.parser")

            # All urls on the result page, narrowed down to the ones that point to a job ad.
            links = [a['href'] for a in soup.find_all('a', href=True)]
            match_links = [main + s for s in links if "rc/clk?jk=" in s]

            print(len(match_links))

            for link in match_links:  # the job ads found on this result page

                print(link)

                try:
                    job_page = requests.get(link, timeout=timeout)
                except requests.exceptions.RequestException as error:
                    # Report first, then give up on the rest of this result page.
                    print(f"An error occurred: {error}")
                    break

                time.sleep(3)  # wait a few seconds to not overload the server
                job_soup = BeautifulSoup(job_page.content, "html.parser")

                for element in job_soup.find_all('div', attrs={"class": "jobsearch-jobDescriptionText"}):
                    # Plain text of the description: tags stripped, newlines removed.
                    text = element.get_text(separator=" ").replace("\n", "")
                    job_openings.append(text)
                    labels.append(item)

            if len(match_links) < 5:  # (almost) no ads left: end of the results for this title
                break

    return job_openings, labels

The scraper takes a list with any number of job titles and, for each title, walks through up to 200 result pages of 10 job openings (at most about 2,000 openings per title).

In [None]:
# Job titles to collect ads for; the scraper also uses each title as the label of its ads.
job_openings = ["data hero",
                "data scientist",
                "data analyst",
                "business analyst",
                "data engineer",
                "machine learning engineer",
                "data architect"]

# Run the scrape for every title (slow: the scraper sleeps between requests).
jobs, labels = scraper(job_openings)

In [None]:
# Second scrape run over the same titles as above, with the first two
# ("data hero" and "data scientist") commented out.
job_openings2 = [# "data hero",
                # "data scientist",
                "data analyst",
                "business analyst",
                "data engineer",
                "machine learning engineer",
                "data architect"]

# Results are stored under separate names so they do not overwrite jobs / labels.
jobs2, labels2 = scraper(job_openings2)

"data+analyst"
starting first 0 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=0
10
https://www.indeed.com/rc/clk?jk=b9ab2261f8c09965&fccid=7890456d161b1285&vjs=3
https://www.indeed.com/rc/clk?jk=058a5d8b147b3117&fccid=d5998a961671e818&vjs=3
https://www.indeed.com/rc/clk?jk=3819d9f7ce42b610&fccid=10b1f58e33d8d2a8&vjs=3
https://www.indeed.com/rc/clk?jk=b967baa627475133&fccid=e74b668d26ed43b3&vjs=3
https://www.indeed.com/rc/clk?jk=7ec66a909282b0f4&fccid=66d394411dd2efad&vjs=3
https://www.indeed.com/rc/clk?jk=57125117d2b4b0f6&fccid=ae6e171391e978d5&vjs=3
https://www.indeed.com/rc/clk?jk=2f69b7d86ea813ab&fccid=2f1303bf45b0d7af&vjs=3
https://www.indeed.com/rc/clk?jk=6e9cdfd005d5d84f&fccid=de21c88cd020293b&vjs=3
https://www.indeed.com/rc/clk?jk=adc8162797f38f34&fccid=fe2d21eef233e94a&vjs=3
https://www.indeed.com/rc/clk?jk=730d4f5920e34bfa&fccid=d85dc84e1f190d4a&vjs=3
starting first 10 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=10
9
https://www.inde

https://www.indeed.com/rc/clk?jk=fe3e5c324c8a1d35&fccid=0f8d26ec6e64e7ed&vjs=3
https://www.indeed.com/rc/clk?jk=7ac504066e319539&fccid=008026d8dc2aa7bd&vjs=3
https://www.indeed.com/rc/clk?jk=d62541d01cbb05d1&fccid=bba15db4d8763697&vjs=3
https://www.indeed.com/rc/clk?jk=fa85fb93709b0184&fccid=76d3630986f5b206&vjs=3
https://www.indeed.com/rc/clk?jk=e630740301b166e5&fccid=f4d8d9827523c093&vjs=3
https://www.indeed.com/rc/clk?jk=bdf917ce016c840e&fccid=ec798be206d0335c&vjs=3
starting first 100 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=100
10
https://www.indeed.com/rc/clk?jk=d207cc8e13e389ed&fccid=bba15db4d8763697&vjs=3
https://www.indeed.com/rc/clk?jk=a1cd7c5d6e24488e&fccid=2150e83695978641&vjs=3
https://www.indeed.com/rc/clk?jk=54dd8869dbe7bd7a&fccid=519cd80cc4a237e4&vjs=3
https://www.indeed.com/rc/clk?jk=803066ef97bc0914&fccid=6f28e45337f5b8ff&vjs=3
https://www.indeed.com/rc/clk?jk=3b5bcfe46b0999c7&fccid=56c25448bab54e90&vjs=3
https://www.indeed.com/rc/clk?jk=26656

10
https://www.indeed.com/rc/clk?jk=5d4665f94e896653&fccid=5c542a53657c2a31&vjs=3
https://www.indeed.com/rc/clk?jk=d5bd9c1f26890db2&fccid=39d9d123b68dadca&vjs=3
https://www.indeed.com/rc/clk?jk=27fda9cd845fbb75&fccid=fd3e5b9fc7b91d7c&vjs=3
https://www.indeed.com/rc/clk?jk=8cb89b28b63265f8&fccid=9b4e0bba38b405da&vjs=3
https://www.indeed.com/rc/clk?jk=d936ecd0124761a7&fccid=62d1f6aaef9a271b&vjs=3
https://www.indeed.com/rc/clk?jk=6d96d68ff583f0c5&fccid=3aaabf5c54e51db9&vjs=3
https://www.indeed.com/rc/clk?jk=a4b0b1182c6ecf2e&fccid=5302b2b4db4fded5&vjs=3
https://www.indeed.com/rc/clk?jk=e53947bbf2d9cbcb&fccid=4c069f49b73c7f31&vjs=3
https://www.indeed.com/rc/clk?jk=3760b9c770df02e7&fccid=1d2bef48e6fe1cfc&vjs=3
https://www.indeed.com/rc/clk?jk=16dec0f4c6c33128&fccid=c1099851e9794854&vjs=3
starting first 200 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=200
10
https://www.indeed.com/rc/clk?jk=f00030c7701d5368&fccid=d81bbfe26fad932d&vjs=3
https://www.indeed.com/rc/clk?jk=ff

https://www.indeed.com/rc/clk?jk=539ec7d5d0eb36a9&fccid=bd976cc171c690e0&vjs=3
https://www.indeed.com/rc/clk?jk=12dbf9a393bfcf7f&fccid=4ba35c44ff727b61&vjs=3
https://www.indeed.com/rc/clk?jk=8c6bad588576f113&fccid=a5b4499d9e91a5c6&vjs=3
https://www.indeed.com/rc/clk?jk=5a0b556b09afcdb2&fccid=f085b82fff829085&vjs=3
https://www.indeed.com/rc/clk?jk=bc944e4f1901985a&fccid=21df030fae150acc&vjs=3
starting first 280 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=280
9
https://www.indeed.com/rc/clk?jk=d20d5386e729ea62&fccid=6262eae176df0db9&vjs=3
https://www.indeed.com/rc/clk?jk=feb58b47ffc13dab&fccid=80f650ebe16034d3&vjs=3
https://www.indeed.com/rc/clk?jk=9a0b6e44be850e8a&fccid=104dac531ec80260&vjs=3
https://www.indeed.com/rc/clk?jk=dc76fb7e2b99d56a&fccid=e87c07079f6e369f&vjs=3
https://www.indeed.com/rc/clk?jk=1993bee97ad01b88&fccid=fe2d21eef233e94a&vjs=3
https://www.indeed.com/rc/clk?jk=a4b0b1182c6ecf2e&fccid=5302b2b4db4fded5&vjs=3
https://www.indeed.com/rc/clk?jk=eed379

https://www.indeed.com/rc/clk?jk=a46f46522032ead8&fccid=ea25315ee9da22e5&vjs=3
https://www.indeed.com/rc/clk?jk=dcc0260f3a08a4c8&fccid=91bbd9dc41595b3e&vjs=3
https://www.indeed.com/rc/clk?jk=f09a6d6e7cf24434&fccid=d3d3520998346837&vjs=3
https://www.indeed.com/rc/clk?jk=644c8573d9573b15&fccid=f82779f9f5d1aa9b&vjs=3
https://www.indeed.com/rc/clk?jk=3483ac001da233ed&fccid=e571ceda55e25d27&vjs=3
starting first 370 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=370
10
https://www.indeed.com/rc/clk?jk=78edb3c22959e53e&fccid=fb556c90cb9bdb1a&vjs=3
https://www.indeed.com/rc/clk?jk=f99c4e9340a045dd&fccid=8e1b3eba64c2138d&vjs=3
https://www.indeed.com/rc/clk?jk=2e9af25cf16d26a5&fccid=4e73084b634b861d&vjs=3
https://www.indeed.com/rc/clk?jk=900756df71faa905&fccid=214358c3b9aaf27e&vjs=3
https://www.indeed.com/rc/clk?jk=15d433e99e063a5a&fccid=f03f4be28dbdabc8&vjs=3
https://www.indeed.com/rc/clk?jk=4a852dcebc00e69a&fccid=22d5413651f4514c&vjs=3
https://www.indeed.com/rc/clk?jk=3ad65

https://www.indeed.com/rc/clk?jk=6e5d44d73298abf6&fccid=ec4d3f88affe63ec&vjs=3
https://www.indeed.com/rc/clk?jk=a89408fbe84c934e&fccid=c629e32155ebd42c&vjs=3
https://www.indeed.com/rc/clk?jk=d43dd6d34859059a&fccid=f031515c17652ecd&vjs=3
https://www.indeed.com/rc/clk?jk=efe8ad44c6232051&fccid=6b7a1dfe07e7f037&vjs=3
https://www.indeed.com/rc/clk?jk=e3c5cdfd47480d95&fccid=50b6ec04dddc3e05&vjs=3
starting first 460 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=460
10
https://www.indeed.com/rc/clk?jk=d8bb99b62126fdd2&fccid=7e2f25c82f274980&vjs=3
https://www.indeed.com/rc/clk?jk=e017645bb51a88a0&fccid=e2e8cfb17a406cbf&vjs=3
https://www.indeed.com/rc/clk?jk=ad6fe0dcb36035a3&fccid=761110cb1eadb0d9&vjs=3
https://www.indeed.com/rc/clk?jk=99d25708aac3436c&fccid=8c49e99f20f89e48&vjs=3
https://www.indeed.com/rc/clk?jk=5877ff5b25bfee9c&fccid=dd616958bd9ddc12&vjs=3
https://www.indeed.com/rc/clk?jk=8fac3958aa1d5a47&fccid=a575b7c5c50b587a&vjs=3
https://www.indeed.com/rc/clk?jk=859f8

https://www.indeed.com/rc/clk?jk=f141d6bd9c2dd62c&fccid=099c16915f5e821b&vjs=3
https://www.indeed.com/rc/clk?jk=d3b40845113fffdf&fccid=2c20676c7c49d50b&vjs=3
https://www.indeed.com/rc/clk?jk=486d458b417e8849&fccid=9997e73403fe863a&vjs=3
https://www.indeed.com/rc/clk?jk=1510e716a05dabce&fccid=f8ffb6ca1136aaff&vjs=3
https://www.indeed.com/rc/clk?jk=de2c83950d66eee0&fccid=8510f9bc3a085f8e&vjs=3
https://www.indeed.com/rc/clk?jk=1c6519e59180d5c1&fccid=ac38a400acc8fe73&vjs=3
starting first 550 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=550
10
https://www.indeed.com/rc/clk?jk=ca57a3d7421309d4&fccid=b7066b0b5972b293&vjs=3
https://www.indeed.com/rc/clk?jk=1fe454dba0284020&fccid=12f3ee1c1d804535&vjs=3
https://www.indeed.com/rc/clk?jk=a0eba1bc68cdb7b8&fccid=0dd0f2dc4ecc3d85&vjs=3
https://www.indeed.com/rc/clk?jk=3ce86ed0a07b2c2b&fccid=1639254ea84748b5&vjs=3
https://www.indeed.com/rc/clk?jk=707c5c96b6973f93&fccid=1b3c0eb3b7fdba77&vjs=3
https://www.indeed.com/rc/clk?jk=dcd69

https://www.indeed.com/rc/clk?jk=a3b31c7a153201b5&fccid=b66c01ddfaeadf60&vjs=3
https://www.indeed.com/rc/clk?jk=91faf893b6cf3647&fccid=f74a07f61f1a7daa&vjs=3
https://www.indeed.com/rc/clk?jk=dc69b47711734fa8&fccid=fd3e5b9fc7b91d7c&vjs=3
https://www.indeed.com/rc/clk?jk=77ab746063ed4456&fccid=df69b0ebcdfb7759&vjs=3
https://www.indeed.com/rc/clk?jk=32923ce76e6af22e&fccid=fcf465e707df52fb&vjs=3
https://www.indeed.com/rc/clk?jk=0d9b4ab521efe3c5&fccid=52228f7dabab36c1&vjs=3
https://www.indeed.com/rc/clk?jk=0984befaee99ca47&fccid=cdf50da77e83deaa&vjs=3
starting first 640 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=640
10
https://www.indeed.com/rc/clk?jk=d227d77c433c4f6b&fccid=382851991121b94d&vjs=3
https://www.indeed.com/rc/clk?jk=06ee5ffa770c0ea4&fccid=ee24dd71b9bf38cc&vjs=3
https://www.indeed.com/rc/clk?jk=ca832629122b8154&fccid=3639654bd4055c70&vjs=3
https://www.indeed.com/rc/clk?jk=8c75b1fcac38897a&fccid=aa53b551f9df0210&vjs=3
https://www.indeed.com/rc/clk?jk=7f304

10
https://www.indeed.com/rc/clk?jk=30d3af830056ac3c&fccid=34938366d45106af&vjs=3
https://www.indeed.com/rc/clk?jk=59bf368092f2ba32&fccid=c4b7f809097a101b&vjs=3
https://www.indeed.com/rc/clk?jk=b74f25b6e4d9fdfc&fccid=a5b4499d9e91a5c6&vjs=3
https://www.indeed.com/rc/clk?jk=dea66e343c257e18&fccid=7eb0ae92a6869041&vjs=3
https://www.indeed.com/rc/clk?jk=07e757ba9780d1f0&fccid=b8ada337c2ed4289&vjs=3
https://www.indeed.com/rc/clk?jk=b97005d3f2a15888&fccid=126e3afd205caa95&vjs=3
https://www.indeed.com/rc/clk?jk=eb2c596e8c26c173&fccid=95449ad419d914fa&vjs=3
https://www.indeed.com/rc/clk?jk=7e91ec71e338efb1&fccid=51f891c56d3b3c59&vjs=3
https://www.indeed.com/rc/clk?jk=3797b0574ddf9945&fccid=104dac531ec80260&vjs=3
https://www.indeed.com/rc/clk?jk=42123b50b8f8e49d&fccid=b6854e4a276db20a&vjs=3
starting first 740 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=740
10
https://www.indeed.com/rc/clk?jk=ac141759f79f8109&fccid=7942658690eba67c&vjs=3
https://www.indeed.com/rc/clk?jk=c1

https://www.indeed.com/rc/clk?jk=a81c4c47d5180b69&fccid=7bab4fa744cc39cd&vjs=3
https://www.indeed.com/rc/clk?jk=6a1cef3bf298c60f&fccid=3d0506c7b87d872a&vjs=3
https://www.indeed.com/rc/clk?jk=bf3107035c46d90d&fccid=74fbc768a4e5bece&vjs=3
https://www.indeed.com/rc/clk?jk=bfb28a0293886ba4&fccid=2231941280824ce7&vjs=3
https://www.indeed.com/rc/clk?jk=ae6fd0507667d081&fccid=2dda8667d8de78d8&vjs=3
https://www.indeed.com/rc/clk?jk=2fb936f1fe24a9ea&fccid=3c41eea00c4f384f&vjs=3
https://www.indeed.com/rc/clk?jk=59786585ec6110d9&fccid=6ce7730e12276932&vjs=3
https://www.indeed.com/rc/clk?jk=3464c194bbac9279&fccid=caaf1a960095c607&vjs=3
starting first 830 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=830
9
https://www.indeed.com/rc/clk?jk=bae6281a65ac98fa&fccid=938c68fc89db4b9d&vjs=3
https://www.indeed.com/rc/clk?jk=0d569ab949c419e0&fccid=40c24390c4fe6086&vjs=3
https://www.indeed.com/rc/clk?jk=bfe6376855137893&fccid=47d1258571c3c9f7&vjs=3
https://www.indeed.com/rc/clk?jk=ed284c

https://www.indeed.com/rc/clk?jk=7f1e42c244d50286&fccid=62d0bf8f2d130eaa&vjs=3
https://www.indeed.com/rc/clk?jk=fcc3c58ff68e9b21&fccid=c1099851e9794854&vjs=3
starting first 920 of data analyst
https://www.indeed.com/jobs?q="data+analyst"&start=920
10
https://www.indeed.com/rc/clk?jk=1ec9d924cc2a452e&fccid=f1557471a26b6171&vjs=3
https://www.indeed.com/rc/clk?jk=46a3d13d956ab4bd&fccid=7136762d065a5ad7&vjs=3
https://www.indeed.com/rc/clk?jk=690280bead761573&fccid=6328b27691d3fdc3&vjs=3
https://www.indeed.com/rc/clk?jk=901fb6a023bb7f12&fccid=dcf1eb2d7950bbb8&vjs=3
https://www.indeed.com/rc/clk?jk=160f292ca2d3a8cd&fccid=3c41eea00c4f384f&vjs=3
https://www.indeed.com/rc/clk?jk=47e7e5b0d7da06bb&fccid=b544304db34cd3c6&vjs=3
https://www.indeed.com/rc/clk?jk=2fc99c3f7688198d&fccid=7db14b470ed12322&vjs=3
https://www.indeed.com/rc/clk?jk=c31a1fa4eac572f2&fccid=ab0ef6192557a06f&vjs=3
https://www.indeed.com/rc/clk?jk=6455d0bbd6b5f2c0&fccid=8a2313e992935dec&vjs=3
starting first 930 of data analyst
htt

## Preparing the data

For convenience I have scraped the job openings of 4 job titles and put them in a csv so I don't have to run the scraper every session (it takes some time to complete). 

First, load the csv and randomize

In [None]:
# Load the scraped job ads and shuffle the rows. The fixed seed keeps the
# shuffle - and with it the train/validation split further down - reproducible.
df = pd.read_csv('data/data-set.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Numeric class id for every job title in the data set.
label_ids = {
    'data scientist': 0,
    'sales manager': 1,
    'front-office manager': 2,
    'front-end developer': 3,
}
df['labels_num'] = df['labels'].map(label_ids)  # make label column numerical

# A title without an id (or a missing title) must not be filed silently under
# one of the classes, so stop here and name the offending values.
unknown_labels = df.loc[df['labels_num'].isna(), 'labels'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected job titles in data set: {list(unknown_labels)}")
df['labels_num'] = df['labels_num'].astype(int)

jobs = df['jobs'].to_numpy()
labels = df['labels_num'].to_numpy()

Then, remove the stopwords

In [None]:
# Strip English stopwords from every job text.
# A stopword is only removed where it has a space on both sides, so matching is
# case-sensitive and words at the very start/end of a text or next to
# punctuation stay in place.
stopword_tokens = [' ' + word + ' ' for word in STOPWORDS]  # built once, not once per job

jobs_clean = []

for i in jobs:
    job = i
    for token in stopword_tokens:
        job = job.replace(token, ' ')
    jobs_clean.append(job)

Then, set our parameters

In [None]:
# --- Tokenizer ---
vocab_size = 8000  # size of the vocabulary: only the most common words are kept
oov_tok = '<OOV>'  # stand-in for words that are not in the word index

# --- Sequences ---
max_length = 600  # maxlen handed to pad_sequences
padding_type = 'post'
trunc_type = 'post'

# --- Model / data split ---
embedding_dim = 64
training_portion = .8  # share of the data that goes into the training set

One-hot-encode label column

In [None]:
# Turn the numeric class ids into one-hot rows with 4 columns (one per class).
# NOTE(review): this overwrites `labels` with its own encoding, so the cell is
# meant to run once per freshly loaded `labels`.
labels = tf.keras.utils.to_categorical(labels, num_classes=4)

Convert them to a train and test set

In [None]:
# Split point: the first `training_portion` share of the rows is used for training.
train_size = int(len(jobs_clean) * training_portion)

train_jobs, validation_jobs = jobs_clean[:train_size], jobs_clean[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Sanity check: the split point followed by the size of every part, one number per line.
print(
    train_size,
    *map(len, (train_jobs, train_labels, validation_jobs, validation_labels)),
    sep="\n"
)

Tokenizer

The `<OOV>` token comes first in the word index, followed by the most common words (experience, etc.).

In [None]:
# Build the vocabulary from the training texts only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_jobs)
word_index = tokenizer.word_index

# Show the first 10 entries of the word index.
{word: index for word, index in list(word_index.items())[:10]}

Turn the texts into lists of token sequences

In [None]:
# Convert every training text into a sequence using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_jobs)
print(train_sequences[10]) # 11th job opening

In [None]:
# Inspect the container type of the training texts.
type(train_jobs)

In [None]:
# Show the sequence of the first training text.
train_sequences[0]

Pad sequences so they are all the same size. Every row of train_padded has length 600 (max_length).

In [None]:
# Pad / truncate the training sequences with the settings from the parameter cell.
train_padded = pad_sequences(
    train_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_length,
)

# For the sample ads 0, 1 and 10: length before (old) and after (new) padding,
# one number per line.
print(
    *(len(rows[sample]) for sample in (0, 1, 10) for rows in (train_sequences, train_padded)),
    sep="\n"
)

Do the same for validation sequences

In [None]:
# Same two steps as for the training texts: tokenize, then pad / truncate.
validation_sequences = tokenizer.texts_to_sequences(validation_jobs)
validation_padded = pad_sequences(
    validation_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_length,
)

# Number of validation sequences, then the shape of the padded array.
print(len(validation_sequences), validation_padded.shape, sep="\n")

Compare original job text and job text after padding and tokenization

In [None]:
# Index -> word lookup (the inverse of word_index).
reverse_word_index = {index: word for word, index in word_index.items()}


def decode_job(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are not in reverse_word_index are shown as '?'
    (presumably the values added by padding - TODO confirm).
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


# Decoded padded sequence of ad 12, a separator line, then the text of ad 12 before tokenization.
print(decode_job(train_padded[12]), '---', train_jobs[12], sep='\n')

## Building the model

## Try #1 - Stacked LSTM layers

In [None]:
# vocab_size = 8000 # most common words
# embedding_dim = 64
# max_length = 600

In [None]:
# Try #1: two stacked bidirectional LSTM layers on top of an embedding.
model = tf.keras.Sequential()
# Embedding for a vocabulary of `vocab_size` words with `embedding_dim` output values per token.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# First recurrent layer; return_sequences=True hands the full sequence to the next LSTM.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)))
# Extra LSTM layer.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
# Fully connected layer with ReLU activation.
model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
# Output layer: 4 nodes (1 for each class); softmax turns them into a probability distribution.
model.add(tf.keras.layers.Dense(4, activation='softmax'))
model.summary()

In [None]:
# Not used by the cells below; kept so the name stays defined.
train_padded_array = np.asarray(train_padded)


# The loss must be spelled 'categorical_crossentropy' for Keras to resolve it;
# it pairs with the one-hot encoded labels and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 50
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(validation_padded, validation_labels), verbose=2)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against the epochs.

    `history` is the object returned by model.fit; `string` is the key of the
    metric in history.history. The validation curve is read from 'val_' + string.
    """
    val_string = 'val_' + string
    for curve in (string, val_string):
        plt.plot(history.history[curve])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_string])
    plt.show()


# One figure per metric.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

There is a big gap between val_loss and the training loss, which indicates overfitting.

## Try #2 - Very simple single LSTM layer

In [None]:
# vocab_size = 8000 # most common words
# embedding_dim = 64
# max_length = 600

In [None]:
# Try #2: a single bidirectional LSTM layer on top of an embedding.
model = tf.keras.Sequential()
# Embedding for a vocabulary of `vocab_size` words with `embedding_dim` output values per token.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
# Fully connected layer with ReLU activation.
model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
# Output layer: 4 nodes (1 for each class); softmax turns them into a probability distribution.
model.add(tf.keras.layers.Dense(4, activation='softmax'))
model.summary()

In [None]:
# Not used by the cells below.
train_padded_array = np.asarray(train_padded)

num_epochs = 5

# Train the single-LSTM model and keep the per-epoch metrics in `history`.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    train_padded,
    train_labels,
    validation_data=(validation_padded, validation_labels),
    epochs=num_epochs,
    verbose=2,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against the epochs.

    `history` is the object returned by model.fit; `string` is the key of the
    metric in history.history. The validation curve is read from 'val_' + string.
    """
    val_string = 'val_' + string
    for curve in (string, val_string):
        plt.plot(history.history[curve])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_string])
    plt.show()


# One figure per metric for the single-LSTM run.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

## Save model

In [None]:
# Create the target folder first: writing the .h5 file fails when 'model/' does not exist.
os.makedirs('model', exist_ok=True)
model.save('model/model1.h5')

In [None]:
import pickle

# Store the fitted tokenizer on disk.
with open('model/tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read it back. Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('model/tokenizer.pickle', 'rb') as in_file:
    tokenizer = pickle.load(in_file)

## Reading the CV

In [None]:
import docx

# Open the CV as a docx Document.
# NOTE(review): `doc` is not used by the cells below; getText() opens the file itself.
doc =  docx.Document('data/CV_robinfaber.docx')

In [None]:
def getText(filename):
    """Return the paragraph text of a .docx file as one string.

    Tabs inside a paragraph are replaced by a space and the paragraphs are
    joined with a single space.
    """
    paragraphs = docx.Document(filename).paragraphs
    return ' '.join(paragraph.text.replace('\t', ' ') for paragraph in paragraphs)

In [None]:
# Full text of the CV as a single string.
text = getText('data/CV_robinfaber.docx')

In [None]:
# Inspect the type of the extracted CV text.
type(text)

In [None]:
# Remove stopwords from the CV the same way as from the job texts: a stopword
# is dropped only where it has a space on both sides.
# `text` is a single string, so it is cleaned as a whole. Looping over it would
# visit one character at a time, where no stopword can ever match. The result
# stays in `text` and is not added to `jobs_clean`, which holds the training texts.
for word in STOPWORDS:
    text = text.replace(' ' + word + ' ', ' ')

In [None]:
# texts_to_sequences expects a list of texts. A bare string would be treated as
# one text per character, so the CV is wrapped in a list to get one sequence.
seq = tokenizer.texts_to_sequences([text])
# Pad / truncate with the same settings as the training data.
padded = pad_sequences(seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
pred = model.predict(padded)
# Class names in the order of the numeric class ids (0-3). Stored under their own
# name so the one-hot `labels` array of the training data is not overwritten.
label_names = ['data scientist', 'sales manager', 'front-office manager', 'front-end developer']
# pred holds one row of 4 class probabilities; report the most likely class.
print(pred, label_names[np.argmax(pred[0])])

In [None]:
# Show the padded input that was passed to model.predict.
padded

In [None]:
# Position of the largest value in pred (np.argmax without axis works on the flattened array).
np.argmax(pred)

In [None]:
# Show the output of tokenizer.texts_to_sequences for the CV.
seq