# Imports

In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)
from JobsDb import JobsDb
db = JobsDb()
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from nltk import word_tokenize
from gensim.models import word2vec

# Load the Data

In [None]:
# Load the whole `jobs` table into a DataFrame. The handle is closed in a
# `finally` so it is released even when the load raises.
try:
    df = db.load_table_as_df('jobs')
finally:
    db.close()
df.head()

# Tag the data

In [None]:
# A posting is a positive example when its title mentions "data"
# (case-insensitive). `na=False` labels rows with a missing title as negative
# instead of crashing on them, which calling `.lower()` on None/NaN would do.
df['target'] = df['title'].str.contains('data', case=False, na=False, regex=False)

# Extract features and target

In [None]:
# One-hot encode the boolean flag and drop the first indicator column, which
# leaves a single "True" column (shape (n_samples, 1)) when both classes occur.
target_dummies = pd.get_dummies(df['target'], drop_first=True)
y = target_dummies.values

In [None]:
# Turn every job description into a fixed-length sequence of word indices.
descriptions = df['description'].tolist()

# `num_words` caps the vocabulary by word frequency; the embedding layer's
# input dimension must agree with this value.
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(descriptions)

list_tokenized_descriptions = tokenizer.texts_to_sequences(descriptions)

# Pad or truncate every sequence to exactly 500 indices.
X_t = sequence.pad_sequences(list_tokenized_descriptions, maxlen=500)

# Build baseline model

In [None]:
# Start from an empty Sequential container; the layers are stacked onto it below.
model = Sequential()

In [None]:
# Baseline architecture: embedding -> LSTM -> max-pool over time -> dense head
# ending in a single sigmoid unit for the binary target.
embedding_size = 128
baseline_layers = [
    # Input dimension 20000 matches the tokenizer's `num_words`.
    Embedding(20000, embedding_size),
    # `return_sequences=True` keeps one output per time step so the pooling
    # layer that follows has a time axis to reduce over.
    LSTM(25, return_sequences=True),
    GlobalMaxPool1D(),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
for layer in baseline_layers:
    model.add(layer)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
compile_options = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Print a summary of the model's layers as a quick sanity check before training.
model.summary()

In [None]:
# Train on the padded sequences; 10% of the data is held out for validation.
fit_options = {
    'epochs': 3,
    'batch_size': 32,
    'validation_split': 0.1,
}
model.fit(X_t, y, **fit_options)

In [None]:
# Persist the trained model to disk so it can be reloaded later without retraining.
model.save('../model/baseline')

# Load model

In [None]:
# Reload the saved baseline so the cells below can run without retraining.
from tensorflow import keras
model = keras.models.load_model('../model/baseline')
# Sanity-check that the restored architecture is the expected one. This
# replaces a dangling `model.` that made the cell a syntax error.
model.summary()

# Predict classification on an example

In [None]:
# Pick one posting and keep both its raw text and its padded index sequence.
# `X_t` rows follow the positional order of `df['description']`, so the text is
# fetched by position too (`.iloc`). A label lookup could return a different
# row when the DataFrame index is not 0..n-1.
example_row = 2000
doc = df['description'].iloc[example_row]
embedded_doc = X_t[example_row]
embedded_doc

In [None]:
# `predict` expects a batch, so the single sequence is given a leading batch
# axis: (maxlen,) -> (1, maxlen). With a sigmoid output unit the returned value
# is already the probability of the positive class. `predict_proba` is not
# used because it is no longer available on tf.keras models.
model.predict(embedded_doc[np.newaxis, :])

In [None]:
import eli5
from eli5.lime import TextExplainer


def predict_description_proba(texts):
    """Return class probabilities for raw description strings.

    LIME perturbs the raw text and calls this function on the perturbed
    strings, so it has to repeat the same preprocessing used for training
    (tokenise, then pad to the training sequence length) before querying
    the network.

    Args:
        texts: list of description strings.

    Returns:
        Array of shape (len(texts), 2) whose columns are
        P(not a data job) and P(data job).
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = sequence.pad_sequences(sequences, maxlen=X_t.shape[1])
    # The network emits only P(positive); LIME needs one column per class.
    p_data = model.predict(padded).reshape(-1, 1)
    return np.hstack([1.0 - p_data, p_data])


te = TextExplainer(random_state=42)
# Explain the raw text, not the padded index array: LIME works on words.
te.fit(doc, predict_description_proba)
# A Keras model has no `target_names` attribute, so the class labels are
# spelled out in the column order returned by `predict_description_proba`.
te.show_prediction(target_names=['not data', 'data'])