# Imports

In [3]:
import pandas as pd
import numpy as np
np.random.seed(0)
from JobsDb import JobsDb
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from nltk import word_tokenize
from gensim.models import word2vec
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import FunctionTransformer

# Load the Data

In [4]:
def load_data():
    """Load the 'jobs' table into a DataFrame.

    Returns:
        The object returned by ``JobsDb.load_table_as_df('jobs')``.

    The database handle is closed even when loading raises, so a failed
    read does not leave the connection open.
    """
    db = JobsDb()
    try:
        return db.load_table_as_df('jobs')
    finally:
        # Runs on both the success and the error path.
        db.close()

In [5]:
# Read the 'jobs' table once and preview the first rows.
df = load_data()
df.head()

Unnamed: 0,id,title,url,description
0,1,Psychiatrist (Per Diem) - #201222-4667HD,https://www.careerjet.com/jobad/us61b2e1c18a4d...,\n \n shall strive to be a global leader i...
1,2,Professor of Osteopathic Manipulative Medicine,https://www.careerjet.com/jobad/us0f769e5a939c...,\nPosition Details Position Title Professor o...
2,3,Amazon Warehouse Assistant - Immediately Hiring,https://www.careerjet.com/jobad/usba95a3670331...,\n \n Hourly pay rate: $15.00 *The base pa...
3,4,Non-QM Underwriter,https://www.careerjet.com/jobad/usb59eda9438ed...,\nJob Description We looking to add an experi...
4,5,"project manager, accounting systems",https://www.careerjet.com/jobad/us71cfd31d23d6...,\njob description job summary: Project Manag...


# Extract feature and target

In [6]:
def extract_data(df):
    """Split the jobs frame into model input and target.

    Args:
        df: DataFrame with at least 'title' and 'description' columns.

    Returns:
        (X, y) where X is the 'description' column and y is a boolean
        Series that is True when the title contains 'data'
        (case-insensitive).

    Missing or non-string titles are labelled False instead of raising
    AttributeError on ``.lower()``.
    """
    def _mentions_data(title):
        # Guard against NaN/None titles, which have no .lower().
        return isinstance(title, str) and 'data' in title.lower()

    y = df['title'].apply(_mentions_data).astype(bool)
    X = df['description']
    return X, y
    

In [7]:
# X: the 'description' column; y: flag derived from whether the title mentions "data".
X, y = extract_data(df)

In [8]:
def preprocessor(X, y=None, num_words=20000, maxlen=500):
    """Encode description texts as fixed-length integer sequences.

    Args:
        X: iterable of description strings.
        y: optional labels. When given, they are one-hot encoded with the
            first level dropped and returned as a NumPy array. When None,
            None is returned in their place.
        num_words: vocabulary size cap passed to the Keras Tokenizer.
        maxlen: length every sequence is padded or truncated to.

    Returns:
        (X_encoded, y_encoded) where X_encoded is the array produced by
        ``pad_sequences``.

    Note:
        A new Tokenizer is fitted on every call and not returned, so the
        word-to-index mapping is specific to the texts passed in.
    """
    # The original passed y to get_dummies unconditionally, which made the
    # declared default y=None unusable.
    if y is not None:
        y = pd.get_dummies(y, drop_first=True).values
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(list(X))
    tokenized_descriptions = tokenizer.texts_to_sequences(X)
    X = sequence.pad_sequences(tokenized_descriptions, maxlen=maxlen)
    return X, y
# NOTE(review): preprocessor returns an (X, y) tuple, so this transformer's
# output is a tuple rather than a plain array. Confirm before using it in a
# scikit-learn Pipeline.
Preprocessor = FunctionTransformer(preprocessor)

In [9]:
# NOTE(review): this rebinds X and y to their encoded forms, so re-running this
# cell without first re-running the extract cell feeds already-encoded data back in.
X, y = preprocessor(X,y)

# Build baseline classifier

In [None]:
def build_classifier():
    """Build and compile the baseline binary text classifier.

    Architecture: Embedding -> LSTM (returning sequences) -> global max
    pooling -> Dropout -> Dense(50, relu) -> Dropout -> Dense(1, sigmoid).

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    classifier = Sequential()
    embedding_size = 128
    # 20000 matches the Tokenizer vocabulary cap used in preprocessing.
    classifier.add(Embedding(20000, embedding_size))
    classifier.add(LSTM(25, return_sequences=True))
    classifier.add(GlobalMaxPool1D())
    classifier.add(Dropout(0.5))
    classifier.add(Dense(50, activation='relu'))
    classifier.add(Dropout(0.5))
    # A single-unit softmax always outputs 1.0, so the model could never
    # learn. One sigmoid unit with binary cross-entropy is the correct
    # head for a single 0/1 target column.
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return classifier

# Wrap the builder so the model exposes the scikit-learn estimator interface.
classifier = KerasClassifier(build_fn=build_classifier)

In [None]:
# Train on the encoded descriptions; no fit arguments are overridden here.
classifier.fit(X, y)

In [None]:
# The scikit-learn wrapper has no summary(); after fit() the underlying
# Keras model is available as `.model`.
classifier.model.summary()

In [None]:
# save() belongs to the Keras model, not the scikit-learn wrapper, so
# persist the fitted model held on `.model`.
classifier.model.save('../classifier/baseline')

# Load classifier

In [None]:
from tensorflow import keras

# Reload the persisted model. `keras.models.load_model` is the Keras loading
# API; `keras.classifiers.load_classifier` does not exist.
# After this cell `classifier` is a plain Keras model, not the
# scikit-learn wrapper.
classifier = keras.models.load_model('../classifier/baseline')
classifier.summary()

# Predict classification on an example

In [None]:
# Pick one posting by position so the raw text and its encoded row refer to
# the same record.
doc = df['description'].iloc[2000]
# The encoded descriptions live in `X`; `X_t` is never defined in this notebook.
embedded_doc = X[2000]
embedded_doc

In [None]:
# The model expects a leading batch axis. A bare 1-D sequence would be read
# as many one-token samples, so promote it to shape (1, sequence_length).
classifier.predict_proba(np.atleast_2d(embedded_doc))

In [None]:
import eli5
from eli5.lime import TextExplainer

# LIME perturbs raw text, so the explainer needs the original string and a
# function mapping a list of strings to class probabilities. The padded
# integer sequence cannot be explained directly.
# The tokenizer fitted inside preprocessor() is not kept, so the same corpus
# and settings are used to rebuild one here.
explainer_tokenizer = text.Tokenizer(num_words=20000)
explainer_tokenizer.fit_on_texts(list(df['description']))


def predict_text_proba(docs):
    """Return an (n_docs, 2) array of [P(other), P(data title)] for raw texts."""
    padded = sequence.pad_sequences(
        explainer_tokenizer.texts_to_sequences(docs), maxlen=500
    )
    # Works whether `classifier` is the scikit-learn wrapper (model on
    # `.model`) or a Keras model reloaded from disk.
    model = getattr(classifier, 'model', classifier)
    positive = np.asarray(model.predict(padded)).reshape(-1, 1)
    return np.hstack([1 - positive, positive])


te = TextExplainer(random_state=42)
te.fit(doc, predict_text_proba)
# Neither the wrapper nor the Keras model defines `target_names`; name the
# two classes explicitly (positive class = title mentions "data").
te.show_prediction(target_names=['other', 'data title'])