# Imports

In [2]:
# Basics
import pandas as pd
import numpy as np
np.random.seed(0)

# Job database wrapper
from JobsDb import JobsDb

# Text preprocessing
from keras.preprocessing import text, sequence

# Modeling
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, GlobalMaxPool1D

# Pipeline tools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer, OneHotEncoder
from keras.wrappers.scikit_learn import KerasClassifier

# Load the Data

In [3]:
def load_data():
    db = JobsDb()
    df = db.load_table_as_df('jobs')
    db.close()
    return df

df = load_data()
df.head()

Unnamed: 0,id,title,url,description
0,1,Psychiatrist (Per Diem) - #201222-4667HD,https://www.careerjet.com/jobad/us61b2e1c18a4d...,\n \n shall strive to be a global leader i...
1,2,Professor of Osteopathic Manipulative Medicine,https://www.careerjet.com/jobad/us0f769e5a939c...,\nPosition Details Position Title Professor o...
2,3,Amazon Warehouse Assistant - Immediately Hiring,https://www.careerjet.com/jobad/usba95a3670331...,\n \n Hourly pay rate: $15.00 *The base pa...
3,4,Non-QM Underwriter,https://www.careerjet.com/jobad/usb59eda9438ed...,\nJob Description We looking to add an experi...
4,5,"project manager, accounting systems",https://www.careerjet.com/jobad/us71cfd31d23d6...,\njob description job summary: Project Manag...


# Extract feature and target

In [4]:
def extract_data(df):
    y = df['title'].apply(lambda x: 'data' in x.lower())
    X = df['description'].apply(lambda x: )
    return X, y

X, y = extract_data(df)

In [5]:
lb = LabelBinarizer()
y = lb.fit_transform(y)

In [6]:
def preprocess_descriptions(X):
    tokenizer = text.Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(list(X))
    list_tokenized_descriptions = tokenizer.texts_to_sequences(X)
    X = sequence.pad_sequences(list_tokenized_descriptions, maxlen=500)
    return X
preprocessor = FunctionTransformer(preprocess_descriptions)

# Build baseline classifier

In [7]:
def build_classifier():
    classifier = Sequential()
    embedding_size = 128
    classifier.add(Embedding(20000, embedding_size))
    classifier.add(LSTM(25, return_sequences=True))
    classifier.add(GlobalMaxPool1D())
    classifier.add(Dropout(0.5))
    classifier.add(Dense(50, activation='relu'))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(
        loss='binary_crossentropy', 
        optimizer='adam', 
        metrics=['accuracy']
    )   
    return classifier

classifier = KerasClassifier(
    build_fn=build_classifier, 
    epochs=3, 
    batch_size=32, 
    validation_split=0.1
)

# Build Pipeline

In [8]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ]
)

In [9]:
pipe.fit(X, y)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Pipeline(steps=[('preprocessor',
                 FunctionTransformer(func=<function preprocess_descriptions at 0x7f594c103840>)),
                ('classifier',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f58e2ab5f60>)])

# Predict classification on an example

In [36]:
doc = df['description'][1325]
doc

"\\nCompany Description  Pilot Company is the 10th largest privately held company in North America with more than 28,000 team members. As the industry-leading network of travel centers, we have more than 950 retail and fueling locations in 44 states and six Canadian provinces. Our energy and logistics division is a top supplier of fuel, employing one of the largest tanker fleets and providing critical services to oil operations in our nation's busiest basins. Pilot Company supports a growing portfolio of brands with expertise in supply chain and retail operations, logistics and transportation, technology and digital innovation, construction, maintenance, human resources, finance, sales and marketing.  Founded in 1958, we are proud to be family owned and consider our team members to be part of the family. Our founding values, people-first culture and commitment to giving back remains true to us today. Whether we are serving guests, a fellow team member, or a trucking company, we are ded

In [57]:
import eli5
from eli5.lime import TextExplainer

te = TextExplainer(random_state=42)
te.fit(doc, pipe.predict_proba)



TextExplainer(char_based=False,
              clf=SGDClassifier(alpha=0.001, loss='log', penalty='elasticnet',
                                random_state=RandomState(MT19937) at 0x7F5880165150),
              random_state=42,
              sampler=MaskingTextSamplers(random_state=RandomState(MT19937) at 0x7F5880165150,
                                          sampler_params=None,
                                          token_pattern='(?u)\\b\\w+\\b',
                                          weights=array([0.7, 0.3])),
              token_pattern='(?u)\\b\\w+\\b',
              vec=CountVectorizer(ngram_range=(1, 2),
                                  token_pattern='(?u)\\b\\w+\\b'))

In [58]:
te.show_prediction()

Contribution?,Feature
1.961,start
1.229,pilot company
0.887,cleanly
0.849,in
0.741,the
0.724,journey
0.718,culture
0.62,prep
0.606,manage
0.541,to
