# Imports

In [20]:
# Basics
import pandas as pd
import numpy as np
np.random.seed(0)

# Job database wrapper
from JobsDb import JobsDb

# Text preprocessing
from keras.preprocessing import text, sequence

# Modeling
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, GlobalMaxPool1D

# Pipeline tools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer, OneHotEncoder
from keras.wrappers.scikit_learn import KerasClassifier

# Load the Data

In [2]:
def load_data():
    db = JobsDb()
    df = db.load_table_as_df('jobs')
    db.close()
    return df

df = load_data()
df.head()

Unnamed: 0,id,title,url,description
0,1,Psychiatrist (Per Diem) - #201222-4667HD,https://www.careerjet.com/jobad/us61b2e1c18a4d...,\n \n shall strive to be a global leader i...
1,2,Professor of Osteopathic Manipulative Medicine,https://www.careerjet.com/jobad/us0f769e5a939c...,\nPosition Details Position Title Professor o...
2,3,Amazon Warehouse Assistant - Immediately Hiring,https://www.careerjet.com/jobad/usba95a3670331...,\n \n Hourly pay rate: $15.00 *The base pa...
3,4,Non-QM Underwriter,https://www.careerjet.com/jobad/usb59eda9438ed...,\nJob Description We looking to add an experi...
4,5,"project manager, accounting systems",https://www.careerjet.com/jobad/us71cfd31d23d6...,\njob description job summary: Project Manag...


# Extract feature and target

In [34]:
def extract_data(df):
    y = df['title'].apply(lambda x: 'data' in x.lower())
    X = df['description']
    return X, y

X, y = extract_data(df)

In [31]:
lb = LabelBinarizer()
y = lb.fit_transform(y)

AttributeError: 'bool' object has no attribute 'any'

In [24]:
def preprocess_descriptions(X):
    tokenizer = text.Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(list(X))
    list_tokenized_descriptions = tokenizer.texts_to_sequences(X)
    X = sequence.pad_sequences(list_tokenized_descriptions, maxlen=500)
    return X
preprocessor = FunctionTransformer(preprocess_descriptions)

# Build baseline classifier

In [25]:
def build_classifier():
    classifier = Sequential()
    embedding_size = 128
    classifier.add(Embedding(20000, embedding_size))
    classifier.add(LSTM(25, return_sequences=True))
    classifier.add(GlobalMaxPool1D())
    classifier.add(Dropout(0.5))
    classifier.add(Dense(50, activation='relu'))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(2, activation='softmax'))
    classifier.compile(
        loss='categorical_crossentropy', 
        optimizer='adam', 
        metrics=['accuracy']
    )   
    return classifier

classifier = KerasClassifier(
    build_fn=build_classifier, 
    epochs=3, 
    batch_size=32, 
    validation_split=0.2
)

# Build Pipeline

In [26]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ]
)

In [27]:
pipe.fit(X, y)

KeyboardInterrupt: 

# Predict classification on an example

In [28]:
doc = df['description'][2000]
doc

'\\n  \\n    Schedule:Â Full time, Part-Time, Morning, Afternoon  Shifts:Â Day, Evening, Weekend  LocationÂ 1700 Fairway Drive, San Leandro, CA 94605 Job opportunities vary by location. We update postings daily with open positions.  SalaryÂ Earn $16.35/hr.  Amazon remains open as an essential business to serve our communities delivering critical supplies directly to the doorsteps of people who need them.  Job DescriptionÂ Join Amazon and become part of a dedicated team that makes shopping a lot easier. Whether the team is Whole Foods Shoppers or PrimeNow, youâ€™re sure to find the part-time role and environment that will work best for you. Core duties for these teams include:  Use a smartphone, manage apps, and scan bar codes Check for order quality Communicate with customers about their orders either verbally or through the app  As aÂ Prime Now Team Member, youâ€™ll be working inside a warehouse and some of your duties will include:  receiving and stowing inventory getting orders read

In [29]:
import eli5
from eli5.lime import TextExplainer

te = TextExplainer(random_state=42)
te.fit(doc, pipe.predict_proba)
te.show_prediction()

AttributeError: 'KerasClassifier' object has no attribute 'model'