In [None]:
#import fasttext
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from scipy.spatial.distance import cosine 
#import pipeline.utils.db as db

In [8]:
job_title_table = db.execute(
    """
    SELECT DISTINCT TRIM(LOWER(job_title_test(job_title_cleaned))) 
    FROM cleaned_job_title.job_title_union_cleaned
    WHERE job_title_cleaned IS NOT NULL 
        AND job_title_cleaned not ilike '%and return%'
    """)

In [9]:
job_title_string = job_title_table['btrim'].tolist()

In [10]:
job_title_string[:10]

['business owner bells nursery',
 'counsel',
 'coordnr',
 'project managere',
 'plumbing supply',
 'employ serv technician iii',
 'sos custodian ii',
 'utility meter field operations',
 'donated',
 'health political consultant']

In [11]:
# random sampling from job_title_string list
from random import sample 

sample_1000 = sample(job_title_string,1000)
with open("sample_1000_strings.txt", "w") as output:
    for e in sample_1000:
        output.write(e + '\n')

In [17]:
sample_10000 = sample(job_title_string,10000)
sample_100000 = sample(job_title_string,100000)

with open("sample_10000_strings.txt", "w") as output:
    for e in sample_10000:
        output.write(e + '\n')

with open("sample_100000_strings.txt", "w") as output:
    for e in sample_100000:
        output.write(e + '\n')

In [20]:
# write entire strings into a text file.
with open("entire_strings.txt", "w") as output:
    for e in job_title_string:
        output.write(e + '\n')

### Save a file as a train data 

In [12]:
model = fasttext.train_unsupervised('sample_1000_strings.txt')

In [18]:
model_10000 = fasttext.train_unsupervised('sample_10000_strings.txt')

In [19]:
model_100000 = fasttext.train_unsupervised('sample_100000_strings.txt')

In [21]:
model_entire = fasttext.train_unsupervised('entire_strings.txt')

In [22]:
model_entire.words

['</s>',
 'director',
 'and',
 'president',
 'manager',
 'staff',
 'assistant',
 'vice',
 'chief',
 'senior',
 'board',
 'executive',
 'for',
 'officer',
 'professor',
 'transportation',
 'representative',
 'associate',
 'services',
 'university',
 'health',
 'sen',
 'washington',
 'department',
 'dc',
 'program',
 'legislative',
 'counsel',
 'office',
 'business',
 'research',
 'specialist',
 'operations',
 'house',
 'ii',
 'incorporated',
 'affairs',
 'supervisor',
 'management',
 'coordinator',
 'development',
 'chair',
 'deputy',
 'technician',
 'policy',
 'member',
 'center',
 'secretary',
 'i',
 'public',
 'us',
 'general',
 'co',
 'owner',
 'committee',
 'analyst',
 'sales',
 'consultant',
 'on',
 'attorney',
 'group',
 'retired',
 'financial',
 'senate',
 'medical',
 'school',
 'engineer',
 'administrative',
 'advisor',
 'senator',
 'company',
 'service',
 'corporate',
 'new',
 'airfare',
 'county',
 'spec',
 'law',
 'llc',
 'in',
 'senators',
 'medicine',
 'division',
 'resour

In [23]:
model_entire.get_word_vector("director")

array([ 0.28254294,  0.11425452,  0.64991885, -0.33367488,  0.14480731,
        0.32661965,  0.11356138, -0.52821726, -0.02379922,  0.59376067,
        0.3995769 , -0.5220222 , -0.32375932, -0.00671822, -0.29120645,
       -0.13525972,  0.15468091,  0.30402485, -0.2705321 , -0.30351445,
        0.27237853,  0.02972574, -0.17408799, -0.7527171 ,  0.20219615,
       -0.12387756, -0.03159215,  0.03329852, -0.22355215, -0.39831924,
       -0.21019736,  0.16828498, -0.29907453, -0.26985803, -0.38436836,
       -0.4098281 , -0.4076704 , -0.17722748, -0.01399893, -0.20677973,
       -0.2315893 , -0.17358483,  0.39348665,  0.14055163,  0.00293521,
       -0.10166384, -0.9690602 , -0.05606403, -0.07040814,  0.754362  ,
       -0.7543453 ,  0.28471643, -0.25636464,  0.4225403 , -0.33933416,
        0.18015964,  0.00380394,  0.04054176,  0.2754836 ,  0.07592319,
        0.19646785,  0.2597035 ,  0.30725324, -0.3422433 , -0.33064035,
        0.16749115,  0.23598629, -0.17007227,  0.04245004, -0.18

In [24]:
# this library also handles unknown word. Sub-word(https://www.google.com/search?q=what+is+sub+word+embedding+post&tbm=isch&ved=2ahUKEwizs6SYyfjnAhVQON8KHb0zBKIQ2-cCegQIABAA&oq=what+is+sub+word+embedding+post&gs_l=img.3...177958.178505..182456...0.0..0.271.1324.2-5......0....1..gws-wiz-img.e-e7aWYhjIE&ei=6EtbXrOfMdDw_Aa955CQCg&bih=897&biw=1920#imgrc=pyWK-uyUK6Hr6M)
#based 이기 때문에 typo 에 robust 한 장점이 있다.
model_entire.get_word_vector("diiirecctorr")

array([-0.07641841, -0.01019991,  0.14039214, -0.10801964,  0.13926701,
        0.06459436,  0.26529774, -0.13913338,  0.16443087,  0.2480375 ,
       -0.02834942, -0.11340211, -0.038286  , -0.05720075, -0.09654281,
       -0.08254253, -0.06039055,  0.13004442, -0.173125  , -0.07439607,
       -0.12404697,  0.06603786, -0.23672588, -0.4343378 ,  0.06400086,
       -0.04939949, -0.17333011,  0.0007918 , -0.14419292, -0.00510129,
       -0.37103295, -0.09441509, -0.07734393, -0.17991202, -0.16950858,
       -0.19071306,  0.09021781, -0.11489707,  0.01149432,  0.10678458,
       -0.0023703 , -0.03798649,  0.15646948,  0.0333753 , -0.0723674 ,
        0.08042976, -0.16062222,  0.03944276,  0.00659632,  0.07281209,
       -0.19456372, -0.17296451, -0.11634398,  0.08607875,  0.07862102,
       -0.00053292,  0.0667675 ,  0.23368883,  0.1962292 ,  0.40421188,
        0.09589522,  0.3046587 ,  0.27409095,  0.01158673, -0.22397102,
        0.00902798, -0.17460425,  0.0174568 ,  0.15116252, -0.12

In [33]:
titles = pd.read_csv('FINAL_fixed_soc_codes_01182018.csv')
cat = pd.read_csv('job classifications - job_title_categorization.csv')
cat['soc_code'] = cat['bls_job_type'].astype(str) + '-' + cat['bls_job_subtype'].astype(str)

df = titles.merge(cat, how='inner', left_on='2018_soc_code', right_on='soc_code')
df['job_title'] = df['soc_direct_match_title'].str.lower().str.strip()
df['label'] = df['job_level']
df = df.dropna()
df.head(n=20)

Unnamed: 0,2018_soc_code,soc_title,soc_direct_match_title,bls_job_type,bls_job_subtype,bls_job_subtype_name,job_level,soc_code,job_title,label
0,11-1011,Chief Executives,Admiral,11,1011,Chief Executives,chief_executive,11-1011,admiral,chief_executive
1,11-1011,Chief Executives,CHIEF EXECUTIVE OFFICER,11,1011,Chief Executives,chief_executive,11-1011,chief executive officer,chief_executive
2,11-1011,Chief Executives,Chief Executive Officer,11,1011,Chief Executives,chief_executive,11-1011,chief executive officer,chief_executive
3,11-1011,Chief Executives,Chief Financial Officer,11,1011,Chief Executives,chief_executive,11-1011,chief financial officer,chief_executive
4,11-1011,Chief Executives,Chief Operating Officer,11,1011,Chief Executives,chief_executive,11-1011,chief operating officer,chief_executive
5,11-1011,Chief Executives,Chief Sustainability Officer,11,1011,Chief Executives,chief_executive,11-1011,chief sustainability officer,chief_executive
6,11-1011,Chief Executives,Commissioner of Internal Revenue,11,1011,Chief Executives,chief_executive,11-1011,commissioner of internal revenue,chief_executive
7,11-1011,Chief Executives,CHIEF OPERATING OFFICER,11,1011,Chief Executives,chief_executive,11-1011,chief operating officer,chief_executive
8,11-1011,Chief Executives,County Commissioner,11,1011,Chief Executives,chief_executive,11-1011,county commissioner,chief_executive
9,11-1011,Chief Executives,Government Service Executive,11,1011,Chief Executives,chief_executive,11-1011,government service executive,chief_executive


In [34]:
texts = df['job_title'].tolist()

In [35]:

X = np.array([model_entire.get_sentence_vector(e) for e in texts]) # vectorize each element in list

X = np.expand_dims(X, axis=2)  # reshape  to 3D

In [36]:
X.shape

(7440, 100, 1)

In [37]:
y = pd.get_dummies(df['label'], drop_first=False)
y.shape

(7440, 5)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)


In [None]:
Model

In [40]:
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D

window_size = 3
drop_out = 0.2
input_shape=(100,1)


model = Sequential()

#model.add(Embedding(max_words, 64, input_length=500))
# dilation_rate = 2 represents mid-layers 이 두 배로 펼쳐진다.. 
model.add(Conv1D(256, kernel_size=window_size, input_shape = input_shape, activation='relu', dilation_rate = 2))
model.add(Dropout(0.2))
model.add(MaxPooling1D(window_size))
model.add(Conv1D(512, window_size, activation='relu', dilation_rate = 2))
model.add(Dropout(0.2))
model.add(MaxPooling1D(window_size))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 96, 256)           1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 96, 256)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 28, 512)           393728    
_________________________________________________________________
dropout_2 (Dropout)          (None, 28, 512)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 9, 512)            0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 512)              

In [41]:
import keras

batch_size = 32
epochs = 50

callback_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=10
    ),
    
#     keras.callbacks.TensorBoard(
#         log_dir='log_dir_m1',
#         histogram_freq=1,
#         embeddings_freq=1,
#     ),

    keras.callbacks.ModelCheckpoint(
        monitor='val_loss',
        save_best_only=True,
        filepath='job_classification.h5',
    ),

    keras.callbacks.ReduceLROnPlateau(
        patience=10,
        factor=0.1,
    )
]

In [42]:
history = model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.1,
          callbacks=callback_list)

Train on 6026 samples, validate on 670 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50


In [43]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.4699846855414811
Test accuracy: 0.8373655676841736


Based on different models and tests, we have learned the word embedding; it seems that the 500k corpus was enough to explain 6k corpus(train data). 83.7% accuracy is supporting that. 

Considering the scale gap of 2m corpus from fastText and 500k corpus, 500k was a good basis for learning