# Imports

In [1]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

from TextPreprocessor import TextPreprocessor
from OccupationPreprocessor import OccupationPreprocessor
from TrainEngine import TrainEngine
from Embedder import Embedder, Doc2VecEmbedder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gradlab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Directory (relative path) that the data files below are read from.
# Use this value when NOT working in Colab.
data_dir = '../data'

# When working in Colab, use the current directory instead:
# data_dir = './'

## Load all NOC webpage data into separate dataframes

In [3]:
# Read each NOC table from data_dir into its own dataframe.
# File names are passed bare (no leading './') so all four paths are built
# the same way by os.path.join.
df_skill_type = pd.read_csv(os.path.join(data_dir, 'NOC_skilltype.csv'))
df_major_group = pd.read_csv(os.path.join(data_dir, 'NOC_majorgroup.csv'))
df_minor_group = pd.read_csv(os.path.join(data_dir, 'NOC_minorgroup.csv'))
df = pd.read_csv(os.path.join(data_dir, 'noc_data_get_byws_dealing_slash.csv'))

In [4]:
# Left-pad every NOC code with '0' up to a width of four characters;
# the column holds strings afterwards.
df['Noc_code'] = df['Noc_code'].map('{0:0>4}'.format)

In [5]:
def find_character(string, char):
    """Report which ';'-separated entries of ``string`` contain ``char``.

    Every matching entry is printed, and the number of matching entries is
    added to the running total kept in ``TextPreprocessor.char_occurences``
    under the key ``char``.

    Args:
        string: text made of entries separated by ';'.
        char: the character (or substring) to look for in each entry.

    Returns:
        None.
    """
    matching_entries = [entry for entry in string.split(';') if char in entry]
    for entry in matching_entries:
        print(entry)

    # assumes char_occurences is a dict-like mapping that supports .get — TODO confirm
    tally = TextPreprocessor.char_occurences
    tally[char] = tally.get(char, 0) + len(matching_entries)

# df.sample(500)['job_title'].apply(find_character, args=('(',))

# Unpack all sample job titles in original df

In [6]:
# Meant to run once per session. A repeated run is expected to raise KeyError
# (presumably because extract_job_samples drops the code column on the first
# pass — TODO confirm). The KeyError is still treated as "nothing to do", but
# it is reported instead of being swallowed silently, so that an unrelated
# missing column does not go unnoticed.
try:
    df = df.apply(OccupationPreprocessor.extract_job_samples, axis=1)
except KeyError as missing_key:
    print("Skipping job sample extraction, missing key:", missing_key)

# Unpack the descriptions in the same way

In [7]:
# Pass every row of df through OccupationPreprocessor.unpack_descriptions
# and keep the transformed rows as the new df.
df = df.apply(OccupationPreprocessor.unpack_descriptions, axis='columns')

# Make training dataframe

In [8]:
# Build the training frame from OccupationPreprocessor.all_job_samples:
# each key becomes an 'input' value and its value becomes the 'code'.
train_df = pd.DataFrame(
    list(dict(OccupationPreprocessor.all_job_samples).items()),
    columns=['input', 'code'],
)

# Load ATP data for some train noise 

In [9]:
# Load ATP data from the same data directory as the NOC files, so the path
# follows data_dir (Colab) and does not depend on a case-insensitive filesystem
ATP_data = pd.read_excel(os.path.join(data_dir, 'V5_Run Input(1).xlsx'))

# Clean codes: many show up as ''0011 or '0011
# str() lets values that were read as numbers go through the same cleaning
ATP_data['code'] = (
    ATP_data['NOC code ']
    .apply(lambda x: int(str(x).strip('\'')))
    .apply(OccupationPreprocessor.first_n_digits, args=(4,))
)

# The job title is the model input
ATP_data['input'] = ATP_data['Current Job Title']

# Drop the two source columns now that 'code' and 'input' replace them
# (reassignment instead of inplace=True keeps the step explicit)
ATP_data = ATP_data.drop(columns=['NOC code ', 'Current Job Title'])

# Shuffle ATP and split into train-val sections 

In [10]:
# Sample size of ATP used for training
ATP_train_size = 8000

# Reorder all ATP rows at random; the fixed seed keeps the order repeatable
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Split dataset: the first ATP_train_size shuffled rows go to training,
# everything after them is held out as the test set
ATP_data_train_df = shuffled_ATP_df.iloc[:ATP_train_size]
test_df = shuffled_ATP_df.iloc[ATP_train_size:]

# Combine both train sets

In [11]:
# Keep only the input text and its target code in every frame.
train_df = train_df[['input', 'code']]
ATP_data_train_df = ATP_data_train_df[['input', 'code']]
# .copy() detaches test_df from the frame it was sliced from, so assigning to
# its columns later does not raise SettingWithCopyWarning.
test_df = test_df[['input', 'code']].copy()

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE: re-running this cell adds the ATP rows to train_df again.
train_df = pd.concat([train_df, ATP_data_train_df])

# Preprocess the entire train and test input

In [12]:
# tfidf_train_df = train_df.copy()
# tfidf_test_df = test_df.copy()
# doc2vec_train_df = train_df.copy()
# doc2vec_test_df = test_df.copy()

### Train

In [13]:
# NOTE(review): this instance is not referenced again in the visible cells;
# preprocess_text is called on the class. Presumably the constructor stores
# strip_abbrev as shared configuration — confirm.
text_preprocessor = TextPreprocessor(strip_abbrev=True)

# Clean every training input string
train_df['input'] = train_df['input'].map(TextPreprocessor.preprocess_text)

In [14]:
# Remove fully duplicated (input, code) rows and report the row count
# before and after
n_train_before = len(train_df)
train_df = train_df.drop_duplicates()
print("Train samples before dropping duplicates", n_train_before)
print("Train samples after dropping duplicates", len(train_df))

Train samples before dropping duplicates 37745
Train samples after dropping duplicates 33432


### Test

In [15]:
# Clean every test input string with the same preprocessing as the train set
test_df['input'] = test_df['input'].map(TextPreprocessor.preprocess_text)

In [16]:
# Remove fully duplicated (input, code) rows and report the row count
# before and after
n_test_before = len(test_df)
test_df = test_df.drop_duplicates()
print("Test samples before dropping duplicates", n_test_before)
print("Test samples after dropping duplicates", len(test_df))

Test samples before dropping duplicates 32024
Test samples after dropping duplicates 14327


In [17]:
# tfidf_train_df.to_csv('./data/tfidf_train_df.csv', index=False)
# tfidf_test_df.to_csv('./data/tfidf_test_df.csv', index=False)
# doc2vec_train_df.to_csv('./data/doc2vec_train_df.csv', index=False)
# doc2vec_test_df.to_csv('./data/doc2vec_test_df.csv', index=False)

# Grab sample to see if preprocessing worked

In [18]:
def check(string):
    """Print ``string`` if it still contains a character preprocessing should remove.

    The characters looked for are ``. , ) ( - ; /`` and the apostrophe.
    A string containing none of them produces no output.

    The test is an explicit condition rather than ``assert`` wrapped in
    ``except AssertionError``: assert statements are removed when Python runs
    with ``-O``, which would make the check silently report nothing.

    Args:
        string: the preprocessed text to inspect.

    Returns:
        None.
    """
    leftover_chars = ('.', ',', ')', '(', '-', ';', '/', '\'')
    if any(char in string for char in leftover_chars):
        print(string)

In [19]:
# Print every training input that still contains leftover punctuation,
# then show 20 random rows for a visual spot check
for text in train_df['input']:
    check(text)
display(train_df.sample(20))

Unnamed: 0,input,code
14814,journeywoman ironworker,7236
15727,fibreglass insulation installer,7293
4949,electrical and electronics research engineer,2133
8142,chiropractic assistant,3414
24139,paper finishing machine operator,9433
27858,motor vehicle rustproofer,9536
18363,track tamper railway,7531
24109,fourdrinier machine operator pulp and paper,9433
6735,launchwoman,2273
15784,journeyman floor covering installer,7295


## Start Doc2vec code

In [137]:
# Model file passed to the embedder as model_path
TRIAL_NAME = '../trial_15.model'

doc2vec_params = {
    'epochs': 6144,        # training cycles
    'vec_size': 64,        # specific to doc2vec, size of the output vector
    'alpha': 0.001,        # learning rate
    'window': 3,
    'min_count': 2,
    'min_alpha': 0.00025,
}

embedder = Doc2VecEmbedder(
    model_path=TRIAL_NAME,
    d2v_params=doc2vec_params,
    train_data=train_df,
    corpus_column='input',
    infer_params={'steps': 2048, 'alpha': 0.03},
    scoring="hyper",
)

Doc2vec model succesfully loaded from ../trial_15.model


In [138]:
# Hand-picked job titles used as a quick sanity check of the embedder
test_occupations = [
    'doctor', 'athlete', 'member of parliament',
    'teacher', 'researcher', 'registered nurse',
    'CUSTOMER SERVICE', 'MANAGER OF CLEANING BUSINESS',
    'CAREGIVER', 'Farm Boss',
]

for occ in test_occupations:
    # Same cleaning as the train/test inputs, then infer and score at level 4
    occ = TextPreprocessor.preprocess_text(occ)
    inferred = embedder.infer(occ, verbose=False)
    predicted_code = embedder.score_predictions(inferred, level=4, topn=30)
    print(occ, int(predicted_code))
    

doctor 3112
athlete 5251
member of parliament 6421
teacher 4021
researcher 4164
registered nurse 3012
customer service 6552
manager of cleaning business 1452
caregiver 4412
farm boss 8252


# Quick exact match test

In [139]:
def check_exact_match(row):
    """Look up the code of the single training row whose input equals ``row``.

    Despite its name, ``row`` is compared as text: it is converted with
    ``str`` and matched against the 'input' column of the global ``train_df``.

    Args:
        row: the value to look up.

    Returns:
        The 'code' of the matching training row when exactly one row matches;
        -1 when there is no match or more than one match.
    """
    matched_codes = train_df.loc[train_df['input'] == str(row), 'code']
    if len(matched_codes) != 1:
        return -1
    return matched_codes.values[0]

In [140]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# TODO: before running this again, change the scoring so it shows the top 3 or 4 predictions

In [142]:
# Estimate accuracy on 50 random samples of 100 test rows each.
print("D2v model {} accuracies on sample:".format(TRIAL_NAME))
print(embedder.infer_params)
accs = []
for i in tqdm(range(50)):
    # Seed each draw with the iteration number: a re-run draws the same
    # samples, while every iteration still gets a different one.
    doc2vec_test_df = test_df.sample(100, random_state=i)
    doc2vec_test_df['d2v_preds'] = embedder.infer(doc2vec_test_df['input'])
    # TODO, we can make the score predictions return the top N results, and pass N as an argument
    d2v_preds = pd.Series(embedder.score_predictions(doc2vec_test_df['d2v_preds'], level=4, topn=20)).astype(int)
    # accuracy_score's signature is (y_true, y_pred)
    accs.append(accuracy_score(
        doc2vec_test_df['code'],
        d2v_preds
    ))
for acc in accs:
    print(acc)

  0%|          | 0/50 [00:00<?, ?it/s]

D2v model ../trial_15.model accuracies on sample:
{'steps': 2048, 'alpha': 0.03}


100%|██████████| 50/50 [06:37<00:00,  7.94s/it]

0.3
0.21
0.31
0.23
0.35
0.29
0.3
0.28
0.29
0.21
0.28
0.27
0.26
0.29
0.25
0.19
0.28
0.3
0.32
0.25
0.24
0.24
0.29
0.27
0.31
0.22
0.22
0.27
0.24
0.28
0.3
0.23
0.29
0.3
0.32
0.29
0.29
0.32
0.26
0.3
0.25
0.2
0.24
0.32
0.19
0.22
0.19
0.27
0.24
0.28





In [126]:
# Distribution of the per-sample accuracies collected in accs.
# A list is treated as wide-form data, whose values column is named 'value';
# relabel it and add a title so the figure can be read on its own.
px.histogram(
    accs,
    title='Doc2Vec accuracy across random test samples',
    labels={'value': 'accuracy'},
)

# Exact Match Logic

In [25]:
# doc2vec_test_df here is the sample left over from the last iteration of the
# accuracy loop.
# Look up each input in the training data, then flag rows where the code found
# by exact match equals the true code.
doc2vec_test_df['exact_match'] = doc2vec_test_df['input'].map(check_exact_match)
doc2vec_test_df['exact_matches_TP'] = (
    doc2vec_test_df['exact_match'] == doc2vec_test_df['code']
)
print("Exact match results:\n", doc2vec_test_df['exact_matches_TP'].value_counts())

Exact match results:
 False    90
True     10
Name: exact_matches_TP, dtype: int64
