# Imports

In [2]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

from scripts.TextPreprocessor import TextPreprocessor
from scripts.OccupationPreprocessor import OccupationPreprocessor
from scripts.TrainEngine import TrainEngine
from scripts.Embedder import Embedder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gradlab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# if NOT working in colab
data_dir = './data'

# if working in colab
# data_dir = './'

## Load all NOC webpage data into separate dataframes

In [4]:
df_skill_type = pd.read_csv(os.path.join(data_dir, 'NOC_skilltype.csv'))
df_major_group = pd.read_csv(os.path.join(data_dir, './NOC_majorgroup.csv'))
df_minor_group = pd.read_csv(os.path.join(data_dir, './NOC_minorgroup.csv'))
df = pd.read_csv(os.path.join(data_dir, './noc_data_get_byws_dealing_slash.csv'))

In [5]:
# pad missing digits from noc codes
df['Noc_code'] = df['Noc_code'].apply(lambda x: '{0:0>4}'.format(x))

In [8]:
def find_character(string, char):

    occurrences = 0
    for occupation in string.split(';'):
        if char in occupation:
            print(occupation)
            occurrences += 1

    if char in TextPreprocessor.char_occurences:
        TextPreprocessor.char_occurences[char] += occurrences
    else:
        TextPreprocessor.char_occurences[char] = occurrences

# df.sample(500)['job_title'].apply(find_character, args=('(',))

# Unpack all sample job titles in original df

In [9]:
# Do once, if 'noc_code' column already dropped, except to skip action
try:
    df = df.apply(OccupationPreprocessor.extract_job_samples, axis = 1)
except KeyError:
    pass

# Do same with descriptions

In [10]:
df = df.apply(OccupationPreprocessor.unpack_descriptions, axis = 1)

# Make training dataframe

In [11]:
train_df = pd.DataFrame(dict(OccupationPreprocessor.all_job_samples).items(), columns=['input', 'code'])

# Load ATP data for some train noise 

In [12]:
# Load ATP data
ATP_data = pd.DataFrame(pd.read_excel('./Data/V5_Run Input(1).xlsx'))

# Clean codes: many show up as ''0011 or '0011
ATP_data['code'] = ATP_data['NOC code '].apply(
    lambda x: int(x.strip('\''))
).apply(OccupationPreprocessor.first_n_digits, args=(4,))

ATP_data.drop(columns = ['NOC code '], inplace = True)

ATP_data['input'] = ATP_data['Current Job Title']
ATP_data.drop(columns = ['Current Job Title'], inplace = True)

# Shuffle ATP and split into train-val sections 

In [13]:
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Sample size of ATP used for training 
ATP_train_size = 8000

# Split  dataset 
ATP_data_train_df = shuffled_ATP_df[:ATP_train_size]
test_df = shuffled_ATP_df[ATP_train_size:]

# Combine both train sets

In [14]:
train_df = train_df[['input', 'code']]
ATP_data_train_df = ATP_data_train_df[['input', 'code']]
test_df = test_df[['input', 'code']]

train_df = train_df.append(ATP_data_train_df)

In [20]:
ATP_data.iloc[test_df.index]

Unnamed: 0,Current Industry,code,input
3507,"RECEPTION, ACCOUNTING, BOOKINGS, PARTS ORDERS,...",1221,OFFICE ADMINISTRATOR
24582,HOUSING AND SUPPLEMENT PROGRAMS,423,HOUSING MANAGER
18410,ROOFING,7291,ROOF ESTIMATOR
27460,RANCH WORK,821,OWNER RANCHER
12522,SCHOOL,4032,TEACHER
...,...,...,...
6265,TAX PREPARATION AND BOOKKEEPING,1431,TAX PREPARER
11284,HEALTH CARE AID (CARE FOR ELDERLY),3413,PERSONAL SUPPORT AID
38158,"CLERICAL, ADMINISTRATION, ADVISOR, CONFLICT RE...",1121,HUMAN RESOURCE OFFICER
860,PRIVATE SCHOOL,422,ENROLLMENT COORDINATOR


# Preprocess the entire train and test input

In [13]:
# tfidf_train_df = train_df.copy()
# tfidf_test_df = test_df.copy()
# doc2vec_train_df = train_df.copy()
# doc2vec_test_df = test_df.copy()

### Train

In [23]:
text_preprocessor = TextPreprocessor(strip_abbrev=True)
train_df['input'] = train_df['input'].apply(TextPreprocessor.preprocess_text)

In [24]:
print("Train samples before dropping duplicates", len(train_df))
train_df = train_df.drop_duplicates()
print("Train samples after dropping duplicates", len(train_df))

Train samples before dropping duplicates 37745
Train samples after dropping duplicates 33432


### Test

In [25]:
test_df['input'] = test_df['input'].apply(TextPreprocessor.preprocess_text)

In [26]:
print("Test samples before dropping duplicates", len(test_df))
test_df = test_df.drop_duplicates()
print("Test samples after dropping duplicates", len(test_df))

Test samples before dropping duplicates 32024
Test samples after dropping duplicates 14327


In [1]:
doc2vec_test_df

NameError: name 'doc2vec_test_df' is not defined

In [27]:
# tfidf_train_df.to_csv('./data/tfidf_train_df.csv', index=False)
# tfidf_test_df.to_csv('./data/tfidf_test_df.csv', index=False)
# doc2vec_train_df.to_csv('./data/doc2vec_train_df.csv', index=False)
# doc2vec_test_df.to_csv('./data/doc2vec_test_df.csv', index=False)

# Grab sample to see if preprocessing worked

In [None]:
def check(string):
    try:
        assert '.' not in string \
            and ',' not in string \
                and ')' not in string \
                    and '(' not in string \
                        and '-' not in string \
                            and ';' not in string \
                                and '/' not in string \
                                    and '\'' not in string
    except AssertionError:
        print(string)

In [None]:
train_df['input'].apply(check)
display(train_df.sample(20))

## Start Doc2vec code

In [None]:
TRIAL_NAME = 'trial_11'

doc2vec_params = dict(
epochs = 6144, # training cycles
vec_size = 64, # specific to doc2vec, size of the output vector
alpha = 0.001, # learning rate
window = 3,
min_count = 2,
min_alpha = 0.00025
)

embedder = Embedder(
    d2v_trial_name=TRIAL_NAME,
    d2v_params=doc2vec_params,
    train_data = train_df,
    corpus_column = 'input',
    infer_params = {
        'steps':2048,
        'alpha':0.03
    }
)

In [None]:
embedder.train_doc2vec()

In [None]:
embedder.load_doc2vec_model()

In [None]:
TextPreprocessor.abbreviations_map

In [None]:
test_occupations = ['doctor', 'athlete', 'member of parliament',
                    'teacher', 'researcher', 'registered nurse', 
                    'CUSTOMER SERVICE', 'MANAGER OF CLEANING BUSINESS',
                   'CAREGIVER', 'Farm Boss']

for occ in test_occupations: 
    occ = TextPreprocessor.preprocess_text(occ)
    print(embedder.infer_and_vote(occ, verbose=True))
    

# vectorize train data

In [None]:
embedder.train_tfidf()

# Apply embeddings to training data

In [None]:
assert 'input' in embedder.train_df.columns and 'code' in embedder.train_df.columns, "Make sure train dataframe has 'input' column and 'code' column"
train_d2v_embeddings = train_df['input'].apply(
    Embedder.get_doc2vec_embeddings, args=(embedder,)
)

## Embed X_train

In [21]:
len(doc2vec_train_df[doc2vec_train_df.code == 3012])

196

In [None]:
TARGET_CODE_LENGTH = 4

In [None]:
# d2v_train_vectors = Embedder.vectorize_embeddings(train_d2v_embeddings)
tfidf_train_vectors = Embedder.get_tfidf_embeddings(embedder, train_df['input'])

# assert d2v_train_vectors.shape[0] == tfidf_train_vectors.shape[0]

## Get first n digits of y_train

In [None]:
X_train = tfidf_train_vectors # d2v_train_vectors
y_train = np.array(train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

## Embed X_test

In [None]:
# test_d2v_embeddings = test_df['input'].apply(
#     Embedder.get_doc2vec_embeddings, args=(embedder,)
# )
# d2v_test_vectors = Embedder.vectorize_embeddings(test_d2v_embeddings)
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, test_df['input'])

# assert d2v_test_vectors.shape[0] == tfidf_test_vectors.shape[0]

## Get first n digits of y_test

In [None]:
X_test = tfidf_test_vectors # d2v_test_vectors
y_test = np.array(test_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

# Build preliminary classifiers

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
import time

# Train TFIDF

In [None]:
SVM = SVC(class_weight='balanced', kernel='linear')

start = time.time()
SVM.fit(X_train, y_train)
print('SVM training duration: {} seconds'.format(time.time()-start))

In [None]:
RF = RandomForestClassifier(n_estimators=64, max_depth=128, n_jobs=-1, warm_start=True)

start = time.time()
RF.fit(X_train, y_train)
print('RF training duration: {} seconds'.format(time.time()-start))

In [None]:
KNN = KNeighborsClassifier(n_neighbors = 1, n_jobs=-1)

start = time.time()
KNN.fit(X_train, y_train)
print('KNN training duration: {} seconds'.format(time.time()-start))

In [None]:
LR = LogisticRegression(n_jobs=-1)

start = time.time()
LR.fit(X_train, y_train)
print('LR training duration: {} seconds'.format(time.time()-start))

# Predict TFIDF

In [None]:
start = time.time()
svm_pred = SVM.predict(X_test)
print('SVM prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

In [None]:
start = time.time()
rf_pred = RF.predict(X_test)
print('RF prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

In [None]:
start = time.time()
knn_pred = KNN.predict(X_test)
print('KNN prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

In [None]:
start = time.time()
lr_pred = LR.predict(X_test)
print('LR prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

# Get Metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score
tfidf_test_df = pd.DataFrame({
    'svm_pred':svm_pred,
#     'rf_pred':rf_pred,
    'knn_pred':knn_pred,
    'lr_pred':lr_pred,
    'code':y_test
})

from sklearn.metrics import accuracy_score, f1_score

for classifier in ['knn', 'svm', 'lr']:
    print('{} acc:{}, f1-macro:{}'.format(classifier.upper(), 
                                    accuracy_score(
                                        tfidf_test_df['{}_pred'.format(classifier)], 
                                        y_test
                                    ),
                                    f1_score(
                                        tfidf_test_df['{}_pred'.format(classifier)],
                                        y_test, average = 'macro')
                                   )
     )
    
# KNN acc:0.476, f1-macro:0.3900045178082561
# SVM acc:0.5, f1-macro:0.4011703553848126
# RF acc:0.42, f1-macro:0.3912001209939354

### Get this inside the embedder class

In [None]:
from collections import Counter
def ensemble_predict(row, predictor_cols, default_predictor):

    # find majority vote for all methods, :-1 drops ground truth column
    votes = Counter(row[predictor_cols]).most_common(1)
    
    # take svm as tie-breaker because CURRENTLY most accurate
    winning_class, highest_num_votes = votes[0]
    return winning_class


# Ensemble Predict TFIDF

In [None]:
tfidf_test_df['p_all'] = tfidf_test_df.apply(ensemble_predict, axis = 1, args = (
    ['svm_pred','knn_pred', 'lr_pred'], 'svm_pred',
))

In [None]:
import pickle
with open('TFIDF_SVM_KNN_LR.pkl', 'wb') as f:
    pickle.dump({
        'SVM':SVM,
        'KNN':KNN,
        'LR':LR
#         'RF':RF
    }, f)

In [None]:
print('Ensemble acc:{}, f1:{}'.format(accuracy_score(tfidf_test_df['p_all'], y_test), 
                                      f1_score(tfidf_test_df['p_all'], y_test, average = 'macro')))
display(tfidf_test_df.iloc[:20][['p_all','code']])

# TODO: Adjust doc2vec testing to work with new code. Also get ensemble vote working for tfidf predictor
# Is preprocessing hurting the TFIDF?

In [None]:
doc2vec_test_df = test_df.sample(5000, random_state=42)

# Quick exact match test

In [None]:
def check_exact_match(row):
    exact_matches = train_df.loc[train_df['input'] == str(row)]
    code = exact_matches['code'].values[0] if len(exact_matches) == 1 else -1
    return code

In [None]:
doc2vec_test_df['exact_match'] = doc2vec_test_df['input'].apply(check_exact_match)
doc2vec_test_df['exact_matches_TP'] = doc2vec_test_df.apply(lambda row: row['exact_match'] == row['code'], axis=1)

In [None]:
doc2vec_test_df['exact_matches_TP'].value_counts()

# Trial for tuning infer params

In [None]:
doc2vec_test_df['vote1'], doc2vec_test_df['vote2'], doc2vec_test_df['vote3'] = None, None, None
votes = []
for row in doc2vec_test_df.itertuples():
    if row.exact_match == -1:
        votes.append(embedder.infer_and_vote(row.input, verbose=False))
    else:
        votes.append(pd.Series([-1, -1, -1]))
        
doc2vec_test_df[['vote1', 'vote2', 'vote3']] = votes
TPs = doc2vec_test_df.apply(lambda row: int(row['code']) in [row['vote1'], row['vote2'], row['vote3'], row['exact_match']], axis = 1)

In [None]:
pd.DataFrame(TPs.value_counts())

In [None]:
3077/5000

In [None]:
test_d2v_embeddings = doc2vec_test_df['input'].apply(get_doc2vec_embeddings, args=(embedder,))
doc2vec_test_df['doc2vec_embeddings'] = test_d2v_embeddings

In [None]:
vectorized_embeddings = vectorize_embeddings(test_d2v_embeddings)

In [None]:
tfidf_test_df['rf_pred'] = RF.predict(vectorized_embeddings)

In [None]:
tfidf_test_df['knn_pred'] = KNN.predict(vectorized_embeddings)

In [None]:
tfidf_test_df['svm_pred'] = SVM.predict(vectorized_embeddings)

In [None]:
tfidf_test_df[['knn_pred', 'svm_pred', 'rf_pred', 'code']]

In [None]:
from sklearn.metrics import accuracy_score, f1_score

for classifier in ['knn','svm', 'rf']:
    print('{} acc:{}, f1-macro:{}'.format(classifier.upper(), 
                                    accuracy_score(
                                        doc2vec_test_df['{}_pred'.format(classifier)], 
                                        doc2vec_test_df['code']
                                    ),
                                    f1_score(
                                        doc2vec_test_df['{}_pred'.format(classifier)],
                                        doc2vec_test_df['code'], average = 'macro')
                                   )
     )

In [None]:
# svm accuracy still tanks, potentially overfitting. the problem is too many output classes. 
# to mitigate, build hierarchical model