# Imports

In [1]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

from scripts.TextPreprocessor import TextPreprocessor
from scripts.OccupationPreprocessor import OccupationPreprocessor
from scripts.TrainEngine import TrainEngine
from scripts.Embedder import Embedder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\augus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Directory that holds the input CSV files.
# Local (non-Colab) runs: keep the assignment below.
data_dir = './data'

# Colab runs: use the assignment below instead.
# data_dir = './'

## Load all NOC webpage data into separate dataframes

In [3]:
# Read the four NOC tables from data_dir. Every file name is joined the same
# way, without a redundant leading './' inside the joined path.
df_skill_type = pd.read_csv(os.path.join(data_dir, 'NOC_skilltype.csv'))
df_major_group = pd.read_csv(os.path.join(data_dir, 'NOC_majorgroup.csv'))
df_minor_group = pd.read_csv(os.path.join(data_dir, 'NOC_minorgroup.csv'))
df = pd.read_csv(os.path.join(data_dir, 'noc_data_get_byws_dealing_slash.csv'))

In [4]:
# Zero-pad each NOC code on the left to a width of four characters
pad_to_four = '{0:0>4}'.format
df['Noc_code'] = df['Noc_code'].apply(pad_to_four)

# Unpack all sample job titles in original df

In [5]:
# Run extract_job_samples over every row (axis=1). Intended to take effect
# only once: per the original note, the 'noc_code' column is gone after the
# first pass, so a KeyError on a re-run is treated as "already done".
# NOTE(review): the except also hides any unrelated KeyError raised inside
# extract_job_samples — confirm that is acceptable.
try:
    df = df.apply(OccupationPreprocessor.extract_job_samples, axis = 1)
except KeyError:
    pass

# Do same with descriptions

In [6]:
# Apply unpack_descriptions to each row (axis=1) and keep the returned frame.
# NOTE(review): unlike the job-sample step, this has no re-run guard — confirm
# it is safe to execute twice.
df = df.apply(OccupationPreprocessor.unpack_descriptions, axis = 1)

# Make training dataframe

In [7]:
# Each key of all_job_samples becomes an 'input' value and each mapped value
# becomes its 'code'.
sample_to_code = dict(OccupationPreprocessor.all_job_samples)
train_df = pd.DataFrame(sample_to_code.items(), columns=['input', 'code'])

# Load ATP data for some train noise 

In [8]:
# Load ATP data from data_dir, like the other inputs. The previous hard-coded
# './Data/...' path differed in case from data_dir and therefore broke on
# case-sensitive filesystems. read_excel already returns a DataFrame.
ATP_data = pd.read_excel(os.path.join(data_dir, 'V5_Run Input(1).xlsx'))

# Clean codes: many show up as ''0011 or '0011. str() first so cells that
# were parsed as plain numbers do not fail on .strip().
ATP_data['code'] = (
    ATP_data['NOC code ']
    .apply(lambda x: int(str(x).strip('\'')))
    .apply(OccupationPreprocessor.first_n_digits, args=(4,))
)

# Expose the job title under the same column name the training frame uses
ATP_data['input'] = ATP_data['Current Job Title']

# Drop the two source columns now that 'code' and 'input' replace them
ATP_data = ATP_data.drop(columns=['NOC code ', 'Current Job Title'])

# Shuffle ATP and split into train-val sections 

In [9]:
# Number of shuffled ATP rows that go to the training portion
ATP_train_size = 8000

# Shuffle every row with a fixed seed so the split is repeatable
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Leading rows -> train portion, everything after -> held-out portion
ATP_data_train_set = shuffled_ATP_df.iloc[:ATP_train_size]
ATP_data_test_set = shuffled_ATP_df.iloc[ATP_train_size:]

# Combine both train sets

In [10]:
# Keep only the two modelling columns. The ATP subsets are copied so that
# later column assignments on them act on independent frames rather than on
# selections taken from the shuffled ATP frame (avoids SettingWithCopyWarning).
train_df = train_df[['input', 'code']]
ATP_data_train_set = ATP_data_train_set[['input', 'code']].copy()
ATP_data_test_set = ATP_data_test_set[['input', 'code']].copy()

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement and
# likewise keeps each frame's original index labels.
train_df = pd.concat([train_df, ATP_data_train_set])

# Preprocess the entire train and test input

### Train

In [11]:
# NOTE(review): this instance is not referenced again in the cells visible
# here — preprocess_text is always called on the class — so confirm that
# strip_abbrev=True actually reaches the preprocessing.
text_preprocessor = TextPreprocessor(strip_abbrev=True)
# Run every training title through TextPreprocessor.preprocess_text
train_df['input'] = train_df['input'].apply(TextPreprocessor.preprocess_text)

In [12]:
# Report the row count on either side of exact-duplicate removal
print("Train samples before dropping duplicates", train_df.shape[0])
train_df = train_df.drop_duplicates(keep='first')
print("Train samples after dropping duplicates", train_df.shape[0])

Train samples before dropping duplicates 37745
Train samples after dropping duplicates 33432


### Test

In [13]:
# Run every held-out ATP title through TextPreprocessor.preprocess_text
ATP_data_test_set['input'] = ATP_data_test_set['input'].apply(TextPreprocessor.preprocess_text)

In [14]:
# Report the row count on either side of exact-duplicate removal
print("Test samples before dropping duplicates", ATP_data_test_set.shape[0])
ATP_data_test_set = ATP_data_test_set.drop_duplicates(keep='first')
print("Test samples after dropping duplicates", ATP_data_test_set.shape[0])

Test samples before dropping duplicates 32024
Test samples after dropping duplicates 14327


# Grab sample to see if preprocessing worked

In [15]:
def check(string):
    """Print ``string`` if it still contains punctuation preprocessing should have removed.

    The characters looked for are ``. , ) ( - ; /`` and the single quote.
    Nothing is printed for a clean string.

    Uses a plain conditional rather than ``assert``: assertions are stripped
    when Python runs with ``-O``, which would silently disable the check.

    Args:
        string: the preprocessed text to inspect.

    Returns:
        None. Offending strings are reported on stdout.
    """
    leftover_chars = ".,)(-;/'"
    if any(char in string for char in leftover_chars):
        print(string)

In [16]:
# Run check over every training title; it prints any title that still
# contains one of the punctuation characters it looks for.
train_df['input'].apply(check)
# Show 20 randomly chosen rows for a manual look (no seed is passed, so the
# rows differ between runs).
display(train_df.sample(20))

Unnamed: 0,input,code
4666,animal nutritionist,2121
14970,industrial electrician apprentice,7242
24418,trimming machine operator woodworking,9437
10189,aviation and space museum curator,5112
15130,cable installer telecommunications,7245
7004,coronary unit nurse,3012
2095,marine operations manager,731
12594,freight services sales representative,6411
34416,instructor 1,4021
4541,physiological chemist,2112


## Start Doc2vec code

In [17]:
TRIAL_NAME = 'trial_11'

# Hyper-parameters handed to the Embedder for its doc2vec model
doc2vec_params = {
    'epochs': 6144,       # training cycles
    'vec_size': 64,       # specific to doc2vec, size of the output vector
    'alpha': 0.001,       # learning rate
    'window': 3,
    'min_count': 2,
    'min_alpha': 0.00025,
}

# Embedder built over the 'input' column of the training frame
embedder = Embedder(
    train_data=train_df,
    corpus_column='input',
    d2v_trial_name=TRIAL_NAME,
    d2v_params=doc2vec_params,
)

Defaulted doc2vec param: dm=1


In [18]:
# Train the embedder's doc2vec model.
# NOTE(review): the saved output of this cell is
# "NameError: name 'tagged_data' is not defined", so the last recorded run
# failed here — presumably inside train_doc2vec; verify before re-running.
embedder.train_doc2vec()

NameError: name 'tagged_data' is not defined

In [None]:
# Ask the embedder to load its doc2vec model (where it loads from is decided
# inside Embedder and is not visible here).
embedder.load_doc2vec_model()

In [None]:
# Row count of the training frame held by the embedder
len(embedder.train_df)

In [None]:
# Hand-picked job titles (mixed casing on purpose) for a quick smoke test
test_occupations = ['doctor', 'athlete', 'member of parliament',
                    'teacher', 'researcher', 'registered nurse', 
                    'CUSTOMER SERVICE', 'MANAGER OF CLEANING BUSINESS',
                   'CAREGIVER', 'Farm Boss']

# Preprocess each title the same way as the training data, then print what
# infer_and_vote returns for it.
for occ in test_occupations: 
    occ = TextPreprocessor.preprocess_text(occ)
    print(Embedder.infer_and_vote(occ, embedder, verbose=True))
    

# vectorize train data

In [None]:
# Fit the embedder's TF-IDF model (presumably on its training corpus — the
# details live inside Embedder.train_tfidf).
embedder.train_tfidf()

# Apply embeddings to training data

In [None]:
# Sanity-check the embedder's copy of the training frame before embedding.
# NOTE(review): the assert inspects embedder.train_df, but the embeddings
# below are computed from the notebook-level train_df — confirm both hold the
# same rows.
assert 'input' in embedder.train_df.columns and 'code' in embedder.train_df.columns, "Make sure train dataframe has 'input' column and 'code' column"
# Call get_doc2vec_embeddings once per training title, passing the embedder
train_d2v_embeddings = train_df['input'].apply(
    Embedder.get_doc2vec_embeddings, args=(embedder,)
)

In [None]:
# Deliberate stop: the condition is always false, so execution halts here
# with the message below until this cell is removed or skipped.
assert 1 == 0, "Check trial 11 results before doing anything!"

## Embed X_train

In [None]:
# Number of leading code digits to keep as the classification label; passed
# to OccupationPreprocessor.first_n_digits when y_train / y_test are built.
TARGET_CODE_LENGTH = 4

In [None]:
# Convert the per-title doc2vec embeddings into one object exposing .shape
# (the exact container is decided by Embedder.vectorize_embeddings).
d2v_train_vectors = Embedder.vectorize_embeddings(train_d2v_embeddings)
# TF-IDF features for the same training titles
tfidf_train_vectors = Embedder.get_tfidf_embeddings(embedder, train_df['input'])

# Both feature sets must describe the same number of samples
assert d2v_train_vectors.shape[0] == tfidf_train_vectors.shape[0]

## Get first n digits of y_train

In [None]:
# TF-IDF vectors are the active training features; the doc2vec alternative is
# kept in the trailing comment for switching.
X_train = tfidf_train_vectors # d2v_train_vectors
# Labels: each training code passed through first_n_digits with
# TARGET_CODE_LENGTH, collected into a NumPy array.
y_train = np.array(train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

## Embed X_test

In [None]:
# Call get_doc2vec_embeddings once per held-out title, passing the embedder
test_d2v_embeddings = ATP_data_test_set['input'].apply(
    Embedder.get_doc2vec_embeddings, args=(embedder,)
)
# Convert those embeddings into one object exposing .shape
d2v_test_vectors = Embedder.vectorize_embeddings(test_d2v_embeddings)
# TF-IDF features for the same held-out titles
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, ATP_data_test_set['input'])

# Both feature sets must describe the same number of samples
assert d2v_test_vectors.shape[0] == tfidf_test_vectors.shape[0]

## Get first n digits of y_test

In [None]:
# TF-IDF vectors are the active test features; the doc2vec alternative is
# kept in the trailing comment for switching.
X_test = tfidf_test_vectors # d2v_test_vectors
# Labels: each held-out code passed through first_n_digits with
# TARGET_CODE_LENGTH, collected into a NumPy array.
y_test = np.array(ATP_data_test_set['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

# Build preliminary classifiers

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
import time

# Train TFIDF

In [None]:
# Linear-kernel SVM with balanced class weights
SVM = SVC(kernel='linear', class_weight='balanced')

# Fit on the training features and report the wall-clock duration
start = time.time()
SVM.fit(X_train, y_train)
elapsed = time.time() - start
print('SVM training duration: {} seconds'.format(elapsed))

In [None]:
# Random forest: 256 trees, depth capped at 128, all cores, warm start enabled
RF = RandomForestClassifier(
    n_estimators=256,
    max_depth=128,
    n_jobs=-1,
    warm_start=True,
)

# Fit on the training features and report the wall-clock duration
start = time.time()
RF.fit(X_train, y_train)
elapsed = time.time() - start
print('RF training duration: {} seconds'.format(elapsed))

In [None]:
# Single-nearest-neighbour classifier using all cores
KNN = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)

# Fit on the training features and report the wall-clock duration
start = time.time()
KNN.fit(X_train, y_train)
elapsed = time.time() - start
print('KNN training duration: {} seconds'.format(elapsed))

# Predict TFIDF

In [None]:
# Predict the test set with the SVM and report the wall-clock duration
start = time.time()
svm_pred = SVM.predict(X_test)
elapsed = time.time() - start
print('SVM prediction duration on {} samples: {} seconds'.format(X_test.shape[0], elapsed))

In [None]:
# Predict the test set with the random forest and report the wall-clock duration
start = time.time()
rf_pred = RF.predict(X_test)
elapsed = time.time() - start
print('RF prediction duration on {} samples: {} seconds'.format(X_test.shape[0], elapsed))

In [None]:
# Predict the test set with KNN and report the wall-clock duration
start = time.time()
knn_pred = KNN.predict(X_test)
elapsed = time.time() - start
print('KNN prediction duration on {} samples: {} seconds'.format(X_test.shape[0], elapsed))

# Get Metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# One column of test-set predictions per classifier, next to the true labels
tfidf_test_df = pd.DataFrame({
    'svm_pred':svm_pred,
    'rf_pred':rf_pred,
    'knn_pred':knn_pred,
    'code':y_test
})

# scikit-learn metrics take (y_true, y_pred) in that order, so the ground
# truth goes first.
for classifier in ['knn','svm', 'rf']:
    predictions = tfidf_test_df['{}_pred'.format(classifier)]
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='macro'),
    ))

### Get this inside the embedder class

In [None]:
from collections import Counter
def ensemble_predict(row):
    """Combine the three classifier predictions in ``row`` into a single label.

    Args:
        row: one row of predictions (e.g. a pandas Series) holding the entries
            'rf_pred', 'svm_pred' and 'knn_pred'. Other entries are ignored.

    Returns:
        The label predicted by at least two of the three classifiers. When
        all three disagree, the SVM prediction is returned as the tie-breaker.
    """
    # Most frequent label among the three predictions, with its vote count
    votes = Counter(row[['rf_pred','svm_pred','knn_pred']]).most_common(1)
    winning_class, highest_num_votes = votes[0]

    # take svm as tie-breaker because CURRENTLY most accurate. A top count
    # below 2 means every classifier voted differently. (An earlier
    # unconditional return made this line unreachable.)
    return row['svm_pred'] if highest_num_votes < 2 else winning_class


# Ensemble Predict TFIDF

In [None]:
# Row-wise ensemble vote over the prediction columns; the ground-truth 'code'
# column is dropped before the rows are handed to ensemble_predict.
tfidf_test_df['p_all'] = tfidf_test_df.drop(columns=['code']).apply(ensemble_predict, axis = 1)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order, so the ground
# truth goes first.
print('Ensemble acc:{}, f1:{}'.format(accuracy_score(y_test, tfidf_test_df['p_all']),
                                      f1_score(y_test, tfidf_test_df['p_all'], average = 'macro')))
# First 20 ensemble predictions next to their true codes
display(tfidf_test_df.iloc[:20][['p_all','code']])

# TODO: Adjust doc2vec testing to work with new code. Also get ensemble vote working for tfidf predictor
# Is preprocessing hurting the TFIDF?

In [None]:
# Fixed seed so the 1200-row evaluation sample, and every metric computed
# from it, is the same on each run.
doc2vec_test_df = ATP_data_test_set.sample(1200, random_state=42)

In [None]:
# Create the three vote columns up front, then fill them from infer_and_vote.
# NOTE(review): this assumes infer_and_vote returns three values per title in
# a form pandas can spread across three columns — confirm.
doc2vec_test_df['vote1'], doc2vec_test_df['vote2'], doc2vec_test_df['vote3'] = None, None, None
doc2vec_test_df[['vote1', 'vote2' ,'vote3']] = doc2vec_test_df['input'].apply(Embedder.infer_and_vote, args = (embedder, False,))
# True where the row's code, cast to int, equals any of its three votes
TPs = doc2vec_test_df.apply(lambda row: int(row['code']) in [row['vote1'], row['vote2'], row['vote3']], axis = 1)

In [None]:
# Spot-check both embedding paths on the single title 'instructor'.
# NOTE(review): a notebook cell only displays its last expression, so the
# infer_doc2vec result is computed but not shown. Elsewhere
# get_tfidf_embeddings is given a Series; confirm it also accepts a bare string.
embedder.infer_doc2vec('instructor')
Embedder.get_tfidf_embeddings(embedder, 'instructor')

In [None]:
# Rows of the NOC frame whose group_title contains 'instructor'
df.loc[df['group_title'].str.contains('instructor')]

In [None]:
# Held-out rows whose preprocessed title is exactly 'instructor teacher'
is_instructor_teacher = ATP_data_test_set['input'] == 'instructor teacher'
ATP_data_test_set.loc[is_instructor_teacher]

In [None]:
# Tally of sampled rows whose true code was among the three votes (True)
# versus not (False).
TPs.value_counts()

In [None]:
# get_doc2vec_embeddings is reached through the Embedder class, as in the
# earlier embedding cells; the bare name is not defined or imported in the
# cells shown and raises NameError.
test_d2v_embeddings = doc2vec_test_df['input'].apply(Embedder.get_doc2vec_embeddings, args=(embedder,))
# Keep the embeddings next to the sampled rows they belong to
doc2vec_test_df['doc2vec_embeddings'] = test_d2v_embeddings

In [None]:
# vectorize_embeddings is reached through the Embedder class, as in the
# earlier embedding cells; the bare name is not defined or imported in the
# cells shown and raises NameError.
vectorized_embeddings = Embedder.vectorize_embeddings(test_d2v_embeddings)

In [None]:
# vectorized_embeddings has one row per doc2vec_test_df row, and the doc2vec
# metrics read '<name>_pred' from that frame, so the prediction belongs there
# rather than in tfidf_test_df (which has a different number of rows).
# NOTE(review): RF was fit on X_train, which is set to the TF-IDF vectors —
# confirm it can score doc2vec vectors, or refit it on doc2vec features first.
doc2vec_test_df['rf_pred'] = RF.predict(vectorized_embeddings)

In [None]:
# vectorized_embeddings has one row per doc2vec_test_df row, and the doc2vec
# metrics read '<name>_pred' from that frame, so the prediction belongs there
# rather than in tfidf_test_df (which has a different number of rows).
# NOTE(review): KNN was fit on X_train, which is set to the TF-IDF vectors —
# confirm it can score doc2vec vectors, or refit it on doc2vec features first.
doc2vec_test_df['knn_pred'] = KNN.predict(vectorized_embeddings)

In [None]:
# vectorized_embeddings has one row per doc2vec_test_df row, and the doc2vec
# metrics read '<name>_pred' from that frame, so the prediction belongs there
# rather than in tfidf_test_df (which has a different number of rows).
# NOTE(review): SVM was fit on X_train, which is set to the TF-IDF vectors —
# confirm it can score doc2vec vectors, or refit it on doc2vec features first.
doc2vec_test_df['svm_pred'] = SVM.predict(vectorized_embeddings)

In [None]:
# Show the three classifier prediction columns next to the true code.
# NOTE(review): this displays tfidf_test_df, while the metrics cell that
# follows reads doc2vec_test_df — confirm which frame is meant here.
tfidf_test_df[['knn_pred', 'svm_pred', 'rf_pred', 'code']]

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# scikit-learn metrics take (y_true, y_pred) in that order, so the ground
# truth goes first.
for classifier in ['knn','svm', 'rf']:
    true_codes = doc2vec_test_df['code']
    predictions = doc2vec_test_df['{}_pred'.format(classifier)]
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(true_codes, predictions),
        f1_score(true_codes, predictions, average='macro'),
    ))

In [None]:
# svm accuracy still tanks, potentially overfitting. the problem is too many output classes. 
# to mitigate, build hierarchical model