In [36]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

from scripts.TextPreprocessor import TextPreprocessor
from scripts.OccupationPreprocessor import OccupationPreprocessor
from scripts.TrainEngine import TrainEngine
from scripts.Embedder import Embedder

In [2]:
df = pd.read_csv('./data/noc_data_get_byws_dealing_slash.csv')
df = df.apply(OccupationPreprocessor.extract_job_samples, axis = 1)
train_df = pd.DataFrame(dict(OccupationPreprocessor.all_job_samples).items(), columns=['input', 'code'])

In [3]:
# Load ATP data
ATP_data = OccupationPreprocessor.prepare_df('./Data/V5_Run Input(1).xlsx', 
                                             input_column='Current Job Title',
                                            code_column='NOC code ',
                                             n_digits=4
                                            )


Input unprocessed by default


# Shuffle ATP and split into train-val sections 

In [4]:
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Sample size of ATP used for training 
ATP_train_size = 8000

# Split  dataset 
ATP_data_train_set = shuffled_ATP_df[:ATP_train_size]
ATP_data_test_set = shuffled_ATP_df[ATP_train_size:]

# Combine both train sets

In [5]:
train_df = train_df.append(ATP_data_train_set)

In [6]:
uncleaned_train_df = train_df.copy() # for tfidf, note: that tfidf lowercases regardless
cleaned_train_df = train_df.copy() # for doc2vec
cleaned_train_df['input'] = cleaned_train_df['input'].apply(TextPreprocessor.preprocess_text)

In [7]:
TRIAL_NAME = 'trial_11'

doc2vec_params = dict(
epochs = 6144, # training cycles
vec_size = 64, # specific to doc2vec, size of the output vector
alpha = 0.001, # learning rate
window = 3,
min_count = 2,
min_alpha = 0.00025
)

embedder = Embedder(
    d2v_trial_name=TRIAL_NAME,
    d2v_params=doc2vec_params,
    train_data = uncleaned_train_df,
    corpus_column = 'input',
    infer_params = {
        'steps':2048,
        'alpha':0.03
    }
)

Defaulted doc2vec param: dm=1


In [8]:
embedder.train_tfidf()

TF-IDF training vector shape (37745, 2633)


<37745x2633 sparse matrix of type '<class 'numpy.float64'>'
	with 108442 stored elements in Compressed Sparse Row format>

# Apply embeddings to training data

## Embed X_train

In [9]:
TARGET_CODE_LENGTH = 1

In [10]:
tfidf_train_vectors = Embedder.get_tfidf_embeddings(embedder, uncleaned_train_df['input'])

## Get first n digits of y_train

In [69]:
X_train = tfidf_train_vectors
y_train = np.array(uncleaned_train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

In [70]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = pd.Series(y_train[:-8000]).value_counts().index,
        y=pd.Series(y_train[:-8000]).value_counts()
    )
)
fig.update_layout(
title = 'NOC data: frequency bar plot of each leading digit',
    yaxis_title = 'Count',
    xaxis_title = 'First Digit'
)
fig.update_xaxes(tickvals=list(sorted(pd.Series(y_train).unique())))
fig.show()

In [71]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = pd.Series(y_train[-8000:]).value_counts().index,
        y=pd.Series(y_train[-8000:]).value_counts()
    )
)
fig.update_layout(
title = 'ATP data: frequency bar plot of each leading digit',
    yaxis_title = 'Count',
    xaxis_title = 'First Digit'
)
fig.update_xaxes(tickvals=list(sorted(pd.Series(y_train).unique())))
fig.show()

In [72]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x = pd.Series(y_train).value_counts().index,
        y=pd.Series(y_train).value_counts()
    )
)
fig.update_layout(
title = 'All Train Data: frequency bar plot of each leading digit',
    yaxis_title = 'Count',
    xaxis_title = 'First Digit'
)
fig.update_xaxes(tickvals=list(sorted(pd.Series(y_train).unique())))
fig.show()

## Embed X_test

In [73]:
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, ATP_data_test_set['input'])

## Get first n digits of y_test

In [74]:
X_test = tfidf_test_vectors
y_test = np.array(ATP_data_test_set['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

# Build preliminary classifiers

In [75]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [76]:
import time

# Train TFIDF

In [99]:
SVM = SVC(class_weight='balanced', kernel='linear')

start = time.time()
SVM.fit(X_train, y_train)
print('SVM training duration: {} seconds'.format(time.time()-start))

SVM training duration: 23.338071823120117 seconds


In [100]:
RF = RandomForestClassifier(n_estimators=24, max_depth=64, n_jobs=-1, warm_start=True)

start = time.time()
RF.fit(X_train, y_train)
print('RF training duration: {} seconds'.format(time.time()-start))

RF training duration: 0.3747367858886719 seconds


In [101]:
KNN = KNeighborsClassifier(n_neighbors = 4, n_jobs=-1)

start = time.time()
KNN.fit(X_train, y_train)
print('KNN training duration: {} seconds'.format(time.time()-start))

KNN training duration: 0.030236005783081055 seconds


# Predict TFIDF

In [81]:
start = time.time()
svm_pred = SVM.predict(X_test)
print('SVM prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

SVM prediction duration on 32024 samples: 14.236286163330078 seconds


In [82]:
start = time.time()
rf_pred = RF.predict(X_test)
print('RF prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

RF prediction duration on 32024 samples: 0.11752104759216309 seconds


In [83]:
start = time.time()
knn_pred = KNN.predict(X_test)
print('KNN prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

KNN prediction duration on 32024 samples: 35.752992391586304 seconds


# Get Metrics

In [84]:
from sklearn.metrics import accuracy_score, f1_score
tfidf_test_df = pd.DataFrame({
    'svm_pred':svm_pred,
    'rf_pred':rf_pred,
    'knn_pred':knn_pred,
    'code':y_test
})

from sklearn.metrics import accuracy_score, f1_score

for classifier in ['knn','svm', 'rf']:
    print('{} acc:{}, f1-macro:{}'.format(classifier.upper(), 
                                    accuracy_score(
                                        tfidf_test_df['{}_pred'.format(classifier)], 
                                        y_test
                                    ),
                                    f1_score(
                                        tfidf_test_df['{}_pred'.format(classifier)],
                                        y_test, average = 'macro')
                                   )
     )

KNN acc:0.7926555083687234, f1-macro:0.7216547559620831
SVM acc:0.809611541343992, f1-macro:0.7237900698699121
RF acc:0.6793342493130152, f1-macro:0.6210014893571093


In [85]:
from collections import Counter
def ensemble_predict(row, predictor_cols, default_predictor):

    # find majority vote for all methods, :-1 drops ground truth column
    votes = Counter(row[predictor_cols]).most_common(1)
    
    # take svm as tie-breaker because CURRENTLY most accurate
    winning_class, highest_num_votes = votes[0]
    return winning_class
    return row[default_predictor] if highest_num_votes < 2 else winning_class

In [86]:
tfidf_test_df['p_all'] = tfidf_test_df.apply(ensemble_predict, axis = 1, args = (
    ['rf_pred','svm_pred','knn_pred'], 'svm_pred',
))

In [87]:
tfidf_test_df.iloc[:, :10][['p_all','code']]

Unnamed: 0,p_all,code
0,1,1
1,0,0
2,9,7
3,0,0
4,4,4
...,...,...
32019,1,1
32020,3,3
32021,1,1
32022,9,0


In [88]:
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(accuracy_score(tfidf_test_df['p_all'], y_test), 
                                      f1_score(tfidf_test_df['p_all'], y_test, average = 'macro')))
for target in list(sorted(pd.Series(y_test).unique())):
    print("---- Target =", target)
    print('Accuracy:{:.2f}'.format(
            accuracy_score(
                tfidf_test_df['knn_pred'].loc[y_test == target], 
                pd.Series(y_test).loc[y_test == target]
            )
        )
    )
          

Ensemble metrics total. Accuracy:0.8049587809143143, f1:0.7201506823059095

---- Target = 0
Accuracy:0.84
---- Target = 1
Accuracy:0.83
---- Target = 2
Accuracy:0.71
---- Target = 3
Accuracy:0.91
---- Target = 4
Accuracy:0.78
---- Target = 5
Accuracy:0.77
---- Target = 6
Accuracy:0.74
---- Target = 7
Accuracy:0.72
---- Target = 8
Accuracy:0.36
---- Target = 9
Accuracy:0.61


In [89]:
# make graphs
graph_df = pd.DataFrame(columns=['knn', 'rf', 'svm', 'target'])

graph_df['target'] = list(sorted(pd.Series(y_test).unique()))

for clf in graph_df.columns:
    if clf != 'target':
        graph_df[clf] = [accuracy_score(
                            tfidf_test_df['{}_pred'.format(clf)].loc[y_test == target], 
                            pd.Series(y_test).loc[y_test == target]
                        ) for target in graph_df['target']]
        
graph_df.index = graph_df['target']

In [90]:
fig = go.Figure()
for clf in ['knn', 'rf', 'svm']:
    fig.add_trace(go.Bar(
        y = graph_df[clf],
        name = clf.upper(),
        texttemplate=list(graph_df[clf].apply(lambda x : np.round(x, 3)))
                      , textposition='outside')
         )


fig.update_layout(
title = 'Per Digit Accuracy, Grouped by Classifier - After Undersampling',
    yaxis_title = 'Raw Accuracy',
    xaxis_title = 'First Digit',
    barmode='group', 
    xaxis_tickangle=-45
)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(10))
    )
)
fig.show()

# Save 1st digit classifiers

In [91]:
output_path = 'first_dig_tfidf_clfs.pkl'

try:
    assert not os.path.exists(output_path), "Existing model found"
    overwrite = 'y'
except AssertionError as e:
    print(e)
    overwrite = input("Overwrite? y|n: ")

if overwrite.lower() == 'y':
    with open(output_path, 'wb') as f:
        classifiers = {
            'SVM':SVM,
            'RF':RF,
            'KNN':KNN
        }
        pickle.dump(classifiers, f)
    print("Model Overwritten")
else:
    print("Previous Model Kept")

Existing model found
Overwrite? y|n: y
Model Overwritten


# 2nd level (digits 2, 3, 4)

In [92]:
second_level_classifiers = {
    # keys are first level outputs, value is a dictionary with keys 'svm', 'knn', 'rf' and 
}
# set all classifiers to empty
for key in pd.Series(y_train).unique():
    second_level_classifiers[key] = {}

In [94]:
first_digit = y_train

In [95]:
len(y_train)

37745

In [103]:
y_train_digs_2_3_4 = np.array(train_df['code'])

for key in pd.Series(y_train).unique():

    filtered_X_train = Embedder.get_tfidf_embeddings(embedder, uncleaned_train_df.loc[first_digit == key]['input'])
    filtered_y_train = pd.Series(y_train_digs_2_3_4).loc[first_digit == key]
    print("Training for {} on {} classes".format(key, len(filtered_y_train.unique())))

    # --------------------SVM
    SVM2 = SVC(class_weight='balanced', kernel='linear')

    start = time.time()
    SVM2.fit(filtered_X_train, filtered_y_train)
    print('SVM training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['SVM'] = SVM2
    
    # --------------------RF
    RF2 = RandomForestClassifier(n_estimators=24, max_depth=64, n_jobs=-1, warm_start=True)

    start = time.time()
    RF2.fit(filtered_X_train, filtered_y_train)
    print('RF training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['RF'] = RF2
    
    # --------------------KNN
    KNN2 = KNeighborsClassifier(n_neighbors = 4, n_jobs=-1)

    start = time.time()
    KNN2.fit(filtered_X_train, filtered_y_train)
    print('KNN training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['KNN'] = KNN2

Training for 0 on 48 classes
SVM training duration: 0.6110682487487793 seconds
RF training duration: 0.23415064811706543 seconds
KNN training duration: 0.0 seconds
Training for 1 on 54 classes
SVM training duration: 0.46219587326049805 seconds
RF training duration: 0.13604021072387695 seconds
KNN training duration: 0.0 seconds
Training for 2 on 62 classes
SVM training duration: 0.4045391082763672 seconds
RF training duration: 0.13394951820373535 seconds
KNN training duration: 0.0 seconds
Training for 3 on 36 classes
SVM training duration: 0.14716625213623047 seconds
RF training duration: 0.13807892799377441 seconds
KNN training duration: 0.0 seconds
Training for 4 on 38 classes
SVM training duration: 0.2898402214050293 seconds
RF training duration: 0.11700677871704102 seconds
KNN training duration: 0.0 seconds
Training for 5 on 33 classes
SVM training duration: 0.13154387474060059 seconds
RF training duration: 0.13152575492858887 seconds
KNN training duration: 0.0 seconds
Training for 

# Save 2nd digit classifiers

In [106]:
output_path = 'second_third_fourth_dig_tfidf_clfs.pkl'

try:
    assert not os.path.exists(output_path), "Existing model found"
    overwrite = 'y'
except AssertionError as e:
    print(e)
    overwrite = input("Overwrite? y|n: ")

if overwrite.lower() == 'y':
    with open(output_path, 'wb') as f2:
        pickle.dump(second_level_classifiers, f2)
    print("Model Saved/Overwritten")
else:
    print("Previous Model Kept")

Existing model found
Overwrite? y|n: n
Previous Model Kept
