In [3]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

from scripts.TextPreprocessor import TextPreprocessor
from scripts.OccupationPreprocessor import OccupationPreprocessor
from scripts.TrainEngine import TrainEngine
from scripts.Embedder import Embedder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\augus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
df = pd.read_csv('./data/noc_data_get_byws_dealing_slash.csv')
df = df.apply(OccupationPreprocessor.extract_job_samples, axis = 1)
train_df = pd.DataFrame(dict(OccupationPreprocessor.all_job_samples).items(), columns=['input', 'code'])

In [5]:
# Load ATP data
ATP_data = OccupationPreprocessor.prepare_df('./Data/V5_Run Input(1).xlsx', 
                                             input_column='Current Job Title',
                                            code_column='NOC code ',
                                             n_digits=4
                                            )


Input unprocessed by default


# Shuffle ATP and split into train-val sections 

In [6]:
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Sample size of ATP used for training 
ATP_train_size = 8000

# Split  dataset 
ATP_data_train_set = shuffled_ATP_df[:ATP_train_size]
ATP_data_test_set = shuffled_ATP_df[ATP_train_size:]

# Combine both train sets

In [7]:
train_df = train_df.append(ATP_data_train_set)

In [8]:
uncleaned_train_df = train_df.copy() # for tfidf, note: that tfidf lowercases regardless
cleaned_train_df = train_df.copy() # for doc2vec
cleaned_train_df['input'] = cleaned_train_df['input'].apply(TextPreprocessor.preprocess_text)

In [9]:
TRIAL_NAME = 'trial_11'

doc2vec_params = dict(
epochs = 6144, # training cycles
vec_size = 64, # specific to doc2vec, size of the output vector
alpha = 0.001, # learning rate
window = 3,
min_count = 2,
min_alpha = 0.00025
)

embedder = Embedder(
    d2v_trial_name=TRIAL_NAME,
    d2v_params=doc2vec_params,
    train_data = uncleaned_train_df,
    corpus_column = 'input',
    infer_params = {
        'steps':2048,
        'alpha':0.03
    }
)

Defaulted doc2vec param: dm=1


In [10]:
embedder.train_tfidf()

TF-IDF training vector shape (37745, 2633)


<37745x2633 sparse matrix of type '<class 'numpy.float64'>'
	with 108442 stored elements in Compressed Sparse Row format>

# Apply embeddings to training data

## Embed X_train

In [11]:
TARGET_CODE_LENGTH = 2

In [12]:
tfidf_train_vectors = Embedder.get_tfidf_embeddings(embedder, uncleaned_train_df['input'])

## Get first n digits of y_train, get the column by which we separate classifiers

In [30]:
X_train = tfidf_train_vectors
y_train = train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,)
)

In [31]:
first_digit = train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(1,)
)

## Embed X_test

In [32]:
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, ATP_data_test_set['input'])

In [None]:
X_test = tfidf_test_vectors
y_test = np.array(ATP_data_test_set['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,))
)

## Get first n digits of y_test

In [16]:
second_level_classifiers = {
    # keys are first level outputs, value is a dictionary with keys 'svm', 'knn', 'rf' and 
}
# set all classifiers to empty
for key in first_digit.unique():
    second_level_classifiers[key] = {}

In [28]:
y_train

array(['00', '00', '00', ..., '30', '21', '72'], dtype=object)

# Train TFIDF

In [34]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [35]:
import time

In [36]:
for key in first_digit.unique():

    print("Training for", key)
    filtered_X_train = Embedder.get_tfidf_embeddings(embedder, uncleaned_train_df.loc[first_digit == key]['input'])
    filtered_y_train = pd.Series(y_train).loc[first_digit == key]

    # --------------------SVM
    SVM = SVC(class_weight='balanced', kernel='linear')

    start = time.time()
    SVM.fit(filtered_X_train, filtered_y_train)
    print('SVM training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['SVM'] = SVM
    
    # --------------------RF
    RF = RandomForestClassifier(n_estimators=256, max_depth=128, n_jobs=-1, warm_start=True)

    start = time.time()
    RF.fit(filtered_X_train, filtered_y_train)
    print('RF training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['RF'] = RF
    
    # --------------------KNN
    KNN = KNeighborsClassifier(n_neighbors = 4, n_jobs=-1)

    start = time.time()
    KNN.fit(filtered_X_train, filtered_y_train)
    print('KNN training duration: {} seconds'.format(time.time()-start))
    
    # save to dict
    second_level_classifiers[key]['KNN'] = KNN

SVM training duration: 0.4592933654785156 seconds
RF training duration: 1.9123950004577637 seconds
KNN training duration: 0.003981590270996094 seconds
SVM training duration: 0.279252290725708 seconds
RF training duration: 1.5468008518218994 seconds
KNN training duration: 0.003988742828369141 seconds
SVM training duration: 0.1077125072479248 seconds
RF training duration: 0.9747607707977295 seconds
KNN training duration: 0.004026174545288086 seconds
SVM training duration: 0.06254863739013672 seconds
RF training duration: 0.8690054416656494 seconds
KNN training duration: 0.002022266387939453 seconds
SVM training duration: 0.17053580284118652 seconds
RF training duration: 1.304694414138794 seconds
KNN training duration: 0.004025697708129883 seconds
SVM training duration: 0.05086398124694824 seconds
RF training duration: 0.7423279285430908 seconds
KNN training duration: 0.002997875213623047 seconds
SVM training duration: 0.20246028900146484 seconds
RF training duration: 1.4689247608184814 s

In [37]:
second_level_classifiers

{'0': {'SVM': SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False),
  'RF': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=128, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=256,
                         n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                         warm_start=True),
  'KNN': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=-1, n_neighbors=4, p=2

# Predict TFIDF

In [32]:
start = time.time()
svm_pred = SVM.predict(X_test)
print('SVM prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

start = time.time()
rf_pred = RF.predict(X_test)
print('RF prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

start = time.time()
knn_pred = KNN.predict(X_test)
print('KNN prediction duration on {} samples: {} seconds'.format(X_test.shape[0], time.time()-start))

from sklearn.metrics import accuracy_score, f1_score
tfidf_test_df = pd.DataFrame({
    'svm_pred':svm_pred,
    'rf_pred':rf_pred,
    'knn_pred':knn_pred,
    'code':y_test
})

for classifier in ['knn','svm', 'rf']:
    print('{} acc:{}, f1-macro:{}'.format(classifier.upper(), 
                                    accuracy_score(
                                        tfidf_test_df['{}_pred'.format(classifier)], 
                                        y_test
                                    ),
                                    f1_score(
                                        tfidf_test_df['{}_pred'.format(classifier)],
                                        y_test, average = 'macro')
                                   )
     )

KNN prediction duration on 32024 samples: 39.22178316116333 seconds


# Get Metrics

KNN acc:0.7903135148638522, f1-macro:0.7178526487613797
SVM acc:0.809580314763927, f1-macro:0.7237660576704875
RF acc:0.7518111416437672, f1-macro:0.6842288179637237


In [34]:
from collections import Counter
def ensemble_predict(row, predictor_cols, default_predictor):

    # find majority vote for all methods, :-1 drops ground truth column
    votes = Counter(row[predictor_cols]).most_common(1)
    
    # take svm as tie-breaker because CURRENTLY most accurate
    winning_class, highest_num_votes = votes[0]
    return winning_class
    return row[default_predictor] if highest_num_votes < 2 else winning_class

In [35]:
tfidf_test_df['p_all'] = tfidf_test_df.apply(ensemble_predict, axis = 1, args = (
    ['rf_pred','svm_pred','knn_pred'], 'svm_pred',
))

In [36]:
print('Ensemble acc:{}, f1:{}'.format(accuracy_score(tfidf_test_df['p_all'], y_test), 
                                      f1_score(tfidf_test_df['p_all'], y_test, average = 'macro')))
display("sample of test", tfidf_test_df.iloc[:5][['p_all','code']])

Ensemble acc:0.8142018486135398, f1:0.7312763970924928


'sample of test'

Unnamed: 0,p_all,code
0,1,1
1,0,0
2,9,7
3,0,0
4,4,4
