In [1]:
import pickle
import pandas as pd
from scripts.Embedder import Embedder, tfidfEmbedder
from scripts.OccupationPreprocessor import OccupationPreprocessor
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gradlab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Stage-1 models: later cells index this object by 'SVM', 'RF' and 'KNN'.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open('first_dig_tfidf_clfs.pkl', 'rb') as first_digit_file:
    clf1 = pickle.load(first_digit_file)

# Stage-2 models: pipeline() indexes this object by the stage-1 prediction
# first and then by 'SVM' / 'RF' / 'KNN'.
with open('second_third_fourth_dig_tfidf_clfs.pkl', 'rb') as remaining_digits_file:
    clf2 = pickle.load(remaining_digits_file)

In [3]:
# Selects the evaluation data used below:
#   0 -> build the test set (plus the NOCv5 / NOCv6 comparison frames) from
#        ./Data/ATP_gold_standard.xlsx
#   1 -> read the test set from ./Data/tfidf_test_set.csv
TEST_FILE = 1

In [4]:
# Single source of truth for the gold-standard workbook used by all three
# prepare_df calls below.
GOLD_STANDARD_PATH = './Data/ATP_gold_standard.xlsx'

if TEST_FILE == 0:
    # Job titles paired with the 'NOC code by PATH' column as the target code.
    tfidf_test_set = OccupationPreprocessor.prepare_df(
        GOLD_STANDARD_PATH,
        input_column='CURRENT_JOB_TITLE',
        code_column='NOC code by PATH',
        n_digits=4
    )

    # Comparison frames: the PATH code is loaded as 'input' and the V5 / V6
    # code as 'code', so the two columns can be scored against each other later.
    NOCv5 = OccupationPreprocessor.prepare_df(
        GOLD_STANDARD_PATH,
        input_column='NOC code by PATH',
        code_column='V5_NOC',
        n_digits=4
    )

    NOCv6 = OccupationPreprocessor.prepare_df(
        GOLD_STANDARD_PATH,
        input_column='NOC code by PATH',
        code_column='V6_NOC',
        n_digits=4
    )

elif TEST_FILE == 1:
    # read_csv already returns a DataFrame; no extra wrapping is needed.
    tfidf_test_set = pd.read_csv('./Data/tfidf_test_set.csv')

else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError('TEST_FILE must be 0 or 1, got {!r}'.format(TEST_FILE))

tfidf_train_set = pd.read_csv('./Data/tfidf_train_set.csv')

In [5]:
# Build the TF-IDF embedder. Per the recorded cell output, construction loads
# a previously saved model from ./vectorizer.joblib.
embedder = tfidfEmbedder()

Model loaded from ./vectorizer.joblib


In [60]:
# Fixed seed so that Restart & Run All evaluates the same subset every time.
SAMPLE_SEED = 42

sample_size = 500 if TEST_FILE == 0 else 200
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the frame length.
sample_size = min(sample_size, len(tfidf_test_set))

# Duplicates are dropped *after* sampling, so the evaluated subset can be
# smaller than sample_size.
sample_pipeline_df = (
    tfidf_test_set
    .sample(n=sample_size, random_state=SAMPLE_SEED)
    .drop_duplicates()
)

### Check exact matches

In [61]:
# Embed the sampled inputs. A later cell calls .toarray() on the result, so it
# is presumably a sparse matrix - confirm in tfidfEmbedder.embed.
tfidf_test_vectors = embedder.embed(sample_pipeline_df['input'])

In [62]:
from sklearn.metrics import accuracy_score, f1_score

# First-stage predictions from each of the three stage-1 classifiers.
svm_first_stage = clf1['SVM'].predict(tfidf_test_vectors)
rf_first_stage = clf1['RF'].predict(tfidf_test_vectors)
knn_first_stage = clf1['KNN'].predict(tfidf_test_vectors)

# The 'code' Series carries the sampled rows' index, which becomes the index
# of the new frame; the prediction arrays are attached positionally.
true_codes = sample_pipeline_df['code'].astype(str)

tfidf_test_df = pd.DataFrame({
    'svm_pred': svm_first_stage,
    'rf_pred': rf_first_stage,
    'knn_pred': knn_first_stage,
    'code': true_codes,
})

In [63]:
# Look every sampled input up in the embedder's training database.
# A later cell treats -1 as "no exact match" and any other value as the code
# to use instead of the ensemble prediction.
tfidf_test_df['exact_match'] = sample_pipeline_df['input'].apply(embedder.check_exact_match, args=(embedder.train_database,))

In [64]:
# Combine the three first-stage predictions row by row.
# NOTE(review): the second argument ('svm_pred') is presumably the column used
# when the classifiers disagree - confirm in tfidfEmbedder.ensemble_predict.
first_stage_vote_columns = ['rf_pred', 'svm_pred', 'knn_pred']
tfidf_test_df['p_all'] = tfidf_test_df.apply(
    tfidfEmbedder.ensemble_predict,
    axis=1,
    args=(first_stage_vote_columns, 'svm_pred'),
)

In [65]:
# Store each sample's dense TF-IDF vector as a plain Python list so the
# row-wise pipeline() can rebuild a (1, n_features) array per sample.
tfidf_test_df['vectors'] = tfidf_test_vectors.toarray().tolist()

In [66]:
# put in pipeline.py
def pipeline(row):
    """Run the second-stage classifiers for a single sample.

    The first-stage ensemble prediction stored in ``row['p_all']`` selects
    which group of second-stage models in ``clf2`` is used; each of that
    group's SVM, RF and KNN models then predicts on the sample's vector.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series passed by ``DataFrame.apply``)
        Must provide ``'vectors'`` (the sample's feature vector as a flat
        sequence) and ``'p_all'`` (a key present in ``clf2``).

    Returns
    -------
    A copy of ``row`` with ``'svm_pred_234'``, ``'rf_pred_234'`` and
    ``'knn_pred_234'`` added.

    Raises
    ------
    KeyError
        If ``row['p_all']`` is not a key of ``clf2``.
    """
    # The estimators are called on a 2-D array: one sample, n features.
    features = np.array(row['vectors']).reshape(1, -1)
    second_stage_models = clf2[row['p_all']]

    # pandas documents mutating the object handed to apply() as unsupported,
    # so the new fields are written to a copy.
    result = row.copy()
    for column, model_key in (
        ('svm_pred_234', 'SVM'),
        ('rf_pred_234', 'RF'),
        ('knn_pred_234', 'KNN'),
    ):
        result[column] = second_stage_models[model_key].predict(features)[0]
    return result

In [67]:
import time

# perf_counter is monotonic and high-resolution, so the measured interval is
# not affected by system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
tfidf_test_df = tfidf_test_df.apply(pipeline, axis=1)
elapsed = time.perf_counter() - start_time

n_samples = sample_pipeline_df.shape[0]
print('time to predict on {0} samples: {1:.2f} seconds ({2:.3f} s/sample)'.format(
    n_samples,
    elapsed,
    elapsed / n_samples,
))

time to predict on 180 samples: 42.10 seconds (0.23 s/sample)


In [68]:
# Combine the three second-stage predictions row by row.
# NOTE(review): the second argument ('knn_pred_234') is presumably the column
# used when the classifiers disagree - confirm in
# tfidfEmbedder.ensemble_predict.
second_stage_vote_columns = ['svm_pred_234', 'rf_pred_234', 'knn_pred_234']
tfidf_test_df['p_all_234'] = tfidf_test_df.apply(
    tfidfEmbedder.ensemble_predict,
    axis=1,
    args=(second_stage_vote_columns, 'knn_pred_234'),
)

### Performance without exact match

In [69]:
from sklearn.metrics import accuracy_score, f1_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and
# macro-F1 give the same number either way, but asymmetric variants (e.g.
# average='weighted', which weights by y_true support) would silently be wrong
# with the arguments reversed.
y_true = tfidf_test_df['code'].astype(int)

for classifier in ['knn', 'svm', 'rf']:
    y_pred = tfidf_test_df['{}_pred_234'.format(classifier)].astype(int)
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    ))

ensemble_pred = tfidf_test_df['p_all_234'].astype(int)
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(y_true, ensemble_pred),
    f1_score(y_true, ensemble_pred, average='macro'),
))

KNN acc:0.55, f1-macro:0.3953041991121461
SVM acc:0.5388888888888889, f1-macro:0.38720655688397615
RF acc:0.5666666666666667, f1-macro:0.40760413295624565
Ensemble metrics total. Accuracy:0.5722222222222222, f1:0.4094309969309969



### With exact match added

In [70]:
# Prefer the exact-match lookup when it found something; exact_match == -1
# means "no match", in which case the ensemble prediction is kept.
tfidf_test_df['Hybrid_pred_exact_match'] = tfidf_test_df.apply(
    lambda row: row['p_all_234'] if row['exact_match'] == -1 else row['exact_match'],
    axis=1,
)

# scikit-learn metrics take (y_true, y_pred) in that order.
hybrid_true = tfidf_test_df['code'].astype(int)
hybrid_pred = tfidf_test_df['Hybrid_pred_exact_match'].astype(int)
print('Ensemble metrics after checking for exact match total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(hybrid_true, hybrid_pred),
    f1_score(hybrid_true, hybrid_pred, average='macro'),
))

Ensemble metrics after checking for exact match total. Accuracy:0.5722222222222222, f1:0.4094309969309969



In [71]:
def clean_noc_double_codes(row, column):
    """Reduce a comma-separated pair of codes in ``row[column]`` to one code.

    When the cell is a string containing a comma, the second comma-separated
    entry is kept, with surrounding whitespace and single quotes removed so
    the result can be converted with ``int``. Cells that are not strings, or
    that contain no comma, are returned untouched.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series passed by ``DataFrame.apply``)
    column : key of the cell to clean

    Returns
    -------
    ``row`` itself when nothing needs cleaning, otherwise a copy of ``row``
    with the cleaned value stored under ``column``.
    """
    value = row[column]
    # Non-string cells (e.g. codes already parsed as numbers) cannot hold a
    # comma-separated list; `',' in value` would raise TypeError on them.
    if not isinstance(value, str) or ',' not in value:
        return row

    second_code = value.strip('\'').split(',')[1]
    # Individually quoted or space-separated entries ("'1234', '5678'") would
    # otherwise leave a stray quote or blank on the selected code.
    cleaned = row.copy()  # do not mutate the object handed over by apply()
    cleaned[column] = second_code.strip().strip('\'')
    return cleaned

In [72]:
if TEST_FILE == 0:
    # Collapse comma-separated double codes in the 'input' column of both
    # comparison frames before scoring.
    NOCv5 = NOCv5.apply(clean_noc_double_codes, axis=1, args=('input',))
    NOCv6 = NOCv6.apply(clean_noc_double_codes, axis=1, args=('input',))

    # 'input' is passed first and 'code' second to both metrics.
    # NOTE(review): 'input' was loaded from 'NOC code by PATH' and appears to
    # be the reference labelling here - confirm.
    v5_input_codes = NOCv5['input'].astype(int)
    v5_version_codes = NOCv5['code'].astype(int)
    v5_accuracy = accuracy_score(v5_input_codes, v5_version_codes)
    v5_macro_f1 = f1_score(v5_input_codes, v5_version_codes, average='macro')
    print('NOCv5 Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(v5_accuracy, v5_macro_f1))

    v6_input_codes = NOCv6['input'].astype(int)
    v6_version_codes = NOCv6['code'].astype(int)
    v6_accuracy = accuracy_score(v6_input_codes, v6_version_codes)
    v6_macro_f1 = f1_score(v6_input_codes, v6_version_codes, average='macro')
    print('NOCv6 Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(v6_accuracy, v6_macro_f1))

In [73]:
# Show the result table without the bulky per-row 'vectors' column.
# drop() is not called in place, so tfidf_test_df itself keeps the column.
tfidf_test_df.drop(columns =['vectors'])

Unnamed: 0,svm_pred,rf_pred,knn_pred,code,exact_match,p_all,svm_pred_234,rf_pred_234,knn_pred_234,p_all_234,Hybrid_pred_exact_match
21471,6,9,6,6733,-1,6,6733,6421,6733,6733,6733
24101,1,1,1,1243,-1,1,1243,1243,1243,1243,1243
2441,0,0,0,122,-1,0,12,213,601,601,601
21706,0,0,0,423,-1,0,631,821,821,821,821
27578,7,7,7,7284,-1,7,7284,7284,7284,7284,7284
...,...,...,...,...,...,...,...,...,...,...,...
1983,6,6,6,6523,-1,6,6524,6421,6524,6524,6524
5007,0,0,0,114,-1,0,811,114,114,114,114
1528,7,7,7,7301,-1,7,7201,7301,7301,7301,7301
17335,0,0,0,421,-1,0,311,311,212,311,311
