In [1]:
import pickle
import pandas as pd
from scripts.Embedder import Embedder, tfidfEmbedder
from scripts.OccupationPreprocessor import OccupationPreprocessor
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gradlab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the pre-trained classifier collections from disk.
# clf1 is indexed by model name ('SVM', 'RF', 'KNN') further down; clf2 is indexed
# first by the stage-one prediction and then by model name.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open('first_dig_tfidf_clfs.pkl', 'rb') as f:
    clf1 = pickle.load(f)
    
with open('second_third_fourth_dig_tfidf_clfs.pkl', 'rb') as f2:
    clf2 = pickle.load(f2)

In [3]:
# Selects which evaluation set the loading cell reads:
#   0 -> './Data/ATP_gold_standard.xlsx' (also loads the V5_NOC / V6_NOC columns)
#   1 -> './Data/tfidf_test_set.csv'
#   2 -> 'overlap_test_set_v4_acanoc_with_train_data.csv'
#   3 -> 'overlap_test_set_v4_acanoc_no_train_data.csv'
TEST_FILE = 3

In [4]:
# Load the evaluation set selected by TEST_FILE into `tfidf_test_set`.
# For TEST_FILE == 0 two extra frames (NOCv5, NOCv6) are prepared as well; in
# those, 'input' holds the 'NOC code by PATH' column and 'code' holds the
# V5_NOC / V6_NOC column respectively.
if TEST_FILE == 0:
    tfidf_test_set = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='CURRENT_JOB_TITLE',
        code_column='NOC code by PATH',
        n_digits=4
    )

    NOCv5 = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='NOC code by PATH',
        code_column='V5_NOC',
        n_digits=4
    )

    NOCv6 = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='NOC code by PATH',
        code_column='V6_NOC',
        n_digits=4
    )

elif TEST_FILE == 1:
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    tfidf_test_set = pd.read_csv('./Data/tfidf_test_set.csv')

# A-B test set with train data
elif TEST_FILE == 2:
    tfidf_test_set = pd.read_csv('overlap_test_set_v4_acanoc_with_train_data.csv')

# A-B test set no train data
elif TEST_FILE == 3:
    tfidf_test_set = pd.read_csv('overlap_test_set_v4_acanoc_no_train_data.csv')

else:
    # Fail here with a clear message instead of leaving `tfidf_test_set`
    # undefined and hitting a NameError in a later cell.
    raise ValueError(
        'Unsupported TEST_FILE value: {!r} (expected 0, 1, 2 or 3)'.format(TEST_FILE)
    )

tfidf_train_set = pd.read_csv('./Data/tfidf_train_set.csv')

In [5]:
# Instantiate the project's TF-IDF embedder. Per the cell output below, construction
# loads a fitted model from ./vectorizer.joblib.
embedder = tfidfEmbedder()

Model loaded from ./vectorizer.joblib


In [6]:
# Evaluate on a fixed-size, reproducible random subset of the test set.
# The same size is used for every TEST_FILE value. The request is capped at the
# number of available rows because DataFrame.sample raises ValueError when n
# exceeds the population size (sampling is without replacement by default).
# Duplicate rows are not removed from the sample.
sample_size = min(500, len(tfidf_test_set))
sample_pipeline_df = tfidf_test_set.sample(sample_size, random_state=123)

### Metrics for alternative classifiers

In [7]:
from sklearn.metrics import accuracy_score, f1_score
def clean_noc_double_codes(row, column):
    """Collapse a double-coded cell such as ``'1234,5678'`` to a single code.

    When ``row[column]`` is a string containing a comma, it is replaced by the
    second comma-separated entry with surrounding whitespace and single quotes
    removed. Any other value (a single code, a number, NaN/None) is left as is.

    Parameters
    ----------
    row : mapping-like
        A pandas Series when used through ``DataFrame.apply(..., axis=1)``;
        any object supporting item access and ``.copy()`` works.
    column : hashable
        Label of the cell to clean.

    Returns
    -------
    The input ``row`` itself when nothing needs cleaning, otherwise a copy of
    it with ``row[column]`` replaced. The caller's object is never mutated.
    """
    value = row[column]
    # Non-string cells (ints, NaN) cannot hold a double code, and a membership
    # test on them would raise TypeError.
    if not isinstance(value, str) or ',' not in value:
        return row
    cleaned = row.copy()
    # Keep the second code; strip stray whitespace and quoting around it.
    cleaned[column] = value.split(',')[1].strip().strip("'").strip()
    return cleaned

# Baseline scores to compare the TF-IDF pipeline against; which ones are
# available depends on the selected test file.
if TEST_FILE == 0:
    # Reduce double-coded cells in the 'input' column to a single code so the
    # column can be cast to int.
    NOCv5 = NOCv5.apply(clean_noc_double_codes, axis=1, args=('input',))
    NOCv6 = NOCv6.apply(clean_noc_double_codes, axis=1, args=('input',))

    v5_true, v5_pred = NOCv5['input'].astype(int), NOCv5['code'].astype(int)
    print('NOCv5 Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
        accuracy_score(v5_true, v5_pred),
        f1_score(v5_true, v5_pred, average='macro'),
    ))

    v6_true, v6_pred = NOCv6['input'].astype(int), NOCv6['code'].astype(int)
    print('NOCv6 Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
        accuracy_score(v6_true, v6_pred),
        f1_score(v6_true, v6_pred, average='macro'),
    ))
elif TEST_FILE in (2, 3):
    # The A-B test files carry a 'v4_pred' column; its accuracy on the sample
    # is the number the new pipeline has to beat.
    baseline_accuracy = accuracy_score(
        sample_pipeline_df['code'],
        sample_pipeline_df['v4_pred'],
    )
    print("Accuracy to Beat:", baseline_accuracy)

Accuracy to Beat: 0.414


### Check exact matches

In [8]:
# Embed the sampled 'input' column with the TF-IDF embedder. The result is later
# converted with `.toarray()`, so it is presumably a SciPy sparse matrix.
tfidf_test_vectors = embedder.embed(sample_pipeline_df['input'])

In [9]:
# Stage one: each classifier in clf1 predicts on the sample's TF-IDF vectors.
# 'code' carries the reference label as a string; because it is a Series, the
# resulting frame takes over sample_pipeline_df's index.
tfidf_test_df = pd.DataFrame({
    'svm_pred':clf1['SVM'].predict(tfidf_test_vectors),
    'rf_pred':clf1['RF'].predict(tfidf_test_vectors),
    'knn_pred':clf1['KNN'].predict(tfidf_test_vectors),
    'code':sample_pipeline_df['code'].astype(str)
})

In [10]:
# Look up every sampled input in the embedder's training database. A value of -1
# is treated as "no exact match" by the hybrid-prediction cell further down.
tfidf_test_df['exact_match'] = sample_pipeline_df['input'].apply(embedder.check_exact_match, args=(embedder.train_database,))

In [11]:
# Combine the three stage-one predictions into one value per row ('p_all'),
# which later selects the stage-two classifier group in clf2.
# NOTE(review): the second argument ('svm_pred') is presumably the column used
# when the models disagree — confirm in tfidfEmbedder.ensemble_predict.
tfidf_test_df['p_all'] = tfidf_test_df.apply(tfidfEmbedder.ensemble_predict, axis = 1, args = (
    ['rf_pred','svm_pred','knn_pred'], 'svm_pred',
))

In [12]:
# Store each row's TF-IDF vector as a plain Python list so the row-wise
# `pipeline` function can read it from the row. `.toarray()` materialises the
# full dense matrix, so memory grows with sample size times vocabulary size.
tfidf_test_df['vectors'] = tfidf_test_vectors.toarray().tolist()

In [13]:
# put in pipeline.py
def pipeline(row):
    """Run the stage-two classifiers for a single row.

    Reads ``row['vectors']`` (the row's feature vector) and ``row['p_all']``
    (the stage-one prediction), picks the classifier group ``clf2[p_all]`` and
    writes one prediction per model into ``svm_pred_234``, ``rf_pred_234`` and
    ``knn_pred_234``. The same ``row`` object is modified and returned.
    """
    # Each model's predict() is called on a 2-D array holding one sample.
    features = np.array(row['vectors']).reshape(1, -1)
    stage_one = row['p_all']
    targets = (
        ('svm_pred_234', 'SVM'),
        ('rf_pred_234', 'RF'),
        ('knn_pred_234', 'KNN'),
    )
    for out_column, model_key in targets:
        row[out_column] = clf2[stage_one][model_key].predict(features)[0]
    return row

In [14]:
import time

# Run stage two row by row and report the wall-clock cost.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
n_samples = sample_pipeline_df.shape[0]
start_time = time.perf_counter()
tfidf_test_df = tfidf_test_df.apply(pipeline, axis=1)
elapsed = time.perf_counter() - start_time
print('time to predict on {0} samples: {1:.2f} seconds ({2:.3f} s/sample)'.format(
    n_samples, elapsed, elapsed / n_samples
))

time to predict on 500 samples: 117.09 seconds (0.234 s/sample)


In [15]:
# Combine the three stage-two predictions into one final code per row.
# NOTE(review): the second argument ('knn_pred_234') is presumably the column
# used when the models disagree — confirm in tfidfEmbedder.ensemble_predict.
# Stage one passes 'svm_pred' in that position instead.
tfidf_test_df['p_all_234'] = tfidf_test_df.apply(tfidfEmbedder.ensemble_predict, axis = 1, args = (
    ['svm_pred_234','rf_pred_234','knn_pred_234'], 'knn_pred_234',
))

### Performance without exact match

In [16]:
from sklearn.metrics import accuracy_score, f1_score

# Score each stage-two classifier and the ensemble against the reference codes.
# scikit-learn metrics are declared as metric(y_true, y_pred); the reference
# labels go first, matching the baseline cell. Accuracy and macro-F1 give the
# same numbers either way, so the reported values do not change.
y_true = tfidf_test_df['code'].astype(int)

for classifier in ['knn', 'svm', 'rf']:
    y_pred = tfidf_test_df['{}_pred_234'.format(classifier)].astype(int)
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    ))

ensemble_pred = tfidf_test_df['p_all_234'].astype(int)
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(y_true, ensemble_pred),
    f1_score(y_true, ensemble_pred, average='macro'),
))

KNN acc:0.412, f1-macro:0.28981338064635614
SVM acc:0.452, f1-macro:0.3476788298262049
RF acc:0.426, f1-macro:0.32403924624020314
Ensemble metrics total. Accuracy:0.454, f1:0.3399103944323261



### With exact match added

In [17]:
# Hybrid prediction: use the exact-match code when there is one, otherwise fall
# back to the stage-two ensemble. An 'exact_match' of -1 means no match.
tfidf_test_df['Hybrid_pred_exact_match'] = tfidf_test_df.apply(
    lambda row: row['p_all_234'] if row['exact_match'] == -1 else row['exact_match'],
    axis=1,
)

# scikit-learn metrics are declared as metric(y_true, y_pred); the reference
# labels go first. Accuracy and macro-F1 give the same numbers either way.
hybrid_true = tfidf_test_df['code'].astype(int)
hybrid_pred = tfidf_test_df['Hybrid_pred_exact_match'].astype(int)
print('Ensemble metrics after checking for exact match total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(hybrid_true, hybrid_pred),
    f1_score(hybrid_true, hybrid_pred, average='macro'),
))

Ensemble metrics after checking for exact match total. Accuracy:0.456, f1:0.3402470407543739



In [18]:
# Display the results table without the bulky per-row vector lists. drop()
# returns a new frame here; tfidf_test_df itself still holds 'vectors'.
tfidf_test_df.drop(columns =['vectors'])

Unnamed: 0,svm_pred,rf_pred,knn_pred,code,exact_match,p_all,svm_pred_234,rf_pred_234,knn_pred_234,p_all_234,Hybrid_pred_exact_match
8060,2,2,7,7384,-1,2,2242,2242,2242,2242,2242
661,0,0,0,822,-1,0,632,632,822,632,632
4593,8,9,8,8252,-1,8,8252,8252,8252,8252,8252
2932,4,4,4,4156,-1,4,4156,4163,4156,4156,4156
2019,2,2,0,2282,-1,2,2282,2282,2282,2282,2282
...,...,...,...,...,...,...,...,...,...,...,...
9872,4,1,0,1121,-1,4,4166,4161,4031,4031,4031
3160,4,4,4,4311,-1,4,4311,4311,4311,4311,4311
4860,1,1,1,1222,-1,1,1241,1222,1242,1242,1242
3697,5,9,6,6331,-1,5,5232,5244,5121,5121,5121
