In [None]:
import pickle
import pandas as pd
from scripts.Embedder import Embedder, tfidfEmbedder
from scripts.OccupationPreprocessor import OccupationPreprocessor
import numpy as np

In [None]:
# Load the pre-trained classifier bundles from disk.
# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
# clf1 is indexed further down as clf1['SVM'] / clf1['RF'] / clf1['KNN'].
with open('first_dig_tfidf_clfs.pkl', 'rb') as f:
    clf1 = pickle.load(f)
    
# clf2 is indexed further down as clf2[<first-stage prediction>]['SVM' | 'RF' | 'KNN'].
with open('second_third_fourth_dig_tfidf_clfs.pkl', 'rb') as f2:
    clf2 = pickle.load(f2)

In [None]:
# Selects which evaluation data the loading cell reads:
#   0 -> ./Data/ATP_gold_standard.xlsx (also loads the V5_NOC / V6_NOC comparison frames)
#   1 -> ./Data/tfidf_test_set.csv
#   2 -> overlap_test_set_v4_acanoc_with_train_data.csv (A-B test set with train data)
#   3 -> overlap_test_set_v4_acanoc_no_train_data.csv (A-B test set, no train data)
TEST_FILE = 3

In [None]:
# Load the evaluation set selected by TEST_FILE, then the TF-IDF training set.
if TEST_FILE == 0:
    # Gold-standard workbook: job titles paired with the 'NOC code by PATH' column.
    tfidf_test_set = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='CURRENT_JOB_TITLE',
        code_column='NOC code by PATH',
        n_digits=4
    )

    # Comparison frames: 'NOC code by PATH' is loaded as the input column and the
    # V5_NOC / V6_NOC columns as the code column, so the metrics cell can score
    # one against the other.
    NOCv5 = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='NOC code by PATH',
        code_column='V5_NOC',
        n_digits=4
    )

    NOCv6 = OccupationPreprocessor.prepare_df(
        './Data/ATP_gold_standard.xlsx',
        input_column='NOC code by PATH',
        code_column='V6_NOC',
        n_digits=4
    )

elif TEST_FILE == 1:
    # read_csv already returns a DataFrame; no extra wrapping is needed.
    tfidf_test_set = pd.read_csv('./Data/tfidf_test_set.csv')

# A-B test set with train data
elif TEST_FILE == 2:
    tfidf_test_set = pd.read_csv('overlap_test_set_v4_acanoc_with_train_data.csv')

# A-B test set no train data
elif TEST_FILE == 3:
    tfidf_test_set = pd.read_csv('overlap_test_set_v4_acanoc_no_train_data.csv')

else:
    # Fail here rather than letting a later cell hit a NameError, or silently
    # reuse a tfidf_test_set left in the kernel by an earlier run.
    raise ValueError(
        'Unsupported TEST_FILE value {!r}; expected 0, 1, 2 or 3'.format(TEST_FILE)
    )

tfidf_train_set = pd.read_csv('./Data/tfidf_train_set.csv')

In [None]:
# TF-IDF embedder used by the cells below: embed() for vectorising inputs,
# check_exact_match() together with its train_database attribute for exact lookups.
# NOTE(review): constructed with no arguments — presumably it loads its fitted
# vectoriser internally; confirm in scripts/Embedder.
embedder = tfidfEmbedder()

In [None]:
# Evaluate on a fixed-seed random subset so repeated runs score the same rows.
# The previous expression `500 if TEST_FILE == 0 else 500` picked 500 on both
# branches, so the conditional is dropped. The size is capped at the number of
# available rows because DataFrame.sample (without replacement) raises
# ValueError when asked for more rows than the frame holds.
# Duplicate rows are kept; a trailing `.drop_duplicates()` call was previously
# commented out on the sampling line.
sample_size = min(500, len(tfidf_test_set))
sample_pipeline_df = tfidf_test_set.sample(sample_size, random_state=123)

### Metrics for alternative classifiers

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def clean_noc_double_codes(row, column):
    """Collapse a comma-separated list of codes in ``row[column]`` to one code.

    When the value is a string containing a comma, surrounding single quotes
    are stripped, the string is split on commas and the second element is
    kept. Any other value (a string without a comma, or a non-string such as
    an int or NaN) is left untouched.

    Args:
        row: Mapping-like record, e.g. the row handed over by
            ``DataFrame.apply(..., axis=1)``. It is modified in place.
        column: Key of the field to clean.

    Returns:
        The same ``row`` object that was passed in.
    """
    value = row[column]
    # Only strings can carry a comma-separated pair. The bare membership test
    # used previously raised TypeError for numeric or missing (NaN) cells.
    if isinstance(value, str) and ',' in value:
        row[column] = value.strip('\'').split(',')[1]
    return row

# Baseline scores that the TF-IDF pipeline is compared against.
if TEST_FILE == 0:
    # Reduce multi-code entries in the 'input' column to a single code so the
    # integer casts below succeed.
    NOCv5 = NOCv5.apply(clean_noc_double_codes, axis=1, args=('input',))
    NOCv6 = NOCv6.apply(clean_noc_double_codes, axis=1, args=('input',))

    for report_template, coded_df in (
        ('NOCv5 Ensemble metrics total. Accuracy:{}, f1:{}\n', NOCv5),
        ('NOCv6 Ensemble metrics total. Accuracy:{}, f1:{}\n', NOCv6),
    ):
        input_codes = coded_df['input'].astype(int)
        version_codes = coded_df['code'].astype(int)
        print(report_template.format(
            accuracy_score(input_codes, version_codes),
            f1_score(input_codes, version_codes, average='macro'),
        ))
elif TEST_FILE in (2, 3):
    # The A-B test sets carry a 'v4_pred' column; its accuracy is the bar to clear.
    baseline_accuracy = accuracy_score(
        sample_pipeline_df['code'],
        sample_pipeline_df['v4_pred'],
    )
    print("Accuracy to Beat:", baseline_accuracy)

### Check exact matches

In [None]:
# Embed the sampled inputs with the TF-IDF embedder; the result is passed to
# every first-stage classifier's predict() and later stored per row as 'vectors'.
tfidf_test_vectors = embedder.embed(sample_pipeline_df['input'])

In [None]:
# First-stage predictions from each model in the clf1 bundle, computed up front
# and then collected next to the reference code for every sampled row.
svm_stage1 = clf1['SVM'].predict(tfidf_test_vectors)
rf_stage1 = clf1['RF'].predict(tfidf_test_vectors)
knn_stage1 = clf1['KNN'].predict(tfidf_test_vectors)
reference_codes = sample_pipeline_df['code'].astype(str)

tfidf_test_df = pd.DataFrame({
    'svm_pred': svm_stage1,
    'rf_pred': rf_stage1,
    'knn_pred': knn_stage1,
    'code': reference_codes,
})

In [None]:
# Run embedder.check_exact_match on every sampled input, passing the embedder's
# train_database as the extra argument — presumably a lookup of the raw title
# against the training data; confirm in scripts/Embedder.
# The hybrid-prediction cell further down treats a value of -1 in this column
# as "no exact match found".
tfidf_test_df['exact_match'] = sample_pipeline_df['input'].apply(embedder.check_exact_match, args=(embedder.train_database,))

In [None]:
# Combine the three first-stage predictions into a single label per row.
# NOTE(review): the second argument ('svm_pred') is presumably the column used
# when the models disagree — confirm against tfidfEmbedder.ensemble_predict.
first_stage_columns = ['rf_pred', 'svm_pred', 'knn_pred']
tfidf_test_df['p_all'] = tfidf_test_df.apply(
    tfidfEmbedder.ensemble_predict,
    axis=1,
    args=(first_stage_columns, 'svm_pred'),
)

In [None]:
# Store each row's TF-IDF vector as a plain Python list so the row-wise
# `pipeline` function can read it back from row['vectors'].
# NOTE(review): .toarray() materialises the whole matrix in memory — presumably
# a sparse-to-dense conversion; keep an eye on this for large samples.
tfidf_test_df['vectors'] = tfidf_test_vectors.toarray().tolist()

In [None]:
# put in pipeline.py
def pipeline(row):
    """Add the second-stage predictions of all three models to one row.

    The first-stage ensemble label in ``row['p_all']`` selects the matching
    group of classifiers from the global ``clf2`` bundle; each of them then
    predicts on the row's vector, reshaped to a single-sample 2-D array.

    Args:
        row: Record holding 'vectors' (the feature values of one sample) and
            'p_all' (the first-stage label). It is modified in place.

    Returns:
        The same row with 'svm_pred_234', 'rf_pred_234' and 'knn_pred_234' set.
    """
    features = np.array(row['vectors']).reshape(1, -1)
    stage_two_models = clf2[row['p_all']]
    for target_column, model_key in (
        ('svm_pred_234', 'SVM'),
        ('rf_pred_234', 'RF'),
        ('knn_pred_234', 'KNN'),
    ):
        # predict() returns one value per sample; there is exactly one sample here.
        row[target_column] = stage_two_models[model_key].predict(features)[0]
    return row

In [None]:
import time

# Run the second-stage models row by row and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system time
# is adjusted during the run.
n_samples = sample_pipeline_df.shape[0]
start_time = time.perf_counter()
tfidf_test_df = tfidf_test_df.apply(pipeline, axis = 1)
elapsed = time.perf_counter() - start_time
print('time to predict on {0} samples: {1:.2f} seconds ({2:.3f} s/sample)'.format(
    n_samples,
    elapsed,
    # max(..., 1) keeps the per-sample figure from raising ZeroDivisionError on an empty sample.
    elapsed / max(n_samples, 1),
))

In [None]:
# Combine the three second-stage predictions into the final ensemble code.
# NOTE(review): the second argument is 'knn_pred_234' here, whereas the
# first-stage ensemble passes 'svm_pred' — presumably the column preferred on
# disagreement; confirm against tfidfEmbedder.ensemble_predict.
second_stage_columns = ['svm_pred_234', 'rf_pred_234', 'knn_pred_234']
tfidf_test_df['p_all_234'] = tfidf_test_df.apply(
    tfidfEmbedder.ensemble_predict,
    axis=1,
    args=(second_stage_columns, 'knn_pred_234'),
)

### Performance without exact match

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score every second-stage classifier, then the ensemble, against the reference codes.
# scikit-learn metrics are declared as metric(y_true, y_pred); the reference
# codes are therefore passed first. The calls previously passed the predictions
# first, which would skew any support-weighted average; accuracy and
# macro-averaged F1 should come out the same in either order.
true_codes = tfidf_test_df['code'].astype(int)

for classifier in ['knn', 'svm', 'rf']:
    predicted_codes = tfidf_test_df['{}_pred_234'.format(classifier)].astype(int)
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(true_codes, predicted_codes),
        f1_score(true_codes, predicted_codes, average='macro'),
    ))

ensemble_codes = tfidf_test_df['p_all_234'].astype(int)
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(true_codes, ensemble_codes),
    f1_score(true_codes, ensemble_codes, average='macro'),
))

### With exact match added

In [None]:
# Hybrid prediction: keep the exact-match result when there is one
# (exact_match != -1), otherwise fall back to the second-stage ensemble.
tfidf_test_df['Hybrid_pred_exact_match'] = tfidf_test_df.apply(
    lambda row: row['p_all_234'] if row['exact_match'] == -1 else row['exact_match'],
    axis=1,
)

# scikit-learn metrics are declared as metric(y_true, y_pred), so the reference
# codes go first; the calls previously passed the predictions first.
hybrid_true = tfidf_test_df['code'].astype(int)
hybrid_pred = tfidf_test_df['Hybrid_pred_exact_match'].astype(int)
print('Ensemble metrics after checking for exact match total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(hybrid_true, hybrid_pred),
    f1_score(hybrid_true, hybrid_pred, average='macro'),
))

In [None]:
# Show the results table without the bulky per-row vector lists. The result is
# not assigned, so tfidf_test_df itself keeps its 'vectors' column.
tfidf_test_df.drop(columns =['vectors'])