In [None]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

from scripts.TextPreprocessor import TextPreprocessor
from scripts.OccupationPreprocessor import OccupationPreprocessor
from scripts.TrainEngine import TrainEngine
from scripts.Embedder import Embedder

In [None]:
# Read the NOC table and run the occupation preprocessor over every row.
# The training frame is then built from OccupationPreprocessor.all_job_samples
# (presumably filled in by extract_job_samples -- confirm in scripts/OccupationPreprocessor).
noc_csv_path = './data/noc_data_get_byws_dealing_slash.csv'
df = pd.read_csv(noc_csv_path).apply(OccupationPreprocessor.extract_job_samples, axis=1)

# One (input, code) pair per collected job sample.
job_sample_pairs = dict(OccupationPreprocessor.all_job_samples).items()
train_df = pd.DataFrame(job_sample_pairs, columns=['input', 'code'])

In [None]:
# Load the ATP job-title spreadsheet through the shared preprocessor.
# NOTE(review): this path uses './Data' while the NOC csv is read from './data';
# on a case-sensitive filesystem only one of the two can exist -- confirm the folder name.
ATP_data = OccupationPreprocessor.prepare_df(
    './Data/V5_Run Input(1).xlsx',
    input_column='Current Job Title',
    code_column='NOC code ',  # the trailing space is part of the column name as written
    n_digits=4,
)


# Shuffle ATP and split into train/test sections

In [None]:
# Number of shuffled ATP rows that go into training; the rest are held out for testing.
ATP_train_size = 8000

# Shuffle every row with a fixed seed so the split is repeatable.
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Positional split of the shuffled frame at ATP_train_size.
ATP_data_train_set = shuffled_ATP_df[:ATP_train_size]
ATP_data_test_set = shuffled_ATP_df[ATP_train_size:]

# Combine both train sets

In [None]:
# Extend the NOC-derived samples with the ATP training split.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and, like append, keeps each frame's
# original index labels and does not sort columns.
# NOTE: this cell is not idempotent -- re-running it adds the ATP rows again,
# so rebuild train_df first if the cell needs to be re-executed.
train_df = pd.concat([train_df, ATP_data_train_set])

In [None]:
# Raw text for TF-IDF (per the original note, TF-IDF lowercases regardless).
uncleaned_train_df = train_df.copy()

# Preprocessed text for doc2vec: same frame, with 'input' run through the text preprocessor.
cleaned_train_df = train_df.assign(
    input=train_df['input'].apply(TextPreprocessor.preprocess_text)
)

In [None]:
TRIAL_NAME = 'trial_11'

# doc2vec hyper-parameters handed to the Embedder.
doc2vec_params = {
    'epochs': 6144,        # training cycles
    'vec_size': 64,        # specific to doc2vec, size of the output vector
    'alpha': 0.001,        # learning rate
    'window': 3,
    'min_count': 2,
    'min_alpha': 0.00025,
}

# The embedder is built on the uncleaned training text, using the 'input' column as corpus.
embedder = Embedder(
    d2v_trial_name=TRIAL_NAME,
    d2v_params=doc2vec_params,
    train_data=uncleaned_train_df,
    corpus_column='input',
    infer_params=dict(steps=2048, alpha=0.03),
)

In [None]:
# Fit the embedder's TF-IDF model (presumably on the train_data / corpus_column
# given to the constructor -- confirm in scripts/Embedder).
embedder.train_tfidf()

# Apply embeddings to training data

## Embed X_train

In [None]:
# Number of leading code digits used as the classification target; it is passed to
# OccupationPreprocessor.first_n_digits when y_train and y_test are built.
TARGET_CODE_LENGTH = 1

In [None]:
# TF-IDF embeddings of the raw (uncleaned) training titles; these become X_train.
tfidf_train_vectors = Embedder.get_tfidf_embeddings(embedder, uncleaned_train_df['input'])

## Get first n digits of y_train

In [None]:
# Features are the TF-IDF vectors computed for the training titles.
X_train = tfidf_train_vectors

# Labels: each training code passed through first_n_digits with TARGET_CODE_LENGTH.
train_code_prefixes = train_df['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,)
)
y_train = np.array(train_code_prefixes)

In [None]:
# Leading-digit distribution of the NOC-derived rows only.
# train_df is the NOC samples followed by the ATP training split, so the NOC labels are
# everything before the last len(ATP_data_train_set) entries of y_train. Using the real
# split length instead of a hard-coded 8000 keeps the slice correct when ATP_train_size
# changes or the ATP file holds fewer rows than requested.
n_noc_rows = len(y_train) - len(ATP_data_train_set)
noc_digit_counts = pd.Series(y_train[:n_noc_rows]).value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=noc_digit_counts.index, y=noc_digit_counts))
fig.update_layout(
    title='NOC data: frequency bar plot of each leading digit',
    yaxis_title='Count',
    xaxis_title='First Digit',
)
# Show a tick for every class present in the full training labels so all plots share an axis.
fig.update_xaxes(tickvals=sorted(pd.Series(y_train).unique()))
fig.show()

In [None]:
# Leading-digit distribution of the ATP training rows only.
# The ATP split was appended after the NOC samples, so its labels are the last
# len(ATP_data_train_set) entries of y_train. Using the real split length instead of a
# hard-coded 8000 keeps the slice correct when ATP_train_size changes or the ATP file
# holds fewer rows than requested.
atp_start = len(y_train) - len(ATP_data_train_set)
atp_digit_counts = pd.Series(y_train[atp_start:]).value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=atp_digit_counts.index, y=atp_digit_counts))
fig.update_layout(
    title='ATP data: frequency bar plot of each leading digit',
    yaxis_title='Count',
    xaxis_title='First Digit',
)
# Show a tick for every class present in the full training labels so all plots share an axis.
fig.update_xaxes(tickvals=sorted(pd.Series(y_train).unique()))
fig.show()

In [None]:
# Leading-digit distribution over the complete training label set.
all_digit_counts = pd.Series(y_train).value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=all_digit_counts.index, y=all_digit_counts))
fig.update_layout(
    title='All Train Data: frequency bar plot of each leading digit',
    yaxis_title='Count',
    xaxis_title='First Digit',
)
# One tick per class present in the training labels.
fig.update_xaxes(tickvals=sorted(pd.Series(y_train).unique()))
fig.show()

## Embed X_test

In [None]:
# TF-IDF embeddings of the held-out ATP titles; these become X_test.
tfidf_test_vectors = Embedder.get_tfidf_embeddings(embedder, ATP_data_test_set['input'])

## Get first n digits of y_test

In [None]:
# Features are the TF-IDF vectors computed for the held-out ATP titles.
X_test = tfidf_test_vectors

# Labels: each held-out code passed through first_n_digits with TARGET_CODE_LENGTH.
test_code_prefixes = ATP_data_test_set['code'].apply(
    OccupationPreprocessor.first_n_digits, args=(TARGET_CODE_LENGTH,)
)
y_test = np.array(test_code_prefixes)

# Build preliminary classifiers

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
import time

# Train classifiers on the TF-IDF embeddings

In [None]:
# Linear-kernel SVM with class weights balanced against label frequency.
SVM = SVC(kernel='linear', class_weight='balanced')

# Time the fit on the TF-IDF training matrix.
start = time.time()
SVM.fit(X_train, y_train)
print(f'SVM training duration: {time.time() - start} seconds')

In [None]:
# Random forest on the TF-IDF features.
# - random_state pins the bootstrap/feature sampling so metrics are reproducible across
#   runs (same seed value the ATP shuffle uses).
# - warm_start is dropped: it only matters when fit() is called repeatedly on the same
#   estimator, and with an unchanged n_estimators such a re-fit adds no trees. This cell
#   builds a fresh forest and fits it once, so a cold start is what is wanted.
RF = RandomForestClassifier(n_estimators=256, max_depth=128, n_jobs=-1, random_state=42)

# Time the fit on the TF-IDF training matrix.
start = time.time()
RF.fit(X_train, y_train)
print('RF training duration: {} seconds'.format(time.time()-start))

In [None]:
# 4-nearest-neighbours classifier using all available cores.
KNN = KNeighborsClassifier(n_jobs=-1, n_neighbors=4)

# Time the fit on the TF-IDF training matrix.
start = time.time()
KNN.fit(X_train, y_train)
print(f'KNN training duration: {time.time() - start} seconds')

In [None]:
# with open('classifiers')

# Predict on the TF-IDF test embeddings

In [None]:
# Time SVM inference over the held-out TF-IDF vectors.
start = time.time()
svm_pred = SVM.predict(X_test)
print(f'SVM prediction duration on {X_test.shape[0]} samples: {time.time() - start} seconds')

In [None]:
# Time random-forest inference over the held-out TF-IDF vectors.
start = time.time()
rf_pred = RF.predict(X_test)
print(f'RF prediction duration on {X_test.shape[0]} samples: {time.time() - start} seconds')

In [None]:
# Time KNN inference over the held-out TF-IDF vectors.
start = time.time()
knn_pred = KNN.predict(X_test)
print(f'KNN prediction duration on {X_test.shape[0]} samples: {time.time() - start} seconds')

# Get Metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# One row per held-out sample: each classifier's prediction next to the true label.
tfidf_test_df = pd.DataFrame({
    'svm_pred': svm_pred,
    'rf_pred': rf_pred,
    'knn_pred': knn_pred,
    'code': y_test
})

# Accuracy and macro-F1 for every individual classifier.
# scikit-learn metrics take (y_true, y_pred) in that order; the ground truth goes first
# so that any "ill-defined" warnings refer to the correct side.
for classifier in ['knn', 'svm', 'rf']:
    predictions = tfidf_test_df['{}_pred'.format(classifier)]
    print('{} acc:{}, f1-macro:{}'.format(
        classifier.upper(),
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='macro'),
    ))

In [None]:
from collections import Counter


def ensemble_predict(row, predictor_cols, default_predictor):
    """Return the majority-vote class for one row of per-classifier predictions.

    Parameters
    ----------
    row : pandas.Series
        One row of the prediction frame, indexable by the column names below.
    predictor_cols : list of str
        Columns holding the individual classifiers' predictions (the voters).
    default_predictor : str
        Column whose prediction is used as tie-breaker when the vote is not decisive.

    Returns
    -------
    The class with the most votes, or ``row[default_predictor]`` when no class
    gathers at least two votes or when two classes share the top vote count.
    """
    # The two most common classes are enough to tell a clear winner from a tie.
    ranked = Counter(row[predictor_cols]).most_common(2)
    if not ranked:
        # No voters at all: only the default predictor can answer.
        return row[default_predictor]

    winning_class, highest_num_votes = ranked[0]
    tied_for_first = len(ranked) > 1 and ranked[1][1] == highest_num_votes

    # Previously an unconditional early return made this tie-break unreachable, so a
    # full disagreement silently resolved to whichever predictor column came first.
    if highest_num_votes < 2 or tied_for_first:
        return row[default_predictor]
    return winning_class

In [None]:
# Row-wise ensemble vote over the three classifiers, with the SVM column as the default predictor.
tfidf_test_df['p_all'] = tfidf_test_df.apply(
    ensemble_predict,
    axis=1,
    predictor_cols=['rf_pred', 'svm_pred', 'knn_pred'],
    default_predictor='svm_pred',
)

In [None]:
# Preview the first 10 rows of ensemble prediction vs. ground truth.
# The earlier `.iloc[:, :10]` sliced the first ten *columns* (i.e. all of them), so the
# cell displayed the entire frame instead of a short sample.
tfidf_test_df[['p_all', 'code']].head(10)

In [None]:
# Overall ensemble scores. scikit-learn metrics take (y_true, y_pred) in that order.
print('Ensemble metrics total. Accuracy:{}, f1:{}\n'.format(
    accuracy_score(y_test, tfidf_test_df['p_all']),
    f1_score(y_test, tfidf_test_df['p_all'], average='macro'),
))

# Per-class breakdown: accuracy restricted to the samples whose true label is `target`
# (i.e. the share of that class the ensemble got right).
for target in sorted(pd.Series(y_test).unique()):
    is_target = y_test == target  # boolean mask over the test samples
    print("---- Target =", target)
    print('Accuracy:{:.2f}'.format(
        accuracy_score(
            pd.Series(y_test).loc[is_target],
            tfidf_test_df['p_all'].loc[is_target],
        )
    ))
          

In [None]:
# Balancing the train data