# Mount Files from Drive

In [1]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# if NOT working in colab
data_dir = './data'

# if working in colab
# data_dir = './'

In [3]:
# Load the three NOC lookup tables and the main NOC occupation table.
# Every path is built from `data_dir` in the same way (no stray './' prefixes), so
# switching between the local and Colab locations only requires changing `data_dir`.
df_skill_type = pd.read_csv(os.path.join(data_dir, 'NOC_skilltype.csv'))
df_major_group = pd.read_csv(os.path.join(data_dir, 'NOC_majorgroup.csv'))
df_minor_group = pd.read_csv(os.path.join(data_dir, 'NOC_minorgroup.csv'))
df = pd.read_csv(os.path.join(data_dir, 'noc_data_get_byws_dealing_slash.csv'))

In [6]:
df.sample(5)

Unnamed: 0,Noc_code,job_title,group_title,lead_statement,main_duties,Emp_req,Exclusion
423,8612,bulb planter-landscaping; campground maintenan...,Landscaping and grounds maintenance labourers;,Landscaping and grounds maintenance labourers ...,Landscaping and grounds maintenance labourers ...,Some secondary school education may be require...,Landscape and horticulture technicians and spe...
109,2131,airport engineer; appraisal engineer; architec...,Civil engineers;,"Civil engineers plan, design, develop and mana...",Civil engineers perform some or all of the fol...,A bachelor's degree in civil engineering or in...,Chemical environmental engineers (in 2134 Che...
149,2254,engineering survey technician; engineering sur...,Land survey technologists and technicians;,Land survey technologists and technicians cond...,This group performs some or all of the followi...,Completion of secondary school is usually requ...,Drafting technologists and technicians (2253);...
448,9414,abrasive mixer-stone products; abrasive produc...,"Concrete, clay and stone forming operators;",This unit group includes workers who cast and ...,The following is a summary of the main duties ...,Some secondary school education may be require...,Concrete finishers (7282);Helpers and labourer...
479,9524,air conditioner coil assembler; air conditioni...,"Assemblers and inspectors, electrical applianc...",Assemblers in this unit group assemble prefabr...,The following is a summary of the main duties ...,Some secondary school education is usually req...,Appliance servicers and repairers (7332);Assem...


In [30]:
# left-pad every NOC code with zeros so that it is at least four characters wide
df['Noc_code'] = df['Noc_code'].map(lambda code: format(code, '0>4'))

In [31]:
# key is abbreviation, value is expanded occupation
abbreviations_map = {}
STRIP_ABBREVIATIONS = True


def handle_single_quotes(text):
    """
    Strip the "s'" and "'s" endings first (they account for most single quotes),
    then delete every single quote that is still left in the text.
    """
    for fragment in ("s'", "'s", "'"):
        text = text.replace(fragment, '')
    return text

def handle_parentheses(text, strip_abbrev):
    """
    Resolve the first parenthesised group in `text`, then strip all remaining parentheses.

    Parentheses seem to fall into two general cases in the VAST majority of instances:
    1. Indicates an abbreviation
    2. Indicates an exception, by using keywords such as "except" or "non"

    Exceptions are removed from the text. For abbreviations, the shorter of (the fragment
    right before the parenthesis, the fragment inside it) is taken as the abbreviation,
    recorded in the module-level `abbreviations_map` (abbreviation -> expansion) and,
    when `strip_abbrev` is true, removed from the text.

    Only the located span is removed (not every occurrence of the same substring), so an
    abbreviation such as "in" no longer eats the "in" inside "intern".

    Args:
        text: string that contains at least one "(".
        strip_abbrev: when true, drop the detected abbreviation from the returned text.

    Returns:
        `text` without any parentheses and without leading/trailing whitespace. If no
        balanced "(...)" pair is found, the parentheses are simply removed.
    """
    # the first ")" closes the group we handle; its "(" is the last one before it
    close_idx = text.find(')')
    open_idx = text.rfind('(', 0, close_idx) if close_idx != -1 else -1

    if open_idx != -1:
        # fragment before the parenthesis starts after the previous "(" (or at position 0)
        before_start = text.rfind('(', 0, open_idx) + 1
        str1 = text[before_start:open_idx].strip()

        # fragment inside the parenthesis
        str2 = text[open_idx + 1:close_idx].strip()

        # text with the whole "(...)" group cut out
        without_group = text[:open_idx] + text[close_idx + 1:]

        # case is only normalized later in the pipeline, so compare case-insensitively
        keyword_text = str2.lower()
        if 'except' in keyword_text or 'non' in keyword_text:
            text = without_group
            # TODO, do something with exceptions

        else:
            # take the shorter string as the abbreviation (ties: the parenthesised one)
            inner_is_abbrev = len(str2) <= len(str1)
            ab, ex = (str2, str1) if inner_is_abbrev else (str1, str2)

            # save abbreviation, but never pollute the map with empty keys/values
            if ab and ex:
                abbreviations_map[ab] = ex

            # remove the found abbreviation from job title, at its own position only
            if strip_abbrev:
                if inner_is_abbrev:
                    text = without_group
                else:
                    text = text[:before_start] + text[open_idx:]

    # remove parentheses, leading and trailing whitespace
    text = text.replace('(', '').replace(')', '').strip()

    return text

def preprocess_text(text, strip_abbrev=False):
    """
    Normalize a raw job title / description string.

    Steps, in order: slashes become spaces, leading/trailing semicolons are stripped,
    hyphens become spaces, single quotes are resolved by `handle_single_quotes`, commas
    and periods are deleted, parentheses are resolved by `handle_parentheses`, and the
    result is trimmed and lower-cased.

    `strip_abbrev` is forwarded to `handle_parentheses`.
    """
    # slashes -> spaces, drop redundant semi-colons at the ends, hyphens -> spaces
    cleaned = text.replace("/", ' ').strip(';').replace('-', ' ')

    if "'" in cleaned:
        cleaned = handle_single_quotes(cleaned)

    # commas and periods carry no meaning here
    for punctuation in (",", "."):
        cleaned = cleaned.replace(punctuation, '')

    # one check is enough: every "(" was verified to have a matching ")"
    if "(" in cleaned:
        cleaned = handle_parentheses(cleaned, strip_abbrev=strip_abbrev)

    # trim, then normalize case
    return cleaned.strip().lower()

In [32]:
all_job_samples = {}

# (pattern, first variant, second variant) for gendered "x/y" spellings that are turned
# into two separate samples, e.g. 'chairman/woman' -> 'chairman' and 'chairwoman'
_GENDERED_PAIRS = (
    ('man/woman', 'man', 'woman'),
    ('men/women', 'men', 'women'),
    ('boy/girl', 'boy', 'girl'),
    ('master/mistress', 'master', 'mistress'),
    ('host/hostess', 'host', 'hostess'),
    ('waiter/waitress', 'waiter', 'waitress'),
)


def _expand_gendered_variants(job):
    """Return every spelling of `job` obtained by resolving all gendered 'x/y' patterns."""
    variants = [job]
    for pattern, first, second in _GENDERED_PAIRS:
        expanded = []
        for variant in variants:
            if pattern in variant:
                expanded.append(variant.replace(pattern, first))
                expanded.append(variant.replace(pattern, second))
            else:
                expanded.append(variant)
        variants = expanded
    return variants


def extract_job_samples(row):
    """
    Split a row's ';'-separated 'job_title' field into individual job samples.

    Each title is stripped of surrounding whitespace, empty entries are dropped, gendered
    entries such as 'chairman/woman' are expanded into separate samples, and duplicates
    are removed. Every resulting title is registered in the module-level
    `all_job_samples` dict (title -> NOC code as int). A title that is already registered
    under a different code is left untouched and reported with a printed message.

    Args:
        row: mapping/Series with 'Noc_code' and 'job_title' entries.

    Returns:
        The same `row`, with 'n_sample_jobs' set to the number of unique samples found.
    """
    NOC_code = int(row['Noc_code'])

    # REVISE WHETHER TO KEEP - separation. logic is that lieutenant-governor can be
    # described as lieutenant governer, no hyphen
    jobs = []
    for title in row['job_title'].split(';'):
        title = title.strip()
        if title:
            jobs.extend(_expand_gendered_variants(title))

    # remove duplicate entries (dict.fromkeys keeps first-seen order, unlike a set)
    jobs = list(dict.fromkeys(jobs))

    # parse counts of each job
    row['n_sample_jobs'] = len(jobs)

    # iterate through job and add to dictionary
    for j in jobs:
        if j not in all_job_samples:
            all_job_samples[j] = NOC_code
        # safe check, if job appears under another code, print both NOC codes
        elif all_job_samples[j] != NOC_code:
            print(j, 'repeated', all_job_samples[j], NOC_code)

    return row

def _lookup_group_title(table, code_column, title_column, prefix):
    """
    Return the title in `table` whose code matches the digit string `prefix`, or 'NA'.

    Codes are compared as text after removing surrounding apostrophes and a trailing '.0'
    and zero-padding to len(prefix), so "'01", "'1", 1 and 1.0 all match the prefix '01'.
    A scalar title is returned (not a filtered Series).
    """
    codes = (
        table[code_column]
        .astype(str)
        .str.strip()
        .str.strip("'")
        .str.replace(r'\.0$', '', regex=True)
        .str.zfill(len(prefix))
    )
    titles = table.loc[codes == prefix, title_column]
    return titles.iloc[0] if len(titles) else 'NA'


def parse_1(row):
    """
    Add the 1-digit level of the NOC code to `row`.

    Sets '1_digit_target' (first digit as int) and '1_digit_group' (matching
    'skilltype_title' from `df_skill_type`, or 'NA' when no code matches).
    """
    # get info from first digit of 4 digit code
    prefix = str(row['Noc_code'])[0]
    row['1_digit_target'] = int(prefix)
    row['1_digit_group'] = _lookup_group_title(
        df_skill_type, 'skilltype_code', 'skilltype_title', prefix)

    return row

def parse_2(row):
    """
    Add the 2-digit level of the NOC code to `row`.

    Sets '2_digit_target' (first two digits as int) and '2_digit_group' (matching
    'majorgroup_title' from `df_major_group`, or 'NA' when no code matches). Both are
    'NA' when the code has fewer than two characters.
    """
    # get info from first 2 digits of 4 digit code
    code = str(row['Noc_code'])

    # check if NOC code is long enough for parsing
    if len(code) > 1:
        prefix = code[:2]
        row['2_digit_target'] = int(prefix)
        row['2_digit_group'] = _lookup_group_title(
            df_major_group, 'majorgroup_code', 'majorgroup_title', prefix)

    else:
        row['2_digit_target'] = 'NA'
        row['2_digit_group'] = 'NA'

    return row

def parse_3(row):
    """
    Add the 3-digit level of the NOC code to `row`.

    Sets '3_digit_target' (first three digits as int) and '3_digit_group' (matching
    'minorgroup_title' from `df_minor_group`, or 'NA' when no code matches). Both are
    'NA' when the code has fewer than three characters.
    """
    # get info from first 3 digits of 4 digit code
    code = str(row['Noc_code'])

    # check if NOC code is long enough for parsing
    if len(code) > 2:
        prefix = code[:3]
        row['3_digit_target'] = int(prefix)
        row['3_digit_group'] = _lookup_group_title(
            df_minor_group, 'minorgroup_code', 'minorgroup_title', prefix)

    else:
        row['3_digit_target'] = 'NA'
        row['3_digit_group'] = 'NA'

    return row

In [33]:
# Derive the 1/2/3-digit group columns and collect the job-title samples.
# Do once: if the 'Noc_code' column is no longer there, skip the action explicitly.
# (A blanket `except KeyError: pass` would also hide genuine lookup errors inside the
# parsers and could leave `df` half-processed without any message.)
if 'Noc_code' in df.columns:
    for row_step in (parse_1, parse_2, parse_3, extract_job_samples):
        df = df.apply(row_step, axis = 1)
else:
    print("'Noc_code' column not found; skipping NOC parsing")

len(all_job_samples)

29741

# Graphs, skippable

In [None]:
px.histogram(x = df['n_sample_jobs'], nbins = 200)

In [35]:
import plotly.express as px
# One column per NOC hierarchy level, taken as prefixes of the zero-padded 4-digit code
# so that leading zeros survive (e.g. 0011 sits under 0 > 00 > 001 instead of 0 > 0 > 1).
noc_codes = df['Noc_code'].astype(int).astype(str).str.zfill(4)
sunburst_df = pd.DataFrame({
    '1': noc_codes.str[:1],
    '2': noc_codes.str[:2],
    '3': noc_codes.str[:3],
    '4': noc_codes,
    # sector sizes: keep them numeric rather than converting the counts to strings
    'counts': df['n_sample_jobs'].astype(int),
})
fig = px.sunburst(sunburst_df, path=['1', '2', '3', '4'], values='counts')
fig.update_layout(
    title="Visualization of the NOC Hierarchy by Level and Number of Samples in Each Group")
fig.show()

In [None]:
# One bar per row of df, placed at that row's NOC code so the x axis actually shows what
# its title says (without `x`, plotly plots the bars against their row position).
# Bars are coloured by the negated sample count.
fig = go.Figure()
fig.add_trace(go.Bar(
    x = df['Noc_code'].astype(str),
    y = df['n_sample_jobs'],
    marker = {
        'color':  df['n_sample_jobs']*-1
    }
))

fig.update_layout(
    title='Counts per NOC Code',
    xaxis_title="NOC Code",
    yaxis_title="Count",
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    font=dict(
        family= 'Times New Roman',
        size=14,
        color="black"
    ),
    xaxis_showgrid=True,
    yaxis_showgrid=True
)
# codes are labels, not quantities: a category axis keeps leading zeros and equal spacing
fig.update_xaxes(type='category', showline=True, linewidth=.5, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
fig.show()

In [None]:
all_descriptions = {}
desc_counts = []
def unpack_descriptions(row):
    """
    Split a row's ';'-separated 'main_duties' field into individual duty descriptions.

    Fragments containing the word 'duties' (the initial generic blurb) and blank fragments
    are skipped. Every kept description is stripped and stored in the module-level
    `all_descriptions` dict (description -> NOC code as int, the same label type that
    `all_job_samples` uses), and the number of kept descriptions for this row is appended
    to the module-level `desc_counts` list.

    Args:
        row: mapping/Series with 'Noc_code' and 'main_duties' entries.

    Returns:
        The unmodified `row`.
    """
    noc_code = int(row['Noc_code'])
    n_kept = 0

    # split duty field into separate duties and remove initial generic blurb
    for description in row['main_duties'].strip('-').split(';'):
        description = description.strip()
        if description and 'duties' not in description:
            all_descriptions[description] = noc_code
            n_kept += 1

    desc_counts.append(n_kept)
    return row

df.apply(unpack_descriptions, axis = 1)

In [None]:
inputs = list(all_job_samples.keys()) + list(all_descriptions.keys())
outputs = list(all_job_samples.values()) + list(all_descriptions.values())

In [None]:
assert len(inputs) == len(outputs)

# For making the doc2vec corpus

In [None]:
from collections import Counter
raw_words = ' '.join(inputs).split()
raw_word_count = Counter(raw_words)

# unique words only
input_words = [*{*raw_words}]

In [None]:
len(all_job_samples)

In [None]:
len(all_descriptions)

In [None]:
train_df = pd.DataFrame(dict(all_job_samples).items(), columns = ['input', 'code'])

In [None]:
train_df

# Distribution of all output classes

In [None]:
sum(train_df['code'].value_counts())/1000
# px.bar(x = train_df['code'].value_counts().index, y = train_df['code'].value_counts())

In [None]:
px.histogram(train_df['code'].value_counts())

# Try TF-IDF

In [None]:
def first_n_digits(string, n=4):
    """
    Return the leading `n` characters of a code after left-padding it with zeros to width 4.

    For n == 4 (the default) the input is handed back untouched: it is neither padded
    nor converted to a string.
    """
    if n != 4:
        # pad left with zeros until 4 characters are reached, then cut to the first n
        return '{0:0>4}'.format(string)[:n]

    # default number of digits desired: nothing to do
    return string

## Train test split

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load ATP data to add some noise to the training set

In [None]:
# Read the ATP job-title data from `data_dir`, like every other input file. The path used
# to be hard-coded to './Data', which bypasses the local/Colab `data_dir` setting and
# differs in case from './data' (a problem on case-sensitive file systems).
ATP_data = pd.read_excel(os.path.join(data_dir, 'V5_Run Input(1).xlsx'))

# The 'NOC code ' values are text wrapped in apostrophes: strip them and convert to int.
# first_n_digits(..., 4) hands the value back unchanged, so 'code' stays an int.
ATP_data['code'] = ATP_data['NOC code '].apply(lambda x: int(x.strip('\''))).apply(first_n_digits, args=(4,))
ATP_data['input'] = ATP_data['Current Job Title']

# drop the renamed source columns (returns a new frame instead of mutating in place)
ATP_data = ATP_data.drop(columns = ['NOC code ', 'Current Job Title'])

In [None]:
# Shuffle all ATP rows; the fixed seed makes the order reproducible
shuffled_ATP_df = ATP_data.sample(frac=1, random_state=42)

# Number of shuffled rows that go into the train set
ATP_train_size = 8000

# First ATP_train_size rows -> train set, all remaining rows -> test set
ATP_data_train_set, ATP_data_test_set = (
    shuffled_ATP_df[:ATP_train_size],
    shuffled_ATP_df[ATP_train_size:],
)


# Combine both train sets

In [None]:
# Keep only the label and text columns, in the same order, for every split.
# .copy() turns the ATP subsets into independent frames, so later column assignments
# do not write into a slice of the shuffled frame (SettingWithCopyWarning).
train_df = train_df[['code','input']]
ATP_data_train_set = ATP_data_train_set[['code','input']].copy()
ATP_data_test_set = ATP_data_test_set[['code','input']].copy()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
train_df = pd.concat([train_df, ATP_data_train_set])

# Preprocess ALL the things

In [None]:
STRIP_ABBREVIATIONS = True

In [None]:
train_df['input'] = train_df['input'].apply(preprocess_text, args = (STRIP_ABBREVIATIONS,))
ATP_data_test_set['input'] = ATP_data_test_set['input'].apply(preprocess_text, args = (STRIP_ABBREVIATIONS,))

In [None]:
corpus = list(train_df['input'])

In [None]:
corpus[:20]

# distribution of first digit and all 4 digits

In [None]:
# y4: full NOC code of every training sample, y1: its first digit (code // 1000).
# NOTE(review): y1 / y4 were not defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. They are reconstructed here from train_df['code'] —
# confirm that this matches the originally intended targets.
y4 = train_df['code'].astype(int)
y1 = y4 // 1000

fig = px.histogram(x = y1.value_counts().index, y = y1.value_counts(), nbins=10)
fig.show()
fig = px.histogram(x = y4.value_counts().index, y = y4.value_counts(), nbins=80)
fig.show()

# vectorize train data

In [None]:
# joblib: efficient load and store of objects with large numpy arrays inside
from joblib import dump, load

# Drop highly uncommon words (min_df=5: seen in fewer than 5 documents) from the corpus
# to reduce dimensionality
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=5,
)
vectorized_X_train = vectorizer.fit_transform(corpus)
vectorized_X_train.shape

# persist the fitted vectorizer
dump(vectorizer, 'vectorizer.joblib')

# Make sure vectorizer.joblib saved

In [None]:
!ls *joblib

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# fall back to the old name only on versions that do not have the new one yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
list(feature_names[:10])

In [None]:
vectorized_X_train.shape

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
train_df.head(5)

## Train the classifiers

In [None]:
import time

In [None]:
# linear-kernel SVM with balanced class weights
SVM = SVC(kernel='linear', class_weight='balanced')

# fit on the TF-IDF training matrix and report the wall-clock duration
start = time.time()
SVM.fit(vectorized_X_train, train_df['code'])
print('SVM training duration: {} seconds'.format(
    time.time() - start
))

In [None]:
# Random forest. random_state makes the fitted forest (and every score derived from it)
# reproducible across runs. warm_start is left at its default: the estimator is created
# fresh in this cell, so warm_start=True had no effect on this fit and would only make a
# later RF.fit(...) call silently keep the old trees.
RF = RandomForestClassifier(n_estimators=512, max_depth=256, n_jobs=-1, random_state=42)

start = time.time()
RF.fit(vectorized_X_train, train_df['code'])
print('RF training duration: {} seconds'.format(time.time()-start))

In [None]:
# 5-nearest-neighbours classifier, using all available cores
KNN = KNeighborsClassifier(n_jobs=-1, n_neighbors = 5)

# fit on the TF-IDF training matrix and report the wall-clock duration
start = time.time()
KNN.fit(vectorized_X_train, train_df['code'])
print('KNN training duration: {} seconds'.format(
    time.time() - start
))

# Transform test data

In [None]:
# Project the held-out ATP titles into the TF-IDF space fitted on the training corpus.
# `vectorized_X_test` is required by every prediction cell below but was never created.
# Only transform (never fit) here, so the vocabulary stays the one learned from training.
vectorized_X_test = vectorizer.transform(ATP_data_test_set['input'])
vectorized_X_test.shape

# Predict and time for each classifier

In [None]:
# time SVM inference over the whole vectorized test set
start = time.time()
svm_pred = SVM.predict(vectorized_X_test)
print('SVM prediction duration on {} samples: {} seconds'.format(
    vectorized_X_test.shape[0],
    time.time() - start,
))

In [None]:
# time random-forest inference over the whole vectorized test set
start = time.time()
rf_pred = RF.predict(vectorized_X_test)
print('RF prediction duration on {} samples: {} seconds'.format(
    vectorized_X_test.shape[0],
    time.time() - start,
))

In [None]:
# time KNN inference over the whole vectorized test set
start = time.time()
knn_pred = KNN.predict(vectorized_X_test)
print('KNN prediction duration on {} samples: {} seconds'.format(
    vectorized_X_test.shape[0],
    time.time() - start,
))

# try closest k-nn for votes, REQUIRES SAVING THE y_train vector

In [None]:
# indices (no distances) of the 5 nearest training samples for every test sample
start = time.time()
closestknn = KNN.kneighbors(vectorized_X_test, 5, return_distance=False)
print('KNN 5 closest neighbors duration on {} samples: {} seconds'.format(
    vectorized_X_test.shape[0],
    time.time() - start,
))

# Without data leakage

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# scikit-learn metrics are defined as metric(y_true, y_pred): ground truth goes first
y_test = ATP_data_test_set['code']
for model_name, model_pred in (('SVM', svm_pred), ('RF', rf_pred), ('KNN', knn_pred)):
    print('{} acc:{}, f1:{}'.format(
        model_name,
        accuracy_score(y_test, model_pred),
        f1_score(y_test, model_pred, average = 'macro'),
    ))

# one column of predictions per model plus the ground truth; the index is taken from the
# 'target' Series, the prediction arrays are matched to it by position
pred_df = pd.DataFrame({
    'p_svm':svm_pred,
#     'p_lr':lr_pred,
    'p_rf':rf_pred,
    'p_knn':knn_pred,
#     'p_knn_1':list(y_train.iloc[closestknn[:, 0]]),
#     'p_knn_2':list(y_train.iloc[closestknn[:, 1]]),
    'target':ATP_data_test_set['code']
})

display(pred_df.iloc[:20])

In [None]:
def ensemble_predict(row):
    """
    Majority vote over the per-model predictions in `row`.

    Args:
        row: mapping/Series of predictions keyed by column name (e.g. 'p_svm', 'p_rf',
            'p_knn'). A 'target' (ground truth) or 'p_all' (earlier ensemble output)
            entry, if present, is ignored, so it can never cast a vote. Entries are
            excluded by name rather than by position, so no model vote is lost when
            the caller has already dropped the ground-truth column.

    Returns:
        The class predicted by at least two models. When no class reaches two votes, the
        SVM prediction (`row['p_svm']`) if present, otherwise the first top-voted class.
    """
    from collections import Counter  # local import keeps the function self-contained

    # find majority vote for all methods, leaving out the non-model columns
    ballots = [pred for name, pred in row.items() if name not in ('target', 'p_all')]
    winning_class, highest_num_votes = Counter(ballots).most_common(1)[0]

    # take svm as tie-breaker because CURRENTLY most accurate
    if highest_num_votes < 2 and 'p_svm' in row:
        return row['p_svm']
    return winning_class


In [None]:
# Vote over the three model columns. ensemble_predict discards the ground-truth entry of
# each row itself, so 'target' is passed along as the LAST column; dropping it beforehand
# caused the KNN vote to be discarded instead. Selecting the columns explicitly also keeps
# a previously computed 'p_all' from voting when this cell is re-run.
vote_columns = ['p_svm', 'p_rf', 'p_knn', 'target']
p_all = pred_df[vote_columns].apply(ensemble_predict, axis = 1)
pred_df['p_all'] = p_all

In [None]:
# scikit-learn metrics are defined as metric(y_true, y_pred): ground truth goes first
print('Ensemble acc:{}, f1:{}'.format(accuracy_score(ATP_data_test_set['code'], pred_df['p_all']),
                                      f1_score(ATP_data_test_set['code'], pred_df['p_all'], average = 'macro')))
display(pred_df.iloc[:20][['p_all','target']])

In [None]:
pred_df