## 1. Importing Libraries/Tools

In [50]:
import gensim
import multiprocessing
import nltk
import numpy as np
import pandas as pd
import re
import sklearn
import spacy
import pickle

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

## 2. Load Relevant Data

In [51]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

skills_api = pd.read_excel('all_skills_emsi.xlsx')
skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
skills_api['lemmatized_name'] = skills_api['name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
skills_api = set(skills_api['name']).union(set(skills_api['lemmatized_name']))

df1 = pd.read_csv('Resume.csv')
# standardise column names
df1['Text'] = df1['Resume_str']
df1 = df1[['ID','Category','Text']]

df2 = pd.read_csv('Job Description.csv')
# get only english job descriptions
df2 = df2[df2['job_post_lang'].str.lower() == 'en-gb'] 
# standardise column names
df2['ID'] = df2['uniq_id']
df2['Category'] = df2['category']
df2['Text'] = df2['job_requirements']
df2 = df2[['ID','Category','Text']]
# drop empty rows
df2 = df2.dropna().reset_index(drop=True) 

In [52]:
df1["Category"].value_counts()

INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: Category, dtype: int64

In [53]:
df2["Category"].value_counts()

Registered Nurses                                    2513
Personal Care Aides                                   508
Social and Human Service Assistants                   415
Home Health Aides                                     393
Medical and Health Services Managers                  253
                                                     ... 
Health Technologists and Technicians, All Other         1
Electronics Engineering Technicians                     1
Production Workers, All Other                           1
Insurance Adjusters, Examiners, and Investigators       1
Computer Network Architects                             1
Name: Category, Length: 291, dtype: int64

## 3. Skills Extraction

Common Skills Extraction Functions

In [54]:
def preprocess(txt):
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    # then these
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub('<.*?>',' ',txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words
    txt = [token.text for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def generate_list_of_skills(text):
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for N-grams that SpaCy missed in the noun_chuncks
    for n in range(2, 10):
        for token in n_grams(tokens, n):
            token = token.lower().strip()
            skillset.append(token)

    return skillset

### 3.1 User Skills Extraction

Functions

In [55]:
def extract_skills(resume_text, skills_api, clean=True):
    if clean == True:
        resume_text = preprocess(resume_text)
    
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    return skillset.intersection(skills_api)

def date_search(resume):
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list):
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1]
        cur = date_list[i]
        
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            

            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:

            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume,skills_api):
    res = {}
    date_list = date_search(resume)
    experience_sec = experience_tagging(date_list)

    for key in experience_sec:
        for start, end in experience_sec[key]:
            skills_list = extract_skills(resume[start:end], skills_api, True)
            for ele in skills_list:
                if ele not in res:
                    res[ele] = key
                else:
                    res[ele] = max(key, res[ele])

    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

Test Cases

In [56]:
# Test Case
test_user = df1.iloc[300,2]
test_user_skills = skills_experience_level_identification(test_user,skills_api)
print(df1.iloc[300,1])
test_user_skills

INFORMATION-TECHNOLOGY


{'system center configuration manager': 18,
 'disaster recovery': 18,
 'security policies': 18,
 'operations': 18,
 'reduction': 18,
 'security': 18,
 'business continuity': 18,
 'plan': 18,
 'active directory': 18,
 'r': 18,
 'reliability': 18,
 'application deployment': 18,
 'virus': 18,
 'netscaler': 18,
 'planning': 18,
 'coordinate': 18,
 'business technology': 18,
 'capacity planning': 18,
 'management': 18,
 'web servers': 3,
 'web development': 3,
 'group policy': 3,
 'network support': 3,
 'operating systems': 3,
 'ghost': 1,
 'microsoft certified technology specialist': 1,
 'technical support': 1,
 'laser printers': 1,
 'laser': 1,
 'troubleshooting': 1,
 'desktop environment': 1,
 'windows desktop': 1,
 'desktop support': 1,
 'citrix certified administrator': 1,
 'information systems': 1,
 'management information systems': 1,
 'project management': 1,
 'spyware': 1,
 'communication': 1,
 'adware': 1,
 'streamlining': 1,
 'color': 1,
 'adobe acrobat': 1,
 'information technol

In [14]:
# extract skills from all rows > store in dictionary > pickle dump
'''
df1["Skills"] = df1["Text"].apply(lambda x: skills_experience_level_identification(x,skills_api))
resume_skills0 = {}
for index, row in df1.iterrows():
  resume_skills0[row.ID] = row.Skills
with open('resume_skills.pkl', 'wb') as handle:
  pickle.dump(resume_skills0, handle)
'''

'\ndf1["Skills"] = df1["Text"].apply(lambda x: skills_experience_level_identification(x,skills_api))\nskills0 = {}\nfor index, row in df1.iterrows():\n  skills0[row.ID] = row.Skills\nwith open(\'user_skills.pkl\', \'wb\') as handle:\n  pickle.dump(skills0, handle)\n'

### 3.2 Required Skills Extraction

Common Functions

In [57]:
def find_significant_skills(agg_table):

    # gauge skill levels according to percentiles
    skills_required = {}
    for col in agg_table.columns:
        skills = agg_table[col][agg_table[col] != 0]

        # no skills extracted > skills required = empty dictionary
        if len(skills) == 0: 
            skills_required[col] = dict()
        
        # some skills extracted > skills required = some dictionary
        else:
            # skills extracted have same frequency > keep all skills
            if len(skills.unique()) == 1:
                series = skills
            # skills extracted have different frequency > keep skills above 5th percentile
            else:
                series = agg_table[col][agg_table[col] > np.percentile(skills, 95)]

            # skills above 5th percentile have same frequency > scale to 1 (max)
            if len(series.unique()) == 1:
                scaled_series = series.apply(lambda x: 1)
            # skills above 5th percentile have different frequency > scale to between 0 and 1
            else:
                scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))

            # bin skills according to percentiles
            binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
            
            # convert series to dictionary form
            skills_required[col] = binned_series.to_dict()

    return skills_required

##### 3.2.1 From Resume

Functions

In [32]:
def get_resume_significance_table(df,skills_dic):
    
    # creates a binary matrix indicating the presence of each skill
    df['Skills'] = df['ID'].apply(lambda x: list(skills_dic[x].keys()))
    mlb = MultiLabelBinarizer()
    table = pd.DataFrame(mlb.fit_transform(df['Skills']),
                         columns=mlb.classes_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # sum by y column
    agg_table = table.groupby(['y']).sum()
    agg_table = agg_table.T

    return agg_table

Test Case

In [33]:
resume_df = df1.copy()

with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills_dic = pickle.load(pickle_file)

resume_table = get_resume_significance_table(resume_df,resume_skills_dic)
resume_table

y,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
abaqus,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
ablation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ableton live,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
abnormal psychology,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
absorption,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoning,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoom,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
zumba,0,0,0,0,1,0,0,0,0,0,...,0,0,0,6,0,0,0,0,0,0


In [37]:
resume_skills_required = find_significant_skills(resume_table)

'''
with open('resume_skills_required.pkl', 'wb') as handle:
  pickle.dump(resume_skills_required, handle)
'''

resume_skills_required

{'ACCOUNTANT': {'audit': 4,
  'auditing': 3,
  'balance': 4,
  'balance sheet': 3,
  'bank reconciliations': 3,
  'banking': 3,
  'billing': 4,
  'bookkeeping': 3,
  'budget': 4,
  'budgeting': 3,
  'business administration': 4,
  'cash flow': 3,
  'close': 3,
  'closing': 4,
  'collections': 3,
  'communication': 4,
  'customer service': 3,
  'depreciation': 3,
  'detail oriented': 3,
  'filing': 3,
  'finance': 4,
  'financial analysis': 3,
  'financial statement': 3,
  'financial statements': 4,
  'forecasting': 3,
  'general ledger': 4,
  'invoice': 3,
  'invoicing': 3,
  'journal': 4,
  'leadership': 3,
  'ledger': 5,
  'level': 3,
  'management': 5,
  'microsoft office': 4,
  'operations': 4,
  'plan': 3,
  'planning': 3,
  'problem solving': 3,
  'purchase': 3,
  'quickbooks': 4,
  'r': 3,
  'reconciliation': 4,
  'research': 3,
  'sales': 4,
  'sales tax': 3,
  'spreadsheets': 3,
  'tax returns': 3,
  'time management': 3,
  'track': 3},
 'ADVOCATE': {'active listening': 3,
  '

##### 3.2.2 From Job Descriptions

Functions

In [58]:
def extract_skills_frequency(job_text, skills_api, clean=True):
  if clean == True:
      job_text = preprocess(job_text)

  # get all skills
  skillset = [i.lower() for i in generate_list_of_skills(job_text)]

  # get frequency of each skill
  skillset_count = {}
  for skill in skillset:
    if skill in skills_api:
      if skill in skillset_count:
        skillset_count[skill] = skillset_count[skill] + 1
      else:
        skillset_count[skill] = 1
  
  return dict(sorted(skillset_count.items(), key = lambda x:x[1], reverse = True))

def get_job_significance_table(df,skills_dic):
   
    # creates a matrix indicating the frequency of each skill
    df['Skills'] = df['ID'].apply(lambda x: skills_dic[x])
    v = DictVectorizer(sparse=False)
    table = pd.DataFrame(v.fit_transform(df['Skills']),
                         columns=v.feature_names_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # get mean frequency by y column
    agg_table = table.groupby(['y']).max()
    agg_table = agg_table.T

    return agg_table


Test Cases

In [59]:
test_job = df2.iloc[13,2]
test_job_skills = extract_skills_frequency(test_job,skills_api,True)
print(df2.iloc[13,1])
test_job_skills

Software Developers, Applications


{'management': 3,
 'c': 2,
 'security': 2,
 'software module': 2,
 'programming': 1,
 'linux': 1,
 'collaboration': 1,
 'software development': 1,
 'architectural patterns': 1,
 'configuration management': 1,
 'requirements management': 1,
 'cyber security': 1,
 'software engineering': 1,
 'technical design': 1,
 'software development methodologies': 1,
 'software configuration management': 1}

In [60]:
# extract skills from all rows > store in dictionary > pickle dump
'''
df2["Skills"] = df2["Text"].apply(lambda x: extract_skills_frequency(x,skills_api,True))
job_skills0 = {}
for index, row in df2.iterrows():
  job_skills0[row.ID] = row.Skills
with open('job_skills.pkl', 'wb') as handle:
  pickle.dump(job_skills0, handle)
'''

In [61]:
job_df = df2.copy()

with open('job_skills.pkl', 'rb') as pickle_file:
    job_skills_dic = pickle.load(pickle_file)

job_table = get_job_significance_table(job_df,job_skills_dic)
job_table

y,Accountants,Actuaries,Acute Care Nurses,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Advanced Practice Psychiatric Nurses,Advertising Sales Agents,Advertising and Promotions Managers,Aerospace Engineers,Anesthesiologists,...,Telemarketers,Training and Development Managers,Training and Development Specialists,Treasurers and Controllers,Tutors,Upholsterers,Veterinary Technologists and Technicians,"Vocational Education Teachers, Postsecondary",Waiters and Waitresses,Web Developers
abdomen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
absorption,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
academic standards,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
acoustic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
acoustics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yoga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zone,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
job_skills_required = find_significant_skills(job_table)

'''
with open('job_skills_required.pkl', 'wb') as handle:
  pickle.dump(job_skills_required, handle)
'''

job_skills_required

{'Accountants': {'balance': 3,
  'balance sheet': 3,
  'finance': 5,
  'management': 3},
 'Actuaries': {'actuarial exams': 5,
  'communication': 5,
  'covenant': 5,
  'investment': 5},
 'Acute Care Nurses': {'cardiology': 5, 'management': 5, 'physiology': 5},
 'Administrative Law Judges, Adjudicators, and Hearing Officers': {'advise': 5,
  'comment': 5,
  'management': 5,
  'triage': 5},
 'Administrative Services Managers': {'procurement': 5},
 'Advanced Practice Psychiatric Nurses': {'health crisis': 3,
  'mental health': 5,
  'nursing': 3},
 'Advertising Sales Agents': {'display advertising': 3, 'sales': 5},
 'Advertising and Promotions Managers': {'marketing': 5},
 'Aerospace Engineers': {'c': 5, 'management': 5},
 'Anesthesiologists': {'management': 5},
 'Architects, Except Landscape and Naval': {'hospitality': 3, 'planning': 5},
 'Architectural Drafters': {'construction': 5},
 'Architectural and Engineering Managers': {'management': 3, 'planning': 5},
 'Assemblers and Fabricators,

## 4. Skills Gap Identification

Skills Gap Functions

In [47]:
# Map years of experience to competency level
def user_level_deduction(years):
    if years <= 2:
        return 3
    elif years <= 5:
        return 4
    else:
        return 5

# return skills gap grouped by skills
def skills_gap_identification(skills, skills_required):
    diff = {}
    for key in skills_required:
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        else:
            user_level = user_level_deduction(skills[key])
            if user_level < skills_required[key]:
                diff[key] = [x for x in range(user_level, skills_required[key] + 1)]
    return diff

### 4.1 Gaps between User and Other Candidates

In [48]:
with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills = pickle.load(pickle_file)

with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required = pickle.load(pickle_file)

skills_gap_cand = skills_gap_identification(resume_skills[31605080], resume_skills_required['AVIATION'])
skills_gap_cand

{'aircraft maintenance': [3],
 'ammunition': [3],
 'aviation': [4, 5],
 'b': [3],
 'budget': [3],
 'business administration': [3],
 'c': [3],
 'communication': [3, 4],
 'communications': [3],
 'control systems': [3],
 'coordinate': [3],
 'coordinating': [3],
 'critical thinking': [3],
 'data entry': [3],
 'electronic': [3],
 'electronics': [3],
 'filing': [3],
 'hydraulic': [3],
 'inventory control': [3],
 'leadership': [3, 4],
 'level': [3, 4],
 'license': [3],
 'm': [3],
 'machine': [3],
 'management': [4, 5],
 'marketing': [3],
 'mechanic': [3],
 'merchandise': [3],
 'microsoft office': [3],
 'operation': [3, 4],
 'operations': [3, 4],
 'paperwork': [3],
 'plan': [3],
 'planning': [3, 4],
 'presentations': [3],
 'process improvement': [3],
 'procurement': [3],
 'project management': [3],
 'purchase': [3],
 'purchasing': [3],
 'quality assurance': [3],
 'quality control': [3],
 'r': [3],
 'reduction': [3],
 'repair': [3, 4],
 'resource': [3],
 'safety training': [3],
 'sales': [3],
 

### 4.2 Gaps between User and Job Requirements

In [49]:
with open('job_skills.pkl', 'rb') as pickle_file:
    job_skills = pickle.load(pickle_file)

with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required = pickle.load(pickle_file)

skills_gap_jobs = skills_gap_identification(resume_skills[31605080], job_skills_required['Web Developers'])
skills_gap_jobs

{'angular': [3],
 'c': [3, 4],
 'git': [3],
 'javascript': [3, 4, 5],
 'php': [3],
 'sql': [3]}

## 5. Pathway Generation

In [39]:
courses = pd.read_excel('Courses.xlsx')
courses = courses.fillna('')
courses['Description'] = courses['jobFamily'] + ' ' \
                         + courses['Marketing Name'] + ' ' \
                         + courses['courseName'] + ' ' \
                         + courses ['moduleName'] + ' ' \
                         + courses['courseDesc'] + ' ' \
                         + courses['Outcome Description'] +  ' ' \
                         + courses['competencyUnitDesc']
courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
courses['Description'] = courses['Description'].astype(str)
courses['Description'] = courses['Description'].apply(preprocess)

In [50]:
courses.jobFamily.value_counts()

Software Design & Development                 514
Infocomm Sales & Marketing                    195
Infrastructure Support                        105
Business Analytics                            105
Digital Advertising / Digital Distribution    100
Infrastructure Architecture                    37
Enterprise Mobility                            17
Project Management                             16
Strategy and Architecture                      15
Service Innovation Design                      14
Infocomm Security                              13
Sales and Marketing                            12
Cloud Computing                                 9
Data Centre Management                          9
Product Management                              7
IT Management                                   5
Enterprise Network Design Management            5
IT Outsourcing Management                       1
Generic Skills                                  1
Name: jobFamily, dtype: int64

### 5.1 Doc2Vec

In [40]:
def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

corpus = tagcol_paragraph_embeddings_features(courses)
model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

Test Case

In [47]:
def course_suggestion(doc2vec_model, skills_gap_cand, skills_gap_jobs):
    ans = {'job': {}, 'continuous learning': {}}

    for key in skills_gap_jobs:
        vector = model.infer_vector(skills_gap_jobs[key])
        res = model.dv.most_similar([vector], topn=10)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses.loc[i, 'Marketing Name'])
                course_list.append((courses.loc[i, 'productId'], courses.loc[i, 'Marketing Name'], courses.loc[i, 'competencyLevel']))
        ans['job'][key] = course_list

    for key in skills_gap_cand:
        vector = model.infer_vector(skills_gap_cand[key])
        res = model.dv.most_similar([vector], topn=10)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses.loc[i, 'Marketing Name'])
                course_list.append((courses.loc[i, 'productId'], courses.loc[i, 'Marketing Name'], courses.loc[i, 'competencyLevel']))
        ans['continuous learning'][key] = course_list

    return ans

In [None]:
course_suggestion(model, skills_gap_cand, skills_gap_jobs)

### 5.2 SpaCy

In [55]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')

for level in skills_gap_jobs:
  skills_gap_text = " ".join(skills_gap_jobs[level])

  # get courses of same competency level
  df = courses.copy()
  df = df[df["competencyLevel"].str.contains(str(level))]

  # get similarity score
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  top_courses = df.nlargest(5,'Similarity',keep='all')
  print("Level " + str(level) + ":")
  print("Skills to bridge: " + skills_gap_text)
  for index, row in top_courses.iterrows():
    print(row["productId"],
          row["Marketing Name"],
          round(row["Similarity"],2))
  print("\n")

  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))


Level 3:
Skills to bridge: angular c git javascript php sql
7316.0 Front End Web Development 0.85
1401.0 NICF - Advanced Certificate in Web Development 0.84
14302.0 New Hire and Redeployment – Full Stack Web Developer 0.84
11217.0 ACWD-CAT B 0921A 0.83
5001.0 Data Intelligence 0.83


Level 4:
Skills to bridge: c javascript
6905.0 Full Stack Web Developer Bootcamp - International 0.65
7701.0 Full Stack Web Development Bootcamp 0.65
1746.0 NICF - Java EE Frameworks 0.65
5906.0 Full Stack Web Developer Bootcamp (Sri Lanka) 0.65
9601.0 Applied Master in Analytics & Artificial Intelligence 0.65


Level 5:
Skills to bridge: javascript
203.0 Private Cloud 0.52
12502.0 Cyber Security Management Capstone Project 0.5
3314.0 Express ERP SAP SD 0.48
1501.0 Digital Recruiter 0.48
6606.0 Express ERP SAP Sales and Distribution 0.48


