## 1. Importing Libraries/Tools

In [17]:
import gensim
import multiprocessing
import nltk
import numpy as np
import pandas as pd
import re
import sklearn
import spacy
import pickle

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

## 2. Load Relevant Data

In [59]:
skills_api = pd.read_excel('/content/drive/MyDrive/BT4103/all_skills_emsi.xlsx')
skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
skills_api = set(skills_api['name'])

df1 = pd.read_csv('Resume.csv')
df1 = df1[['ID','Category','Resume_str']]

df2 = pd.read_csv('Job Description.csv')
df2 = df2[df2["job_post_lang"].str.lower() == "en-gb"]
df2 = df2[['uniq_id','category','job_requirements']]
df2 = df2.dropna().reset_index(drop=True)

In [60]:
df1["Category"].value_counts()

INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: Category, dtype: int64

In [61]:
df2["category"].value_counts()

Registered Nurses                                    2513
Personal Care Aides                                   508
Social and Human Service Assistants                   415
Home Health Aides                                     393
Medical and Health Services Managers                  253
                                                     ... 
Health Technologists and Technicians, All Other         1
Electronics Engineering Technicians                     1
Production Workers, All Other                           1
Insurance Adjusters, Examiners, and Investigators       1
Computer Network Architects                             1
Name: category, Length: 291, dtype: int64

## 3. Skills Extraction

Common Skills Extraction Functions

In [18]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

def preprocess(txt):
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub("<.*?>"," ",txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words and lemmatization
    txt = [token.lemma_ for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]


def generate_list_of_skills(text):
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        if nlp(token)[0].tag_ != 'VBN':
            skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for bigrams that SpaCy missed in the noun_chuncks
    for token in n_grams(tokens, 2):
        token = token.lower().strip()
        skillset.append(token)

    # check for trigrams that SpaCy missed in the noun_chuncks
    for token in n_grams(tokens, 3):
        token = token.lower().strip()
        skillset.append(token)

    return skillset

### 3.1 Skills Extraction from Resume

Skills Extraction from Resume Functions

In [19]:
def extract_skills(resume_text, skills_api, clean=True):
    if clean == True:
        resume_text = preprocess(resume_text)
    
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    return skillset.intersection(skills_api)

def date_search(resume):
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list):
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1]
        cur = date_list[i]
        
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            

            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:

            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume):
    res = {}
    date_list = date_search(resume)
    experience_sec = experience_tagging(date_list)

    for key in experience_sec:
        for start, end in experience_sec[key]:
            skills_list = extract_skills(resume[start:end], skills_api, True)
            for ele in skills_list:
                if ele not in res:
                    res[ele] = key
                else:
                    res[ele] = max(key, res[ele])

    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

Test Case

In [25]:
# Test Case
test_resume = df1.iloc[300,2]
test_resume_skills = skills_experience_level_identification(test_resume)
print(df1.iloc[300,1])
test_resume_skills

INFORMATION-TECHNOLOGY


{'application deployment': 18,
 'disaster recovery': 18,
 'firewall': 18,
 'business continuity': 18,
 'r': 18,
 'netscaler': 18,
 'application development': 18,
 'planning': 18,
 'research': 18,
 'reduction': 18,
 'management': 18,
 'active directory': 18,
 'reliability': 18,
 'group policy': 3,
 'web development': 3,
 'system administration': 3,
 'network support': 3,
 'troubleshooting': 1,
 'desktop support': 1,
 'technical support': 1,
 'installation': 1,
 'project management': 1,
 'ghost': 1,
 'information technology': 1,
 'adobe acrobat': 1,
 'adware': 1,
 'spyware': 1,
 'b': 1}

In [None]:
# extract skills from all rows > store in dictionary > pickle dump
'''
df1["skills"] = df1["Resume_str"].apply(lambda x: skills_experience_level_identification(x))
resume_skills0 = {}
for index, row in df1.iterrows():
  resume_skills0[row.ID] = row.skills
with open('resume_skills.pkl', 'wb') as handle:
  pickle.dump(resume_skills0, handle)
'''

### 3.2 Skills Extraction from Job Description

Skills Extraction from Job Description Function

In [21]:
def extract_job_skills(job_text, skills_api, clean=True):
  if clean == True:
      job_text = preprocess(job_text)

  # get all skills
  skillset = [i.lower() for i in generate_list_of_skills(job_text)]

  # get frequency of each skill
  skillset_count = {}
  for skill in skillset:
    if skill in skills_api:
      if skill in skillset_count:
        skillset_count[skill] = skillset_count[skill] + 1
      else:
        skillset_count[skill] = 1
  
  return dict(sorted(skillset_count.items(), key = lambda x:x[1], reverse = True))

Test Case

In [24]:
test_job = df2.iloc[13,2]
test_job_skills = extract_job_skills(test_job,skills_api,True)
print(df2.iloc[13,1])
test_job_skills

Software Developers, Applications


{'management': 3,
 'c': 2,
 'programming': 1,
 'linux': 1,
 'collaboration': 1,
 'software development': 1,
 'configuration management': 1,
 'cyber security': 1,
 'product management': 1,
 'software engineering': 1,
 'technical design': 1,
 'software configuration management': 1,
 'software product management': 1,
 'software engineering process': 1}

## 4. Skills Gap Identification

Skills Gap Functions

In [26]:
# skills gap are grouped by skills
def skills_gap_identification(skills, skills_required):
    diff = {}
    for key in skills_required:
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        elif skills[key] < skills_required[key]:
            diff[key] = [x for x in range(skills[key], skills_required[key] + 1)]
    return diff

# group skills gap by level
def skills_gap_by_level(skills_gap):
    new_skills_gap = {}
    for skill in skills_gap:
        for level in skills_gap[skill]:
            if level in new_skills_gap:
                new_skills_gap[level].append(skill)
            else:
                new_skills_gap[level] = [skill]
    return new_skills_gap

### 4.1 Gaps among Candidates

In [27]:
# get frequency table

with open('resume_skills.pkl', 'rb') as pickle_file:
    content = pickle.load(pickle_file)

data = df1.copy()
data['skills'] = data['ID'].apply(lambda x: list(content[x].keys()))

mlb = MultiLabelBinarizer()

skills_freq = pd.DataFrame(mlb.fit_transform(data['skills']),
                           columns=mlb.classes_,
                           index=data['skills'].index)

y = data['Category']
skills_freq['y'] = y
skills_freq 

Unnamed: 0,abaqus,ablation,ableton live,abnormal psychology,absorption,ac motors,academic advising,academic english,academic integrity,academic writing,...,zbrush,zemax,zendesk,zeta potential,zoning,zoology,zoom,zumba,zumba fitness,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HR
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HR
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HR
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HR
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AVIATION
2480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AVIATION
2481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AVIATION
2482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AVIATION


In [28]:
# sum frequency by categories

agg_data = skills_freq.groupby(['y']).sum()
agg_data = agg_data.T
agg_data

y,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
abaqus,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
ablation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ableton live,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
abnormal psychology,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
absorption,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoning,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoom,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
zumba,0,0,0,0,1,0,0,0,0,0,...,0,0,0,6,0,0,0,0,0,0


In [29]:
# gauge skill levels according to percentiles on frequency

resume_skills_required = {}
for col in agg_data.columns:
    print(col)
    series = agg_data[col][agg_data[col] > np.percentile(agg_data[col][agg_data[col] != 0], 95)]
    scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))
    binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
    resume_skills_required[col] = binned_series.to_dict()

'''
with open('resume_skills_required.pkl', 'wb') as handle:
  pickle.dump(job_skills_required, handle)
'''

ACCOUNTANT
ADVOCATE
AGRICULTURE
APPAREL
ARTS
AUTOMOBILE
AVIATION
BANKING
BPO
BUSINESS-DEVELOPMENT
CHEF
CONSTRUCTION
CONSULTANT
DESIGNER
DIGITAL-MEDIA
ENGINEERING
FINANCE
FITNESS
HEALTHCARE
HR
INFORMATION-TECHNOLOGY
PUBLIC-RELATIONS
SALES
TEACHER


"\nwith open('resume_skills_required.pkl', 'wb') as handle:\n  pickle.dump(job_skills_required, handle)\n"

In [30]:
with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills = pickle.load(pickle_file)

'''
with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required = pickle.load(pickle_file)
'''

skills_gap_cand = skills_gap_identification(resume_skills[31605080], resume_skills_required['AVIATION'])
skills_gap_cand

{'aircraft maintenance': [3],
 'b': [3],
 'business administration': [3],
 'c': [3],
 'construction': [3],
 'critical thinking': [3],
 'drawing': [3],
 'filing': [3],
 'inquiry': [3],
 'inventory control': [3],
 'investigation': [3],
 'leadership': [3, 4],
 'm': [3],
 'marketing': [3],
 'microsoft office': [3],
 'microsoft word': [3],
 'negotiation': [3],
 'planning': [3, 4],
 'process improvement': [3],
 'procurement': [3],
 'project management': [3],
 'purchasing': [3],
 'quality assurance': [3],
 'quality control': [3],
 'r': [3],
 'reduction': [3],
 'requisition': [3],
 'safety training': [3],
 'scheduling': [3],
 'security clearance': [3],
 'source': [3],
 'supervision': [3],
 'test equipment': [3],
 'time management': [3],
 'track': [3, 4],
 'tracking': [3]}

In [31]:
skills_gap_cand = skills_gap_by_level(skills_gap_cand)
skills_gap_cand

{3: ['aircraft maintenance',
  'b',
  'business administration',
  'c',
  'construction',
  'critical thinking',
  'drawing',
  'filing',
  'inquiry',
  'inventory control',
  'investigation',
  'leadership',
  'm',
  'marketing',
  'microsoft office',
  'microsoft word',
  'negotiation',
  'planning',
  'process improvement',
  'procurement',
  'project management',
  'purchasing',
  'quality assurance',
  'quality control',
  'r',
  'reduction',
  'requisition',
  'safety training',
  'scheduling',
  'security clearance',
  'source',
  'supervision',
  'test equipment',
  'time management',
  'track',
  'tracking'],
 4: ['leadership', 'planning', 'track']}

### 4.2 Gaps between Resume and Job Description

In [33]:
# get frequency table

with open('job_skills.pkl', 'rb') as pickle_file:
    content = pickle.load(pickle_file)

data = df2.copy()
data['skills'] = data['uniq_id'].apply(lambda x: list(content[x].keys()))

mlb = MultiLabelBinarizer()

skills_freq = pd.DataFrame(mlb.fit_transform(data['skills']),
                           columns=mlb.classes_,
                           index=data['skills'].index)

y = data['category']
skills_freq ['y'] = y
skills_freq 

Unnamed: 0,abdomen,absorption,acting,action research,active directory,active listening,acupuncture,acute assessment unit,acute care,acute medicine,...,wound healing,wound management,writing,xero,yield management,yoga,zoning,zoology,zurb foundation,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Registered Nurses
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Clinical Nurse Specialists
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Personal Care Aides
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Registered Nurses
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Physical Medicine and Rehabilitation Physicians
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Registered Nurses
7986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chefs and Head Cooks
7987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Registered Nurses
7988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,General and Operations Managers


In [34]:
# sum frequency by categories

agg_data = skills_freq.groupby(['y']).sum()
agg_data = agg_data.T
agg_data

y,Accountants,Actuaries,Acute Care Nurses,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Advanced Practice Psychiatric Nurses,Advertising Sales Agents,Advertising and Promotions Managers,Aerospace Engineers,Anesthesiologists,...,Telemarketers,Training and Development Managers,Training and Development Specialists,Treasurers and Controllers,Tutors,Upholsterers,Veterinary Technologists and Technicians,"Vocational Education Teachers, Postsecondary",Waiters and Waitresses,Web Developers
abdomen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absorption,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acting,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
action research,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
active directory,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yield management,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yoga,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoning,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# gauge skill levels according to percentiles on frequency

job_skills_required = {}
for col in agg_data.columns:
    print(col)
    counts = agg_data[col][agg_data[col] != 0]
    if len(counts) == 0: # no relevant skills extracted
        job_skills_required[col] = dict()
    else:
        if len(counts.unique()) == 1: # all skills have same frequency
            series = counts
        else:
            series = agg_data[col][agg_data[col] > np.percentile(counts, 95)]
        if len(series.unique()) == 1: # all skills above 5th percentile have same frequency
            scaled_series = series.apply(lambda x: 1)
        else:
            scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))
        binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
        job_skills_required[col] = binned_series.to_dict()

'''
with open('job_skills_required.pkl', 'wb') as handle:
  pickle.dump(job_skills_required, handle)
'''

Accountants
Actuaries
Acute Care Nurses
Administrative Law Judges, Adjudicators, and Hearing Officers
Administrative Services Managers
Advanced Practice Psychiatric Nurses
Advertising Sales Agents
Advertising and Promotions Managers
Aerospace Engineers
Anesthesiologists
Architects, Except Landscape and Naval
Architectural Drafters
Architectural and Engineering Managers
Assemblers and Fabricators, All Other
Assessors
Audio and Video Equipment Technicians
Audiologists
Auditors
Automotive Master Mechanics
Automotive Specialty Technicians
Baggage Porters and Bellhops
Bill and Account Collectors
Biochemists and Biophysicists
Biological Technicians
Biomedical Engineers
Bookkeeping, Accounting, and Auditing Clerks
Bus Drivers, Transit and Intercity
Bus and Truck Mechanics and Diesel Engine Specialists
Business Intelligence Analysts
Business Operations Specialists, All Other
Cardiovascular Technologists and Technicians
Cashiers
Chefs and Head Cooks
Chemical Technicians
Chemists
Chief Executive

"\nwith open('job_skills_required.pkl', 'wb') as handle:\n  pickle.dump(job_skills_required, handle)\n"

In [52]:
with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills = pickle.load(pickle_file)

'''
with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required = pickle.load(pickle_file)
'''

skills_gap_jobs = skills_gap_identification(resume_skills[31605080], job_skills_required['Web Developers'])
skills_gap_jobs

{'angular': [3],
 'c': [3, 4],
 'git': [3],
 'javascript': [3, 4, 5],
 'php': [3],
 'sql': [3]}

In [53]:
skills_gap_jobs = skills_gap_by_level(skills_gap_jobs)
skills_gap_jobs

{3: ['angular', 'c', 'git', 'javascript', 'php', 'sql'],
 4: ['c', 'javascript'],
 5: ['javascript']}

## 5. Pathway Generation

In [39]:
courses = pd.read_excel('Courses.xlsx')
courses = courses.fillna("")
courses['Description'] = courses['jobFamily'] + " " \
                         + courses['Marketing Name'] + " " \
                         + courses['courseName'] + " " \
                         + courses ['moduleName'] + " " \
                         + courses['courseDesc'] + " " \
                         + courses['Outcome Description'] + " " \
                         + courses['competencyUnitDesc']
courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
courses['Description'] = courses['Description'].astype(str)
courses['Description'] = courses['Description'].apply(preprocess)

In [50]:
courses.jobFamily.value_counts()

Software Design & Development                 514
Infocomm Sales & Marketing                    195
Infrastructure Support                        105
Business Analytics                            105
Digital Advertising / Digital Distribution    100
Infrastructure Architecture                    37
Enterprise Mobility                            17
Project Management                             16
Strategy and Architecture                      15
Service Innovation Design                      14
Infocomm Security                              13
Sales and Marketing                            12
Cloud Computing                                 9
Data Centre Management                          9
Product Management                              7
IT Management                                   5
Enterprise Network Design Management            5
IT Outsourcing Management                       1
Generic Skills                                  1
Name: jobFamily, dtype: int64

### 5.1 Doc2Vec

In [40]:
def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

corpus = tagcol_paragraph_embeddings_features(courses)
model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

Test Case

In [47]:
vector = model.infer_vector(skills_gap_jobs[3])

'''
vector = model.infer_vector(
    ['marketing',
     'microsoft office',
     'microsoft word',
     'negotiation',
     'planning',
     'process improvement',
     'procurement',
     'project management',
     'purchasing',
     'quality assurance',
     'quality control'])
'''

res = model.dv.most_similar([vector], topn=10)

In [48]:
course_unique = set()
course_list = []
for i, prob in res:
    if courses.loc[i, 'Marketing Name'] not in course_unique:
        course_unique.add(courses.loc[i, 'Marketing Name'])
        course_list.append((courses.loc[i, 'productId'], courses.loc[i, 'Marketing Name'], courses.loc[i, 'competencyLevel']))
course_list

[(3401.0, 'Express Web Developer (C#)', '3 - Entrant Level'),
 (3006.0, 'NICF - C# Programming and .NET MVC', '4 - Specialist Level'),
 (3005.0, 'C# Programming and .NET MVC', '4 - Specialist Level'),
 (3501.0, 'Express Web Developer (PHP)', '4 - Specialist Level')]

### 5.2 SpaCy

In [55]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')

for level in skills_gap_jobs:
  skills_gap_text = " ".join(skills_gap_jobs[level])

  # get courses of same competency level
  df = courses.copy()
  df = df[df["competencyLevel"].str.contains(str(level))]

  # get similarity score
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  top_courses = df.nlargest(5,'Similarity',keep='all')
  print("Level " + str(level) + ":")
  print("Skills to bridge: " + skills_gap_text)
  for index, row in top_courses.iterrows():
    print(row["productId"],
          row["Marketing Name"],
          round(row["Similarity"],2))
  print("\n")

  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))


Level 3:
Skills to bridge: angular c git javascript php sql
7316.0 Front End Web Development 0.85
1401.0 NICF - Advanced Certificate in Web Development 0.84
14302.0 New Hire and Redeployment – Full Stack Web Developer 0.84
11217.0 ACWD-CAT B 0921A 0.83
5001.0 Data Intelligence 0.83


Level 4:
Skills to bridge: c javascript
6905.0 Full Stack Web Developer Bootcamp - International 0.65
7701.0 Full Stack Web Development Bootcamp 0.65
1746.0 NICF - Java EE Frameworks 0.65
5906.0 Full Stack Web Developer Bootcamp (Sri Lanka) 0.65
9601.0 Applied Master in Analytics & Artificial Intelligence 0.65


Level 5:
Skills to bridge: javascript
203.0 Private Cloud 0.52
12502.0 Cyber Security Management Capstone Project 0.5
3314.0 Express ERP SAP SD 0.48
1501.0 Digital Recruiter 0.48
6606.0 Express ERP SAP Sales and Distribution 0.48


