## 1. Importing Libraries/Tools

In [8]:
%pip install gensim
%pip install spacy
%pip install aspose-words

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import aspose.words as aw
import gensim
import json
import multiprocessing
import numpy as np
import pandas as pd
import pickle
import re
import sklearn
import spacy

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

## 2. Load Relevant Data

In [2]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

In [3]:
skills_api = pd.read_excel('all_skills_emsi.xlsx')
skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
skills_api['lemmatized_name'] = skills_api['name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
skills_api = set(skills_api['name']).union(set(skills_api['lemmatized_name']))

# with open('skills_api.pkl', 'wb') as handle:
#   pickle.dump(skills_api, handle)

In [4]:
df1 = pd.read_csv('Resume.csv')
# standardise column names
df1['Text'] = df1['Resume_str']
df1 = df1[['ID','Category','Text']]

df2 = pd.read_csv('Job Description.csv')
# get only english job descriptions
df2 = df2[df2['job_post_lang'].str.lower() == 'en-gb'] 
# standardise column names
df2['ID'] = df2['uniq_id']
df2['Category'] = df2['category']
df2['Text'] = df2['job_requirements']
df2 = df2[['ID','Category','Text']]
# drop empty rows
df2 = df2.dropna().reset_index(drop=True) 

In [5]:
df1["Category"].value_counts()

INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: Category, dtype: int64

In [6]:
df2["Category"].value_counts()

Registered Nurses                                    2513
Personal Care Aides                                   508
Social and Human Service Assistants                   415
Home Health Aides                                     393
Medical and Health Services Managers                  253
                                                     ... 
Health Technologists and Technicians, All Other         1
Electronics Engineering Technicians                     1
Production Workers, All Other                           1
Insurance Adjusters, Examiners, and Investigators       1
Computer Network Architects                             1
Name: Category, Length: 291, dtype: int64

## 3. Skills Extraction

Common Skills Extraction Functions

In [7]:
def preprocess(txt: str) -> str:
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    # then these
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub('<.*?>',' ',txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words
    txt = [token.text for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens: list[str], n: int) -> list[str]:
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def generate_list_of_skills(text: str) -> list[str]:
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for N-grams that SpaCy missed in the noun_chuncks
    for n in range(2, 5):
        for token in n_grams(tokens, n):
            token = token.lower().strip()
            skillset.append(token)

    return skillset

def extract_skills(resume_text: str, skills_api: set, clean: bool=True) -> set:
    if clean == True:
        resume_text = preprocess(resume_text)
    
    # create a set of skills in lowercase from the resume
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    # find all valid skills using Skills API data
    return skillset.intersection(skills_api)

### 3.1 User Skills Extraction

Functions

The following functions are used to identify and extract skills from a user's resume, calculate the years of experience candidate has in said skills based on the dates provided and the final result will be a dictionary where the key is the identified skill and the value is the number of years of experience the candidate has.

In [8]:
def section_break(original_resume_text: str) -> list[str]:
    pattern = r'\b[A-Z]+\b'
    # find all the words that are fully uppercased
    res = list(re.finditer(pattern, original_resume_text))

    # find only those uppercased words that are also NOUN
    res = [x for x in res if nlp(x.group())[0].pos_ == 'NOUN']
    
    ans = []

    # if there is no uppercased NOUN
    if len(res) == 0:
        ans.append(original_resume_text)
    # if there is just one uppercased NOUN
    elif len(res) == 1:
        ans.append(original_resume_text[res[0].span()[1]:])
    else:
        i = 1
        while i < len(res):
            ans.append(original_resume_text[res[i-1].span()[1]:res[i].span()[0]])
            i += 1
        ans.append(original_resume_text[res[i-1].span()[1]:])
    return ans

def date_search(resume: str) -> list[list]:
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list: list[list]) -> list[tuple]:
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1] # previous date
        cur = date_list[i] # current date
        
        # if current date is within 10 characters from the previous date
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            
            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:
            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume: str, skills_api: set, clean: bool=True) -> dict:
    res = {}
    # break down resume string into sections
    sections = section_break(resume)

    for section in sections:
        date_list = date_search(section)
        experience_sec = experience_tagging(date_list)
        for key in experience_sec:
            # for each section (`start` to `end`) that has the year of experience of `key`
            for start, end in experience_sec[key]:
                # find all the skills within the section
                skills_list = extract_skills(section[start:end], skills_api, clean)
                # for each skill, tag the maximum year of experience
                for ele in skills_list:
                    if ele not in res:
                        res[ele] = key
                    else:
                        res[ele] = max(key, res[ele])

    # for all the skills which do not have any level of experience, assign a default value of 1
    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    # return dictionary with sorted keys, `key` is the skill. `value` is the year of experience
    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

Test Cases

In [9]:
# Test Case, Output will be a dictionary showing the skills as the key and the years of experience as the value. 
test_user = df1.iloc[300,2]
test_user_skills = skills_experience_level_identification(test_user, skills_api)
print(df1.iloc[300,1])
test_user_skills

INFORMATION-TECHNOLOGY


{'operations': 18,
 'technical support': 1,
 'desktop support': 1,
 'virus': 1,
 'adware': 1,
 'communication': 1,
 'network support': 1,
 'information technology': 1,
 'plan': 1,
 'color': 1,
 'netscaler': 1,
 'system center configuration manager': 1,
 'operating systems': 1,
 'desktop environment': 1,
 'b': 1,
 'business continuity': 1,
 'ghost': 1,
 'reduction': 1,
 'windows desktop': 1,
 'application deployment': 1,
 'business technology': 1,
 'group policy': 1,
 'security': 1,
 'microsoft certified technology specialist': 1,
 'capacity planning': 1,
 'spyware': 1,
 'streamlining': 1,
 'security policies': 1,
 'management information systems': 1,
 'adobe acrobat': 1,
 'active directory': 1,
 'web servers': 1,
 'disaster recovery': 1,
 'laser printers': 1,
 'citrix certified administrator': 1,
 'coordinate': 1,
 'web development': 1,
 'information systems': 1,
 'management': 1,
 'reliability': 1,
 'laser': 1,
 'project management': 1,
 'planning': 1,
 'r': 1,
 'troubleshooting': 1}

### 3.2 Required Skills Extraction

The next portion will be to extract the skills needed to either i) meet a particular job's requirement or ii) meet the requirement to be on par with candidates with the same job role. We will determine the proficiency needed for each skill by the percentage of resume/job postings having the skill.

The output is a dictionary where the key is the role the candidate is seeking and the value tagged to it is a dictionary with the required skill as the key and the proficiency level tagged as the value. Since the courses dataset that is explored further on has a competency level of either 3, 4 or 5, all proficiency levels will either be 3, 4 or 5 to ease comparision of skills gap.

Functions

In [10]:
def get_significance_table(df: pd.DataFrame, skills_dic: dict) -> pd.DataFrame:
    
    # creates a binary matrix indicating the presence of each skill
    df['Skills'] = df['ID'].apply(lambda x: list(skills_dic[x]))
    mlb = MultiLabelBinarizer()
    table = pd.DataFrame(mlb.fit_transform(df['Skills']),
                         columns=mlb.classes_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # sum by category column and divide by total number of instances
    agg_table = table.groupby(['y']).sum()
    agg_table = agg_table.T / table.groupby(['y']).size()

    # return a panda dataframe that has skills as rows, industry/job role as column
    return agg_table

def find_significant_skills(agg_table: pd.DataFrame) -> dict:

    # gauge skill levels according to percentiles
    skills_required = {}
    for col in agg_table.columns:
        # we only apply percentile method on skills that appear at least once
        skills = agg_table[col][agg_table[col] > 0]

        # no skills extracted > skills required = empty dictionary
        if len(skills) == 0: 
            skills_required[col] = dict()
        
        # some skills extracted > skills required = some dictionary
        else:
            series = agg_table[col][agg_table[col] >= np.percentile(skills, 95)]

            # if all skills above 95th percentile have same frequency, then scale them to 1 (max)
            if len(series.unique()) == 1:
                scaled_series = series.apply(lambda x: 1)
            # if skills above 95th percentile have different frequency, then scale to values between 0 and 1
            else:
                scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))

            # bin skills according to percentiles
            binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
            
            # convert series to dictionary form
            skills_required[col] = binned_series.to_dict()

    # return a dictionary which `key` is the industry/job role and `value` is the dictionary with the corresponding competency level for each skill
    return skills_required

##### 3.2.1 From Resume

In [11]:
# extract skills from all rows > store in dictionary > pickle dump

# df1["Skills"] = df1["Text"].apply(lambda x: extract_skills(x,skills_api,True))
# resume_skills_dic = {}
# for index, row in df1.iterrows():
#   resume_skills_dic[row.ID] = row.Skills

# with open('resume_skills.pkl', 'wb') as handle:
#   pickle.dump(resume_skills_dic, handle)

In [12]:
with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills_dic = pickle.load(pickle_file)
resume_df = df1.copy()

resume_table = get_significance_table(resume_df,resume_skills_dic)
resume_table

y,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
abaqus,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.016949,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
ablation,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.008621,0.000000
ableton live,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.010417,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
abnormal psychology,0.0,0.008475,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.008696,0.009091,0.0,0.0,0.000000,0.000000
absorption,0.0,0.000000,0.015873,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoning,0.0,0.000000,0.000000,0.030928,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.009804
zoology,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
zoom,0.0,0.000000,0.015873,0.000000,0.000000,0.0,0.0,0.0,0.0,0.008333,...,0.000000,0.000000,0.0,0.008547,0.000000,0.000000,0.0,0.0,0.000000,0.000000
zumba,0.0,0.000000,0.000000,0.000000,0.009709,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.051282,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [13]:
resume_skills_required = find_significant_skills(resume_table)

# with open('resume_skills_required.pkl', 'wb') as handle:
#   pickle.dump(resume_skills_required, handle)

resume_skills_required

{'ACCOUNTANT': {'audit': 4,
  'auditing': 3,
  'balance': 4,
  'balance sheet': 3,
  'bank reconciliations': 3,
  'banking': 3,
  'billing': 4,
  'bookkeeping': 3,
  'budget': 4,
  'budgeting': 3,
  'business administration': 4,
  'cash flow': 3,
  'close': 3,
  'closing': 4,
  'collections': 3,
  'communication': 4,
  'customer service': 3,
  'depreciation': 3,
  'detail oriented': 3,
  'file': 3,
  'filing': 3,
  'finance': 4,
  'financial analysis': 3,
  'financial statement': 3,
  'financial statements': 4,
  'forecast': 3,
  'forecasting': 3,
  'general ledger': 4,
  'invoice': 3,
  'invoicing': 3,
  'journal': 4,
  'leadership': 3,
  'ledger': 5,
  'level': 3,
  'm': 3,
  'management': 5,
  'microsoft office': 4,
  'operations': 4,
  'organizational skills': 3,
  'plan': 3,
  'planning': 3,
  'problem solving': 3,
  'purchase': 3,
  'quickbooks': 4,
  'r': 3,
  'reconciliation': 4,
  'research': 3,
  'sales': 4,
  'sales tax': 3,
  'spreadsheets': 3,
  'tax returns': 3,
  'time m

##### 3.2.2 From Job Descriptions

In [14]:
# extract skills from all rows > store in dictionary > pickle dump

# df2["Skills"] = df2["Text"].apply(lambda x: extract_skills(x,skills_api,True))
# job_skills_dic = {}
# for index, row in df2.iterrows():
#   job_skills_dic[row.ID] = row.Skills

# with open('job_skills.pkl', 'wb') as handle:
#   pickle.dump(job_skills_dic, handle)

In [15]:
with open('job_skills.pkl', 'rb') as pickle_file:
    job_skills_dic = pickle.load(pickle_file)
job_df = df2.copy()

job_table = get_significance_table(job_df,job_skills_dic)
job_table

y,Accountants,Actuaries,Acute Care Nurses,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Advanced Practice Psychiatric Nurses,Advertising Sales Agents,Advertising and Promotions Managers,Aerospace Engineers,Anesthesiologists,...,Telemarketers,Training and Development Managers,Training and Development Specialists,Treasurers and Controllers,Tutors,Upholsterers,Veterinary Technologists and Technicians,"Vocational Education Teachers, Postsecondary",Waiters and Waitresses,Web Developers
abdomen,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
absorption,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
academic standards,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
acoustic,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
acoustics,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yoga,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zoning,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zoology,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [16]:
job_skills_required = find_significant_skills(job_table)

# with open('job_skills_required.pkl', 'wb') as handle:
#   pickle.dump(job_skills_required, handle)

job_skills_required

{'Accountants': {'audit': 3,
  'communication': 5,
  'finance': 4,
  'management': 4},
 'Actuaries': {'actuarial exams': 5,
  'communication': 5,
  'covenant': 5,
  'investment': 5},
 'Acute Care Nurses': {'management': 5,
  'nursing': 3,
  'planning': 3,
  'registration': 3,
  'rehabilitation': 3,
  'scheme': 4},
 'Administrative Law Judges, Adjudicators, and Hearing Officers': {'advise': 5,
  'comment': 5,
  'management': 5,
  'triage': 5},
 'Administrative Services Managers': {'c': 5,
  'customer engagement': 5,
  'customer service': 5,
  'grade': 5,
  'head': 5,
  'procurement': 5,
  'scheme': 5},
 'Advanced Practice Psychiatric Nurses': {'act': 4,
  'mental health': 5,
  'rehabilitation': 3,
  'supervision': 3},
 'Advertising Sales Agents': {'b': 5, 'sales': 5},
 'Advertising and Promotions Managers': {'brand': 5,
  'brand identity': 5,
  'budget': 5,
  'facebook': 5,
  'licensing': 5,
  'linkedin': 5,
  'management': 5,
  'marketing': 5,
  'marketing copy': 5,
  'merchandise': 5,

## 4. Skills Gap Identification

Skills Gap Functions

The following portion is our algorithm to idenitfy exactly how far the candidate is from obtaining their desired occupation.

We also added a portion that makes comparison between candidates as we believe that it is also crucial for the candidate to understand where he/she stands in the current market in order for them to obtain their desired occupation. 

In [17]:
# Map years of experience to competency level of either 3, 4 or 5
# if year of experience (YOE) <= 2, then assign entry level; 2 < YOE <= 5, then assign specialist level; YOE > 5, then assign expert level
def user_level_deduction(years: int) -> int:
    if years <= 2:
        return 3
    elif years <= 5:
        return 4
    else:
        return 5

# return skills gap grouped by skills
def skills_gap_identification(skills: dict, skills_required: dict) -> dict:
    diff = {}
    # compare the skill required vs the skill from resumes
    for key in skills_required:
        # if the applicant does not have the skill, then he needs to start picking up from level 3
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        # if the applicant has the skill, find what is his competency level, then suggest him all the subsequent competency levels 
        else:
            user_level = user_level_deduction(skills[key])
            if user_level < skills_required[key]:
                diff[key] = [x for x in range(user_level + 1, skills_required[key] + 1)]
    # return a dictionary with `key` as the skill that need to be bridged and `value` as the difference in competency levels
    return diff

### 4.1 Gaps between User and Job Requirements

In [18]:
with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required = pickle.load(pickle_file)

skills_gap_jobs = skills_gap_identification(test_user_skills, job_skills_required['Web Developers'])
skills_gap_jobs

{'android': [3],
 'angular': [3],
 'c': [3, 4],
 'git': [3],
 'java': [3],
 'javascript': [3, 4, 5],
 'jquery': [3],
 'level': [3],
 'php': [3],
 'research': [3],
 'software development': [3],
 'sql': [3]}

### 4.2 Gaps between User and Other Candidates

In [19]:
with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required = pickle.load(pickle_file)

skills_gap_cand = skills_gap_identification(test_user_skills, resume_skills_required['AVIATION'])
skills_gap_cand

{'aircraft maintenance': [3],
 'ammunition': [3],
 'aviation': [3, 4, 5],
 'balance': [3],
 'budget': [3],
 'business administration': [3],
 'c': [3],
 'communication': [4],
 'communications': [3],
 'construction': [3],
 'control systems': [3],
 'coordinating': [3],
 'critical thinking': [3],
 'customer service': [3, 4],
 'data entry': [3],
 'electronic': [3],
 'electronics': [3],
 'filing': [3],
 'hydraulic': [3],
 'installation': [3],
 'inventory control': [3],
 'inventory management': [3],
 'leadership': [3, 4],
 'level': [3, 4],
 'license': [3],
 'logistics': [3, 4],
 'm': [3],
 'machine': [3],
 'management': [4, 5],
 'marketing': [3],
 'mechanic': [3],
 'mechanics': [3],
 'merchandise': [3],
 'microsoft office': [3],
 'microsoft word': [3],
 'operation': [3, 4],
 'paperwork': [3],
 'planning': [4],
 'presentations': [3],
 'problem solving': [3],
 'process improvement': [3],
 'procurement': [3],
 'profile': [3],
 'purchase': [3],
 'purchasing': [3],
 'quality assurance': [3],
 'qua

## 5. Pathway Generation

Now that the gaps have been identified, it is time for us to generate a learning path of courses that the user can take in order for them to bridge it. There are 2 different methods used in this notebook, Dot2Vec and Spacy.

In [20]:
# Preprocess the courses dataset (provided by Sambaash) and add all the other columns to `Description` so that we can use Doc2Vec to infer word vector
courses = pd.read_excel('Courses.xlsx')
courses = courses.fillna('')
courses['Description'] = courses['jobFamily'] + ' ' \
                         + courses['Marketing Name'] + ' ' \
                         + courses['courseName'] + ' ' \
                         + courses ['moduleName'] + ' ' \
                         + courses['courseDesc'] + ' ' \
                         + courses['Outcome Description'] + ' ' \
                         + courses['competencyUnitDesc']
courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
courses['Description'] = courses['Description'].astype(str)
courses['Description'] = courses['Description'].apply(preprocess)

In [21]:
courses.jobFamily.value_counts()

Software Design & Development                 514
Infocomm Sales & Marketing                    195
Infrastructure Support                        105
Business Analytics                            105
Digital Advertising / Digital Distribution    100
Infrastructure Architecture                    37
Enterprise Mobility                            17
Project Management                             16
Strategy and Architecture                      15
Service Innovation Design                      14
Infocomm Security                              13
Sales and Marketing                            12
Cloud Computing                                 9
Data Centre Management                          9
Product Management                              7
IT Management                                   5
Enterprise Network Design Management            5
IT Outsourcing Management                       1
Generic Skills                                  1
Name: jobFamily, dtype: int64

Functions

In [22]:
# group skills gap by level
# this is to convert `key` to `value`, and `value` to `key`
def skills_gap_by_level(skills_gap: dict) -> dict:
    new_skills_gap = {}
    for skill in skills_gap:
        for level in skills_gap[skill]:
            if level in new_skills_gap:
                new_skills_gap[level].append(skill)
            else:
                new_skills_gap[level] = [skill]
    return new_skills_gap

### 5.1 Doc2Vec

In [23]:
# Tag all `Description` values to train Doc2Vec model
def tagcol_paragraph_embeddings_features(train_data: pd.DataFrame) -> list[TaggedDocument]:

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

# Train Doc2Vec model and save to a pickle file
corpus = tagcol_paragraph_embeddings_features(courses)
d2v_model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)
d2v_model.build_vocab(corpus)
d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
d2v_model.save("d2v_model.pkl")

Test Case

In [24]:
# suggesting courses using Doc2Vec model 
def course_suggestion_d2v(doc2vec_model: Doc2Vec, skills_gap_cand: dict, skills_gap_jobs: dict, courses_dataset: pd.DataFrame) -> dict:
    ans = {'job': {}, 'continuous learning': {}}

    # compare application skillset with job required skill
    for level in skills_gap_jobs:
        # infer a wordvector from all the lacking skills in one competency level
        vector = doc2vec_model.infer_vector(skills_gap_jobs[level])
        # find the top 20 courses that have the most similar wordvector
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []

        # filter out courses with similar name
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                # create the course list per competency level in JSON format
                course_list.append({
                    "Course ID": courses_dataset.loc[i, 'productId'],
                    "Course Name": courses_dataset.loc[i, 'Marketing Name'],
                    "Couse Level": courses_dataset.loc[i, 'competencyLevel']
                })
        # select the top 5 courses only
        ans['job'][level] = course_list[:5]

    # compare application skillset with peer required skill
    for level in skills_gap_cand:
        # infer a wordvector from all the lacking skills in one competency level
        vector = doc2vec_model.infer_vector(skills_gap_cand[level])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        # filter out courses with similar name
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                # create the course list per competency level in JSON format
                course_list.append({
                    "Course ID": courses_dataset.loc[i, 'productId'],
                    "Course Name": courses_dataset.loc[i, 'Marketing Name'],
                    "Couse Level": courses_dataset.loc[i, 'competencyLevel']
                })
        # select the top 5 courses only
        ans['continuous learning'][level] = course_list[:5]

    # return a dictionary with top 5 courses for each competency level
    return ans

In [25]:
course_suggestion_d2v(d2v_model, skills_gap_by_level(skills_gap_cand), skills_gap_by_level(skills_gap_jobs), courses)

{'job': {3: [{'Course ID': 3402.0,
    'Course Name': 'Express Web Developer (PHP)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 3401.0,
    'Course Name': 'Express Web Developer (C#)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 5010.0,
    'Course Name': 'Agile Innovation Primer',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 3504.0,
    'Course Name': 'Express Web Developer (JAVA)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 3801.0,
    'Course Name': 'Express Web Developer (Python)',
    'Couse Level': '3 - Entrant Level'}],
  4: [],
  5: []},
 'continuous learning': {3: [{'Course ID': 7319.0,
    'Course Name': 'Database Design and Implementation',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 546.0,
    'Course Name': 'Coding with Python',
    'Couse Level': '3 - Entrant Level'}],
  4: [{'Course ID': 422.0,
    'Course Name': 'Support and Maintain IT Infrastructure',
    'Couse Level': '4 - Specialist Level'},
   

### 5.2 SpaCy

In [26]:
spacy_model = spacy.load('en_core_web_sm')

Test Case

In [27]:
# different method of doing learning path generation, but this method is too slow
def course_suggestion_spacy(spacy_model: spacy.lang, skills_gap_cand: dict, skills_gap_jobs: dict, courses_dataset: pd.DataFrame) -> dict:
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        skills_gap_text = " ".join(skills_gap_jobs[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                # course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
                course_list.append({
                    "Course ID": row['productId'],
                    "Course Name": row['Marketing Name'],
                    "Couse Level": row['competencyLevel']
                })
        ans['job'][level] = course_list[:5]

    for level in skills_gap_cand:
        skills_gap_text = " ".join(skills_gap_cand[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                # course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
                course_list.append({
                    "Course ID": row['productId'],
                    "Course Name": row['Marketing Name'],
                    "Couse Level": row['competencyLevel']
                })
        ans['continuous learning'][level] = course_list[:5]

    return ans

In [28]:
course_suggestion_spacy(spacy_model, skills_gap_by_level(skills_gap_cand), skills_gap_by_level(skills_gap_jobs), courses)

  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))


{'job': {3: [{'Course ID': 7316.0,
    'Course Name': 'Front End Web Development',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 1401.0,
    'Course Name': 'NICF - Advanced Certificate in Web Development',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 1744.0,
    'Course Name': 'UI Frameworks (SF)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 1743.0,
    'Course Name': 'Front-End Web Development',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 14302.0,
    'Course Name': 'New Hire and Redeployment – Full Stack Web Developer',
    'Couse Level': '3 - Entrant Level'}],
  4: [{'Course ID': 4504.0,
    'Course Name': 'Pearson BTEC HND Level 5 Diploma in Computing (RQF)',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 7701.0,
    'Course Name': 'Full Stack Web Development Bootcamp',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 4401.0,
    'Course Name': 'Data Analytics with Hadoop',
    'Couse Level': '4 - Specia

## 6. Final Function

For the final portion, we complie all our functions into a single function and the final output from our model is a dictionary where the key is the competency level (3, 4 or 5) and the value is the recommended courses for the user at said skill level. 

Make sure that you have `all_skills_emsi.xlsx`, `Courses.xlsx`, `resume_skills_required.pkl`, `job_skills_required.pkl` and `d2v_model.pkl`, and a sample resume `11592605.pdf`

### 6.1 Libraries Required

In [None]:
import aspose.words as aw
import gensim
import json
import multiprocessing
import numpy as np
import pandas as pd
import pickle
import re
import sklearn
import spacy

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

### 6.2 Functions Required

In [17]:
def create_skills_api(skills_api_filename: str) -> set:
    skills_api = pd.read_excel(skills_api_filename)
    skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
    skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
    skills_api['lemmatized_name'] = skills_api['name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
    skills_api = set(skills_api['name']).union(set(skills_api['lemmatized_name']))
    return skills_api

def read_resume(resume_filename: str) -> str:
    resume = aw.Document(resume_filename)
    resume_string = resume.to_string(aw.SaveFormat.TEXT).split('\r\n')
    resume_string = ' '.join(resume_string[1:-3])
    return resume_string

def preprocess(txt: str) -> str:
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    # then these
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub('<.*?>',' ',txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words
    txt = [token.text for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens: list[str], n: int) -> list[str]:
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def generate_list_of_skills(text: str) -> list[str]:
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for N-grams that SpaCy missed in the noun_chuncks
    for n in range(2, 5):
        for token in n_grams(tokens, n):
            token = token.lower().strip()
            skillset.append(token)

    return skillset

def extract_skills(resume_text: str, skills_api: pickle, clean: bool=True) -> set:
    if clean == True:
        resume_text = preprocess(resume_text)
    
    # create a set of skills in lowercase from the resume
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    # find all valid skills using Skills API data
    return skillset.intersection(skills_api)

def section_break(original_resume_text: str) -> list[str]:
    pattern = r'\b[A-Z]+\b'
    # find all the words that are fully uppercased
    res = list(re.finditer(pattern, original_resume_text))

    # find only those uppercased words that are also NOUN
    res = [x for x in res if nlp(x.group())[0].pos_ == 'NOUN']
    
    ans = []

    # if there is no uppercased NOUN
    if len(res) == 0:
        ans.append(original_resume_text)
    # if there is just one uppercased NOUN
    elif len(res) == 1:
        ans.append(original_resume_text[res[0].span()[1]:])
    else:
        i = 1
        while i < len(res):
            ans.append(original_resume_text[res[i-1].span()[1]:res[i].span()[0]])
            i += 1
        ans.append(original_resume_text[res[i-1].span()[1]:])
    return ans

def date_search(resume: str) -> list[list]:
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list: list[list]) -> list[tuple]:
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1] # previous date
        cur = date_list[i] # current date
        
        # if current date is within 10 characters from the previous date
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            
            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:
            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume: str, skills_api: set, clean: bool=True) -> dict:
    res = {}
    # break down resume string into sections
    sections = section_break(resume)

    for section in sections:
        date_list = date_search(section)
        experience_sec = experience_tagging(date_list)
        for key in experience_sec:
            # for each section (`start` to `end`) that has the year of experience of `key`
            for start, end in experience_sec[key]:
                # find all the skills within the section
                skills_list = extract_skills(section[start:end], skills_api, clean)
                # for each skill, tag the maximum year of experience
                for ele in skills_list:
                    if ele not in res:
                        res[ele] = key
                    else:
                        res[ele] = max(key, res[ele])

    # for all the skills which do not have any level of experience, assign a default value of 1
    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    # return dictionary with sorted keys, `key` is the skill. `value` is the year of experience
    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

def get_significance_table(df: pd.DataFrame, skills_dic: dict) -> pd.DataFrame:
    
    # creates a binary matrix indicating the presence of each skill
    df['Skills'] = df['ID'].apply(lambda x: list(skills_dic[x]))
    mlb = MultiLabelBinarizer()
    table = pd.DataFrame(mlb.fit_transform(df['Skills']),
                         columns=mlb.classes_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # sum by category column and divide by total number of instances
    agg_table = table.groupby(['y']).sum()
    agg_table = agg_table.T / table.groupby(['y']).size()

    # return a panda dataframe that has skills as rows, industry/job role as column
    return agg_table

def find_significant_skills(agg_table: pd.DataFrame) -> dict:

    # gauge skill levels according to percentiles
    skills_required = {}
    for col in agg_table.columns:
        # we only apply percentile method on skills that appear at least once
        skills = agg_table[col][agg_table[col] > 0]

        # no skills extracted > skills required = empty dictionary
        if len(skills) == 0: 
            skills_required[col] = dict()
        
        # some skills extracted > skills required = some dictionary
        else:
            series = agg_table[col][agg_table[col] >= np.percentile(skills, 95)]

            # if all skills above 95th percentile have same frequency, then scale them to 1 (max)
            if len(series.unique()) == 1:
                scaled_series = series.apply(lambda x: 1)
            # if skills above 95th percentile have different frequency, then scale to values between 0 and 1
            else:
                scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))

            # bin skills according to percentiles
            binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
            
            # convert series to dictionary form
            skills_required[col] = binned_series.to_dict()

    # return a dictionary which `key` is the industry/job role and `value` is the dictionary with the corresponding competency level for each skill
    return skills_required

def create_skills_required_dictionary(df: pd.DataFrame, skills_api: set,clean: bool=True) -> dict:
    df["Skills"] = df["Text"].apply(lambda x: extract_skills(x,skills_api,clean))
    skills_dic = {}
    for index, row in df.iterrows():
        skills_dic[row.ID] = row.Skills
    table = get_significance_table(df,skills_dic)
    skills_required_dic = find_significant_skills(table)
    return skills_required_dic

# Map years of experience to competency level
def user_level_deduction(years: int) -> int:
    if years <= 2:
        return 3
    elif years <= 5:
        return 4
    else:
        return 5

# return skills gap grouped by skills
def skills_gap_identification(skills: dict, skills_required: dict) -> dict:
    diff = {}
    # compare the skill required vs the skill from resumes
    for key in skills_required:
        # if the applicant does not have the skill, then he needs to start picking up from level 3
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        # if the applicant has the skill, find what is his competency level, then suggest him all the subsequent competency levels 
        else:
            user_level = user_level_deduction(skills[key])
            if user_level < skills_required[key]:
                diff[key] = [x for x in range(user_level + 1, skills_required[key] + 1)]
    # return a dictionary with `key` as the skill that need to be bridged and `value` as the difference in competency levels
    return diff

# group skills gap by level
def skills_gap_by_level(skills_gap: dict) -> dict:
    new_skills_gap = {}
    for skill in skills_gap:
        for level in skills_gap[skill]:
            if level in new_skills_gap:
                new_skills_gap[level].append(skill)
            else:
                new_skills_gap[level] = [skill]
    return new_skills_gap

def create_courses_dataset(courses_dataset_filename: str) -> pd.DataFrame:
    courses = pd.read_excel(courses_dataset_filename)
    courses = courses.fillna("")
    courses['Description'] = courses['jobFamily'] + " " \
                            + courses['Marketing Name'] + " " \
                            + courses['courseName'] + " " \
                            + courses ['moduleName'] + " " \
                            + courses['courseDesc'] + " " \
                            + courses['Outcome Description'] + " " \
                            + courses['competencyUnitDesc']
    courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
    courses['Description'] = courses['Description'].astype(str)
    courses['Description'] = courses['Description'].apply(preprocess)
    return courses

def tagcol_paragraph_embeddings_features(train_data: pd.DataFrame) -> list[TaggedDocument]:

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

def train_d2v_model(courses_dataset: pd.DataFrame) -> Doc2Vec:
    corpus = tagcol_paragraph_embeddings_features(courses_dataset)
    model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def course_suggestion_d2v(doc2vec_model: Doc2Vec, skills_gap_cand: dict, skills_gap_jobs: dict, courses_dataset: pd.DataFrame) -> dict:
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        vector = doc2vec_model.infer_vector(skills_gap_jobs[level])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                # course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
                course_list.append({
                    "Course ID": courses_dataset.loc[i, 'productId'],
                    "Course Name": courses_dataset.loc[i, 'Marketing Name'],
                    "Couse Level": courses_dataset.loc[i, 'competencyLevel']
                })
        ans['job'][level] = course_list[:5]

    for level in skills_gap_cand:
        vector = doc2vec_model.infer_vector(skills_gap_cand[level])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                # course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
                course_list.append({
                    "Course ID": courses_dataset.loc[i, 'productId'],
                    "Course Name": courses_dataset.loc[i, 'Marketing Name'],
                    "Couse Level": courses_dataset.loc[i, 'competencyLevel']
                })
        ans['continuous learning'][level] = course_list[:5]

    return ans

def course_suggestion_spacy(spacy_model: spacy.lang, skills_gap_cand: dict, skills_gap_jobs: dict, courses_dataset: pd.DataFrame) -> dict:
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        skills_gap_text = " ".join(skills_gap_jobs[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                # course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
                course_list.append({
                    "Course ID": row['productId'],
                    "Course Name": row['Marketing Name'],
                    "Couse Level": row['competencyLevel']
                })
        ans['job'][level] = course_list[:5]

    for level in skills_gap_cand:
        skills_gap_text = " ".join(skills_gap_cand[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                # course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
                course_list.append({
                    "Course ID": row['productId'],
                    "Course Name": row['Marketing Name'],
                    "Couse Level": row['competencyLevel']
                })
        ans['continuous learning'][level] = course_list[:5]

    return ans

### 6.3 Pre-Trained Resources for Live Usage

In [15]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

# this section is only called when EMSI Skills API is updated
skills_api = create_skills_api('all_skills_emsi.xlsx')
with open('skills_api.pkl', 'wb') as handle:
  pickle.dump(skills_api, handle)
print("---------------------Skills API successfully created---------------------------")

# this section is only called when there are new resumes
df1 = pd.read_csv('Resume.csv')
df1['Text'] = df1['Resume_str']
df1 = df1[['ID','Category','Text']]
resume_skills_required = create_skills_required_dictionary(df1,skills_api,True)
with open('resume_skills_required.pkl', 'wb') as handle:
  pickle.dump(resume_skills_required, handle)
print("----------------resume_skills_required.pkl successfully created----------------")

# this section is only called when there are new job postings
df2 = pd.read_csv('Job Description.csv')
df2 = df2[df2['job_post_lang'].str.lower() == 'en-gb'] 
df2['ID'] = df2['uniq_id']
df2['Category'] = df2['category']
df2['Text'] = df2['job_requirements']
df2 = df2[['ID','Category','Text']]
df2 = df2.dropna().reset_index(drop=True)
job_skills_required = create_skills_required_dictionary(df2,skills_api,True)
with open('job_skills_required.pkl', 'wb') as handle:
  pickle.dump(job_skills_required, handle)
print("------------------job_skills_required.pkl successfully created------------------")

# this section is only called when there are new courses provided by Sambaash
courses_dataset = create_courses_dataset('Courses.xlsx')
doc2vec_model = train_d2v_model(courses_dataset)
doc2vec_model.save("d2v_model.pkl")
print("-------------------Doc2Vec model successfully created-------------------")


----------------resume_skills_required.pkl successfully created----------------
---------------job_skills_required.pkl successfully created----------------
---------------Doc2Vec model successfully created----------------


In [18]:
# this encoder class is to encode the integer, float and integer array in JSON output
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

### 6.4 High-level Extraction for The Final Function

In [19]:
def final_course_suggestion_d2v(resume_text: str, peer_industry: str, job_name: str, skills_api: set, 
                                                                      resume_skills_required_pickle: dict, 
                                                                      job_skills_required_pickle: dict, 
                                                                      courses_dataset: pd.DataFrame, 
                                                                      doc2vec_model: Doc2Vec, 
                                                                      clean: bool=True):
    """
    This function returns list of courses corresponding to different competency levels for both job requirements and continuous learning.

    Parameters
    ----------
    resume_text : string
        The extracted text from a resume.
    peer_industry: string
        The industry that you want to compare yourself with. 
    job_name: string
        The particular role that applicants are applying to.
    skills_api : set
        A set that contains all the recognized skills. Created based on EMSI skills API.
    resume_skills_required_pickle: dictionary
        A dictionary that stores all the good-to-have skills and the corresponding competency levels for each industry.
    job_skills_required_pickle: dictionary
        A dictionary that stores all the job specific skills and the corresponding competency levels for each role.
    courses_dataset: pd.DataFrame
        The dataset that contains all the information of courses. Provided by Sambaash.
    doc2vec_model: doc2vec model
        The pretrained doc2vec model to compare the similarity between skills required and courses description.
    clean: boolean
        The boolean that indicates whether we shall clean the text.
      

    Returns
    -------
    dict (a nested dictionary that contains all the courses information)

    See Also
    --------
    final_course_suggestion_spacy : A similar method, but it used spacy pre-trained model instead of doc2vec model

    Examples
    --------
    >>> final_course_suggestion_d2v(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                                    skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, doc2vec_model, True)
    {'job': 
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
     'continuous learning':
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
    }
    """

    if peer_industry not in resume_skills_required_pickle:
        return 'Industry Not Found'
    if job_name not in job_skills_required_pickle:
        return 'Job Not Found'
    
    my_resume_skills = skills_experience_level_identification(resume_text, skills_api, clean)

    skills_gap_cand = skills_gap_by_level(skills_gap_identification(my_resume_skills, resume_skills_required_pickle[peer_industry]))
    skills_gap_jobs = skills_gap_by_level(skills_gap_identification(my_resume_skills, job_skills_required_pickle[job_name]))
    
    return course_suggestion_d2v(doc2vec_model, skills_gap_cand, skills_gap_jobs, courses_dataset)




def final_course_suggestion_spacy(resume_text: str, peer_industry: str, job_name: str, skills_api: set, 
                                                                      resume_skills_required_pickle: dict, 
                                                                      job_skills_required_pickle: dict, 
                                                                      courses_dataset: pd.DataFrame, 
                                                                      spacy_model: spacy.lang.en.English, 
                                                                      clean: bool=True):
    """
    This function returns list of courses corresponding to different competency levels for both job requirements and continuous learning.

    Parameters
    ----------
    resume_text : string
        The extracted text from a resume.
    peer_industry: string
        The industry that you want to compare yourself with.
    job_name: string
        The particular role that applicants are applying to.
    skills_api : set
        A set that contains all the recognized skills. Created based on EMSI skills API.
    resume_skills_required_pickle: dictionary
        A dictionary that stores all the good-to-have skills and the corresponding competency levels for each industry.
    job_skills_required_pickle: dictionary
        A dictionary that stores all the job specific skills and the corresponding competency levels for each role.
    courses_dataset: pd.DataFrame
        The dataset that contains all the information of courses. Provided by Sambaash.
    spacy_model: spacy pre-trained model
        The pretrained spacy model to compare the similarity between skills required and courses description.
    clean: boolean
        The boolean that indicates whether we shall clean the text.
      

    Returns
    -------
    dict (a nested dictionary that contains all the courses information)

    See Also
    --------
    final_course_suggestion_d2v: A similar method, but it used doc2vec model instead of spacy pre-trained model

    Examples
    --------
    >>> final_course_suggestion_spacy(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                                      skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, spacy_model, True)
    {'job': 
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
     'continuous learning':
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
    }
    """
    
    if peer_industry not in resume_skills_required_pickle:
        return 'Industry Not Found'
    if job_name not in job_skills_required_pickle:
        return 'Job Not Found'
    
    my_resume_skills = skills_experience_level_identification(resume_text, skills_api, clean)

    skills_gap_cand = skills_gap_by_level(skills_gap_identification(my_resume_skills, resume_skills_required_pickle[peer_industry]))
    skills_gap_jobs = skills_gap_by_level(skills_gap_identification(my_resume_skills, job_skills_required_pickle[job_name]))
    
    return course_suggestion_spacy(spacy_model, skills_gap_cand, skills_gap_jobs, courses_dataset)

#### Test Run

In [20]:
%%time
resume_text = read_resume('11592605.pdf')
courses_dataset = create_courses_dataset('Courses.xlsx')

with open('skills_api.pkl', 'rb') as pickle_file:
    skills_api = pickle.load(pickle_file)

with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required_pickle = pickle.load(pickle_file)

with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required_pickle = pickle.load(pickle_file)

with open('d2v_model.pkl', 'rb') as pickle_file:
    doc2vec_model = pickle.load(pickle_file)

spacy_model = spacy.load('en_core_web_sm')

CPU times: user 1min, sys: 197 ms, total: 1min
Wall time: 1min 1s


In [21]:
%%time
final_course_suggestion_d2v(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                            skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, doc2vec_model, True)

CPU times: user 328 ms, sys: 22.8 ms, total: 350 ms
Wall time: 381 ms


{'job': {3: [{'Course ID': 1401.0,
    'Course Name': 'NICF - Advanced Certificate in Web Development',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 3504.0,
    'Course Name': 'Express Web Developer (JAVA)',
    'Couse Level': '3 - Entrant Level'}],
  4: [{'Course ID': 2009.0,
    'Course Name': 'NICF - Professional Diploma in Web Development',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 2005.0,
    'Course Name': 'Advanced Certificate in Web Development (E-Learning)',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 13705.0,
    'Course Name': 'Applied Degree in Software Engineering Zhangzhou Institute of Science & Technology',
    'Couse Level': '4 - Specialist Level'}],
  5: []},
 'continuous learning': {3: [{'Course ID': 8702.0,
    'Course Name': 'Agile Product Management (Implementation)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 2901.0,
    'Course Name': 'Capstone Project using Java (SF)',
    'Couse Level': '3 - Entran

In [22]:
json.dumps(final_course_suggestion_d2v(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                                       skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, doc2vec_model, True), cls=NpEncoder)

'{"job": {"3": [{"Course ID": 1401.0, "Course Name": "NICF - Advanced Certificate in Web Development", "Couse Level": "3 - Entrant Level"}, {"Course ID": 3504.0, "Course Name": "Express Web Developer (JAVA)", "Couse Level": "3 - Entrant Level"}], "4": [{"Course ID": 2009.0, "Course Name": "NICF - Professional Diploma in Web Development", "Couse Level": "4 - Specialist Level"}, {"Course ID": 2005.0, "Course Name": "Advanced Certificate in Web Development (E-Learning)", "Couse Level": "4 - Specialist Level"}], "5": []}, "continuous learning": {"3": [{"Course ID": 8702.0, "Course Name": "Agile Product Management (Implementation)", "Couse Level": "3 - Entrant Level"}, {"Course ID": 7319.0, "Course Name": "Database Design and Implementation", "Couse Level": "3 - Entrant Level"}, {"Course ID": 2901.0, "Course Name": "Capstone Project using Java (SF)", "Couse Level": "3 - Entrant Level"}], "4": [{"Course ID": 7323.0, "Course Name": "Advanced Network", "Couse Level": "4 - Specialist Level"}, {

In [38]:
%%time
final_course_suggestion_spacy(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                              skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, spacy_model, True)

  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))


Wall time: 1min 25s


{'job': {3: [{'Course ID': 11217.0,
    'Course Name': 'ACWD-CAT B 0921A',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 14302.0,
    'Course Name': 'New Hire and Redeployment – Full Stack Web Developer',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 1401.0,
    'Course Name': 'NICF - Advanced Certificate in Web Development',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 9517.0,
    'Course Name': 'Advanced Certificate in Infocomm Technology (Software and Applications)',
    'Couse Level': '3 - Entrant Level'},
   {'Course ID': 11302.0,
    'Course Name': 'Diploma in Software Engineering',
    'Couse Level': '3 - Entrant Level'}],
  4: [{'Course ID': 4401.0,
    'Course Name': 'Data Analytics with Hadoop',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 9601.0,
    'Course Name': 'Applied Master in Analytics & Artificial Intelligence',
    'Couse Level': '4 - Specialist Level'},
   {'Course ID': 4504.0,
    'Course Name': 'Pearson BTEC H

In [39]:
json.dumps(final_course_suggestion_spacy(resume_text, 'INFORMATION-TECHNOLOGY', 'Software Developers, Applications',
                                         skills_api, resume_skills_required_pickle, job_skills_required_pickle, courses_dataset, spacy_model, True), cls=NpEncoder)

  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: spacy_model(skills_gap_text).similarity(spacy_model(str(x))))


'{"job": {"3": [{"Course ID": 11217.0, "Course Name": "ACWD-CAT B 0921A", "Couse Level": "3 - Entrant Level"}, {"Course ID": 14302.0, "Course Name": "New Hire and Redeployment \\u2013 Full Stack Web Developer", "Couse Level": "3 - Entrant Level"}, {"Course ID": 1401.0, "Course Name": "NICF - Advanced Certificate in Web Development", "Couse Level": "3 - Entrant Level"}, {"Course ID": 9517.0, "Course Name": "Advanced Certificate in Infocomm Technology (Software and Applications)", "Couse Level": "3 - Entrant Level"}, {"Course ID": 11302.0, "Course Name": "Diploma in Software Engineering", "Couse Level": "3 - Entrant Level"}], "4": [{"Course ID": 4401.0, "Course Name": "Data Analytics with Hadoop", "Couse Level": "4 - Specialist Level"}, {"Course ID": 9601.0, "Course Name": "Applied Master in Analytics & Artificial Intelligence", "Couse Level": "4 - Specialist Level"}, {"Course ID": 4504.0, "Course Name": "Pearson BTEC HND Level 5 Diploma in Computing (RQF)", "Couse Level": "4 - Specialis