In [2]:
import pandas as pd
import numpy as np

#File manipulation
import os

#Text processing and cleaning
import contractions # To include english contractions
import re #regex
import string #used to include punctuation during text processing
from collections import Counter #count strings in texts

#Natural Language Tool Kit NLK package
import nltk
from nltk.corpus import stopwords #Stopwords
from nltk.tokenize  import sent_tokenize ,  word_tokenize # Word and sentence tokenizer
from nltk import pos_tag, ngrams #N-grams analysis
from nltk.stem import PorterStemmer, WordNetLemmatizer #Lemmatizer and Stemmer
from nltk.text import Text #for concordance
from nltk.collocations import *
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder #collocations
from nltk import BigramAssocMeasures, TrigramAssocMeasures  # Measures for evaluating bigram associations
from nltk import bigrams # Generate bigrams from text data


#Topic modeling/ Clustering
from sklearn.feature_extraction.text import CountVectorizer # Bag-of-Words model
from sklearn.decomposition import LatentDirichletAllocation #LDA for topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer #term-frequency inverse document frequency vectorizer
from scipy.cluster.hierarchy import dendrogram, linkage #Hierarchical clustering
from sklearn.metrics import homogeneity_score, completeness_score #for quality of cluestering

In [3]:
#pip install contractions

In [11]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodzaraya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rodzaraya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rodzaraya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:

#path = '/content/training_data.csv'
path = '../data/job_descriptions/training_data.csv'
df = pd.read_csv(path)
df

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
...,...,...,...,...,...
848,Menards,job description\n\nparttime\n\nmake big money ...,Management Internship,1122,"{\n ""Core Responsibilities"": ""Responsibiliti..."
849,Parker,responsibilities\nparkers internship program w...,Human Resources Internship - Corporate (Year-...,3840,"{\n ""Core Responsibilities"": ""Assist in gene..."
850,Borgen Project,the borgen project is an innovative national ...,Writer / Journalist Internship,897,"{\n ""Core Responsibilities"": ""Write one arti..."
851,Wyndham Destinations,put the world on vacation\n\nat wyndham destin...,Inbound Customer Service / Sales (Remote),4604,"{\n ""Core Responsibilities"": ""Answer inbound..."


In [7]:
df.model_response[0]

' {\n  "Core Responsibilities": "Responsible for expanding Google Workspace product adoption across an assigned territory. Build relationships with customers to understand needs and provide Google Workspace solutions. Partner with account teams to construct solutions and grow business for Google Workspace.",\n  "Required Skills": "Bachelor\'s degree or equivalent experience. Experience managing enterprise SaaS accounts and sales cycles.", \n  "Educational Requirements": "Bachelor\'s degree or equivalent experience.",\n  "Experience Level": "Experience managing enterprise SaaS accounts and sales cycles.",\n  "Preferred Qualifications": "Experience building strategic partnerships with enterprise customers. Ability to work through a reseller ecosystem. Excellent communication and strategic thinking skills.",\n  "Compensation and Benefits": "N/A"\n}'

In [8]:
class TextPreprocessor:
    def __init__(self, processing_mode='none', custom_punctuation=None, custom_stopwords=None, sentence_analysis=False):
        """
            Initialization considers Custom punctuation, Stop words, and Lemmatizer or Stemmer.
            Updates custom punctuation and custom stop words set with additional ones if provided.
            The processing mode to standardise variants can be choose between none, Stem and Lemma. Each mode is stored in a different
            column of the dataframe.
            Sentence analysis parameter is used to keep the punctuation symbols required for sentence analysis.

            Parameters:
            - processing_mode: String to decide whether to use 'lemma', 'stem', or 'none' for text processing.
            - custom_punctuation: Additional punctuation characters to remove from text.
            - custom_stopwords: Additional stopwords to remove from text.
            - sentence_analysis: Boolean indicating sentence analysis cleaning steps. This mode will keep the punctuation symbols.

            """

        self.punctuation = string.punctuation #Init with all punctuation characters

        if custom_punctuation:
            self.punctuation += custom_punctuation #add custom punctuation

        self.stop_words = set(stopwords.words('english'))
        if custom_stopwords:
            self.stop_words.update(custom_stopwords) #add custom stopwords

        # Determine which text processing mode to use
        self.processing_mode = processing_mode.lower()

        # Set the sentence analysis mode
        self.sentence_analysis = sentence_analysis

        #Set the variant standardization mode
        if self.processing_mode == 'lemma':
            self.lemmatizer = WordNetLemmatizer()
        elif self.processing_mode == 'stem':
            self.stemmer = PorterStemmer()

    # Expand contractions using the contractions library
    def expand_contractions(self, text):
        return contractions.fix(text)

    # Split hyphenated words into separate words, like phone numbers or radio fm, age, etc.
    def split_hyphenated_words(self, text):
        return re.sub(r'-', ' ', text)

    def remove_punctuation(self, text):
        return ''.join([char for char in text if char not in self.punctuation])

    def add_space_after_parenthesis(self, text):
        return re.sub(r'\)', ') ', text)

    def to_lowercase(self, text):
        return text.lower()

    def remove_stopwords(self, text):
        words = word_tokenize(text)
        return ' '.join([word for word in words if word not in self.stop_words])

    def remove_extra_whitespace(self, text):
        return re.sub(r'\s+', ' ', text).strip()

    def stem_words(self, text):
        words = word_tokenize(text)
        return ' '.join([self.stemmer.stem(word) for word in words])

    def lemmatize_words(self, text):
        words = word_tokenize(text)
        return ' '.join([self.lemmatizer.lemmatize(word) for word in words])

    # Order matters
    def preprocess(self, text):
        try:
            text = self.expand_contractions(text)
            text = self.split_hyphenated_words(text)
            text = self.add_space_after_parenthesis(text)

            #In case we need to analyse sentences, we will need the punctuations
            if not self.sentence_analysis:
                text = self.remove_punctuation(text)
            text = self.to_lowercase(text)
            #The stopwords are removed if the users wants to standardise variants.
            #If none is selected, the ouput will just perform previous cleaning steps
            if self.processing_mode != 'none':
                text = self.remove_stopwords(text)

            text = self.remove_extra_whitespace(text)

            #Select the processing mode for variants
            if self.processing_mode == 'lemma':
                text = self.lemmatize_words(text)
            elif self.processing_mode == 'stem':
                text = self.stem_words(text)

            return text
        except Exception as e:
            print(f"Error processing text: {text}")
            print(f"Error: {e}")
            return text  # Return original text on failure

    #Apply preprocessing steps to daframe and create a column base on the processing mode
    def preprocess_dataframe(self, df, column_name):
        if not self.sentence_analysis:
            if self.processing_mode == 'lemma':
                df[f'{column_name}_processed_lemma'] = df[column_name].apply(self.preprocess)
            elif self.processing_mode == 'stem':
                df[f'{column_name}_processed_stem'] = df[column_name].apply(self.preprocess)
            else:  # If 'none', apply preprocessing without lemma or stem
                df[f'{column_name}_processed_cleaned'] = df[column_name].apply(self.preprocess)
        else: # Add different processed columns for sentences
            if self.processing_mode == 'lemma':
                df[f'{column_name}_processed_lemma_sent'] = df[column_name].apply(self.preprocess)
            elif self.processing_mode == 'stem':
                df[f'{column_name}_processed_stem_sent'] = df[column_name].apply(self.preprocess)
            else:  # If 'none', apply preprocessing without lemma or stem
                df[f'{column_name}_processed_cleaned_sent'] = df[column_name].apply(self.preprocess)
        return df

In [12]:
text_preprocessor = TextPreprocessor(processing_mode='none')
df = text_preprocessor.preprocess_dataframe(df, 'job_description')
df

Unnamed: 0,company_name,job_description,position_title,description_length,model_response,job_description_processed_cleaned
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo...",minimum qualifications bachelors degree or equ...
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ...",description as an asc you will be highly influ...
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus...",its an amazing time to be joining netflix as w...
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs...",description web designers looking to expand yo...
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo...",at trackfive we have got big goals were on a m...
...,...,...,...,...,...,...
848,Menards,job description\n\nparttime\n\nmake big money ...,Management Internship,1122,"{\n ""Core Responsibilities"": ""Responsibiliti...",job description parttime make big money at men...
849,Parker,responsibilities\nparkers internship program w...,Human Resources Internship - Corporate (Year-...,3840,"{\n ""Core Responsibilities"": ""Assist in gene...",responsibilities parkers internship program wa...
850,Borgen Project,the borgen project is an innovative national ...,Writer / Journalist Internship,897,"{\n ""Core Responsibilities"": ""Write one arti...",the borgen project is an innovative national c...
851,Wyndham Destinations,put the world on vacation\n\nat wyndham destin...,Inbound Customer Service / Sales (Remote),4604,"{\n ""Core Responsibilities"": ""Answer inbound...",put the world on vacation at wyndham destinati...


In [12]:
print(df.job_description_processed_cleaned[0])

minimum qualifications bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles preferred qualifications years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills about the job as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizations more producti

In [14]:
text_preprocessor = TextPreprocessor(processing_mode='lemma')
df = text_preprocessor.preprocess_dataframe(df, 'job_description')
df

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/rodzaraya/nltk_data'
    - '/Users/rodzaraya/.pyenv/versions/3.9.13/nltk_data'
    - '/Users/rodzaraya/.pyenv/versions/3.9.13/share/nltk_data'
    - '/Users/rodzaraya/.pyenv/versions/3.9.13/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [15]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def get_embeddings(text, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
    return embeddings

# Select device (MPS for Mac, CUDA for NVIDIA GPUs, CPU as a fallback)
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device = torch.device("cpu")
print(device)

cpu


In [17]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [18]:
import pickle  # for saving and load models

In [19]:

job_desc_embeddings = np.array([get_embeddings(desc, model_name) for desc in df['job_description_processed_cleaned']]).squeeze()

with open('job_desc_embeddings.pkl', 'wb') as f:
    pickle.dump(job_desc_embeddings, f)



In [None]:
job_desc_embeddings_sent = np.array([get_embeddings(desc, model_name) for desc in df['job_description_processed_cleaned']]).squeeze()

with open('job_desc_embeddings.pkl', 'wb') as f:
    pickle.dump(job_desc_embeddings, f)

In [20]:
import spacy
from spacy.matcher import Matcher
import json

# Load the spaCy model (small English model is sufficient for matching)
nlp = spacy.load("en_core_web_sm")

In [22]:


# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Load patterns from the JSONL file
patterns = []
with open('/content/jz_skill_patterns.jsonl', 'r') as f:
    for line in f:
        patterns.append(json.loads(line))

# Add patterns to the matcher
for pattern in patterns:
    matcher.add(pattern['label'], [pattern['pattern']])

# Define a function to apply the matcher and find skills in the text
def find_skills(text):
    doc = nlp(text)
    matches = matcher(doc)
    skills = set()  # To store found skills
    for match_id, start, end in matches:
        skill = doc[start:end].text
        skills.add(skill)
    return skills

# Apply the find_skills function to the 'Features' column of the DataFrame
df['skills_matched'] = df['job_description_processed_cleaned'].apply(find_skills)

# Display the new column with the found skills
df[['job_description_processed_cleaned', 'skills_matched']]


Unnamed: 0,job_description_processed_cleaned,skills_matched
0,minimum qualifications bachelors degree or equ...,"{business, content management, google, mobile,..."
1,description as an asc you will be highly influ...,{business}
2,its an amazing time to be joining netflix as w...,"{support, business, languages, swift, marketin..."
3,description web designers looking to expand yo...,"{support, content management, accounting, mark..."
4,at trackfive we have got big goals were on a m...,"{support, content management, databases, jquer..."
...,...,...
848,job description parttime make big money at men...,{business}
849,responsibilities parkers internship program wa...,"{support, business, engineering, schedule, pro..."
850,the borgen project is an innovative national c...,{schedule}
851,put the world on vacation at wyndham destinati...,"{schedule, router, business}"


Get the skills from model response for model comparison

In [23]:
def extract_skills_from_json(json_data):
    # Parse the model_response (JSON format in string form)
    parsed = json.loads(json_data)

    # Extract relevant fields: Required Skills, Educational Requirements, etc.
    required_skills = parsed.get("Required Skills", "")
    education_requirements = parsed.get("Educational Requirements", "")
    experience_level = parsed.get("Experience Level", "")

      # Convert any list fields to strings (join lists with spaces)
    if isinstance(required_skills, list):
        required_skills = " ".join(required_skills)

    if isinstance(education_requirements, list):
        education_requirements = " ".join(education_requirements)

    if isinstance(experience_level, list):
        experience_level = " ".join(experience_level)

     #Combine the relevant sections into one string for training
    combined_text = required_skills #+ " " + education_requirements + " " + experience_level

    return combined_text.strip()


# Apply the function to the 'model_response' column to extract text
df['model_required_skills'] = df['model_response'].apply(extract_skills_from_json)

# Display the extracted text
df[['model_response', 'model_required_skills']].head()

Unnamed: 0,model_response,model_required_skills
0,"{\n ""Core Responsibilities"": ""Responsible fo...",Bachelor's degree or equivalent experience. Ex...
1,"{\n ""Core Responsibilities"": ""as an asc you ...",a passion to help people understand how apple ...
2,"{\n ""Core Responsibilities"": ""Help drive bus...",2+ years experience in preferably outbound lic...
3,"{\n ""Core Responsibilities"": ""Designing webs...",2+ years experience in web design. Proficiency...
4,"{\n ""Core Responsibilities"": ""Build and layo...","2+ years of experience with HTML and CSS/SASS,..."


In [24]:
text_preprocessor = TextPreprocessor(processing_mode='none')
df = text_preprocessor.preprocess_dataframe(df, 'model_required_skills')
#df = text_preprocessor.preprocess_dataframe(df, 'top_skills')
df

Unnamed: 0,company_name,job_description,position_title,description_length,model_response,job_description_processed_cleaned,processed_with_sent,skills_matched,model_required_skills,model_required_skills_processed_cleaned
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo...",minimum qualifications bachelors degree or equ...,minimum qualifications bachelors degree equiva...,"{business, content management, google, mobile,...",Bachelor's degree or equivalent experience. Ex...,bachelors degree or equivalent experience expe...
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ...",description as an asc you will be highly influ...,description asc highly influential growing min...,{business},a passion to help people understand how apple ...,a passion to help people understand how apple ...
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus...",its an amazing time to be joining netflix as w...,amazing time joining netflix continue transfor...,"{support, business, languages, swift, marketin...",2+ years experience in preferably outbound lic...,2 years experience in preferably outbound lice...
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs...",description web designers looking to expand yo...,description web designers looking expand profe...,"{support, content management, accounting, mark...",2+ years experience in web design. Proficiency...,2 years experience in web design proficiency w...
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo...",at trackfive we have got big goals were on a m...,trackfive weve got big goals mission revolutio...,"{support, content management, databases, jquer...","2+ years of experience with HTML and CSS/SASS,...",2 years of experience with html and csssass 2 ...
...,...,...,...,...,...,...,...,...,...,...
848,Menards,job description\n\nparttime\n\nmake big money ...,Management Internship,1122,"{\n ""Core Responsibilities"": ""Responsibiliti...",job description parttime make big money at men...,job description parttime make big money menard...,{business},No specific technical or soft skills listed.,no specific technical or soft skills listed
849,Parker,responsibilities\nparkers internship program w...,Human Resources Internship - Corporate (Year-...,3840,"{\n ""Core Responsibilities"": ""Assist in gene...",responsibilities parkers internship program wa...,responsibilities parkers internship program es...,"{support, business, engineering, schedule, pro...","Pursuing Bachelor's degree in HR or business, ...",pursuing bachelors degree in hr or business 2 ...
850,Borgen Project,the borgen project is an innovative national ...,Writer / Journalist Internship,897,"{\n ""Core Responsibilities"": ""Write one arti...",the borgen project is an innovative national c...,borgen project innovative national campaign wo...,{schedule},Strong research and writing skills. Must be ab...,strong research and writing skills must be abl...
851,Wyndham Destinations,put the world on vacation\n\nat wyndham destin...,Inbound Customer Service / Sales (Remote),4604,"{\n ""Core Responsibilities"": ""Answer inbound...",put the world on vacation at wyndham destinati...,,"{schedule, router, business}",6 months of customer service and sales experie...,6 months of customer service and sales experie...


In [26]:
df.model_required_skills_processed_cleaned[10]

'proficiency in html css javascript and basic phpmysql experience building and maintaining web based platforms wordpress knowledge of interaction design fundamentals and web best practices experience with adobe creative suite and audiovideo editing api integration experience'

In [75]:
# Apply the find_skills function to the 'Features' column of the DataFrame
df['model_response_skills_matched'] = df['model_required_skills_processed_cleaned'].apply(find_skills)

# Display the new column with the found skills
df[['model_required_skills_processed_cleaned','model_response_skills_matched', 'skills_matched']]

Unnamed: 0,model_required_skills_processed_cleaned,model_response_skills_matched,skills_matched
0,bachelors degree or equivalent experience experience managing enterprise saas accounts and sales cycles,{},"[business, content management, google, mobile, chrome, collaboration]"
1,a passion to help people understand how apple products can enrich their livesexcellent communication skills allowing you to be as comfortable in front of a small group as you are speaking with individuals years preferred working in a dynamic sales andor results driven environment as well as proven success developing customer loyaltyability to encourage a partner team and grow apple business,{business},[business]
2,2 years experience in preferably outbound licensing understanding of category manufacturing and sales cycles for toysfoodbeverage preferred experience with entertainmentlifestyle brands self starter proactive flexible thrives under pressure superb organizational and multitasking skills excellent communication skills,{},"[support, business, languages, swift, marketing, workflow, schedule, play]"
3,2 years experience in web design proficiency with adobe creative cloud photoshop illustrator indesign strong html css javascript skills familiarity with content management systems,"{content management, design, javascript}","[support, content management, accounting, marketing, adobe photoshop, javascript, mobile, finance, advertising, design, landing pages, testing]"
4,2 years of experience with html and csssass 2 years of experience with programming php applications and lamp stack development experience with es6 javascript and jquery thorough understanding of relational databases and security relating to phpmysql expert knowledge with content management systems either from your own design or from mvc frameworks such as zend laravel etc ability to build and consume custom soap and rest apis 2 years of experience writing unit tests detailing procedures self motivated requiring minimal supervision exceptional organization and communication skills,"{content management, databases, jquery, javascript, laravel, security, design}","[support, content management, databases, jquery, javascript, laravel, software, security, design]"
...,...,...,...
848,no specific technical or soft skills listed,{},[business]
849,pursuing bachelors degree in hr or business 2 years of undergraduate coursework reside within 50 miles of location proficient in ms office customer service skills project management skills,"{project management, business}","[support, business, engineering, schedule, project management, software, testing]"
850,strong research and writing skills must be able to work independently and meet deadlines with very little supervision,{},[schedule]
851,6 months of customer service and sales experience ability to build rapport quickly ask probing questions and meet customers needs intermediate computer knowledge ability to work well under pressure and multitask,{},"[schedule, router, business]"


In [27]:
# Load skills from the patterns file (assuming it’s a JSONL file)
# Load skills from the provided JSON file
def load_skills_from_json(skills_file):
    skills_list = []
    with open(skills_file, 'r') as f:
        data = json.load(f)  # Load the entire JSON file
        for skill in data.keys():  # The keys at the top level are the skill names
            skills_list.append(skill)  # Add the skill to the list
    return skills_list

# Load the skills from your JSON file
skills_list = load_skills_from_json('/content/skills.json')

# Print the loaded skills
print("Skills loaded from JSON:", skills_list)

Skills loaded from JSON: ['.net', '1password', '3d', '3d-reconstruction', 'aboutness', 'abstract-data-type', 'abstract-interpretation', 'abstract-machine', 'access-control', 'access-method', 'access-network', 'accounting', 'active-appearance-model', 'active-database', 'active-networking', 'active-shape-model', 'activemq', 'activity-recognition', 'actuarial-science', 'actuator', 'adaboost', 'adaptive-routing', 'adaptive-system', 'adder', 'adobe-illustrator', 'adobe-photoshop', 'advertising', 'aerial-photography', 'aeronautics', 'aerospace-engineering', 'aerospike', 'agile-project-management', 'agricultural-engineering', 'airflow', 'airtable', 'ajax', 'akamai', 'akka', 'algolia', 'algorithm', 'algorithm-design', 'alpine-linux', 'amazon-api-gateway', 'amazon-athena', 'amazon-cloudfront', 'amazon-cloudwatch', 'amazon-cognito', 'amazon-dynamodb', 'amazon-ebs', 'amazon-ec2', 'amazon-ec2-container-service', 'amazon-eks', 'amazon-elasticache', 'amazon-elasticsearch-service', 'amazon-emr', 'ama

In [28]:
print(len(skills_list))

1991


In [29]:
# Step 3: Generate skill embeddings
def get_skill_embeddings(skills, model, tokenizer):
    skill_embeddings = []
    for skill in skills:
        inputs = tokenizer(skill, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
        skill_embeddings.append(embeddings)
    return np.array(skill_embeddings)

# Generate embeddings for the skills
skill_embeddings = get_skill_embeddings(skills_list, model, tokenizer)



In [30]:
# Save embeddings to a pickle file
with open('job_desc_embeddings_skills.pkl', 'wb') as f:
    pickle.dump(skill_embeddings, f)

print("Embeddings saved successfully!")

Embeddings saved successfully!


In [33]:
skill_embeddings = np.squeeze(skill_embeddings, axis=1)

In [61]:
# Step 4: Compute similarity and find top skills with scores for each job description
def find_top_skills(job_desc_embedding, skill_embeddings, skills_list, threshold=0.55):
    # Ensure that job_desc_embedding is 2D before passing to cosine_similarity
    job_desc_embedding = np.expand_dims(job_desc_embedding, axis=0)  # Make it 2D
    similarities = cosine_similarity(job_desc_embedding, skill_embeddings).flatten()

    # Find all skills with similarity scores above the threshold
    above_threshold_indices = [i for i, score in enumerate(similarities) if score >= threshold]

    # Get the skills and scores for those above the threshold
    top_skills = [skills_list[i] for i in above_threshold_indices]
    top_scores = [similarities[i] for i in above_threshold_indices]

    # Return both the skills and their similarity scores
    return list(zip(top_skills, top_scores))


In [62]:
THRESHOLD = 0.55
# Apply the top skills with different THRESHOLD for comparison
df[f'top_skills_with_scores_th_{THRESHOLD}'] = df.apply(lambda row: find_top_skills(job_desc_embeddings[df.index.get_loc(row.name)], skill_embeddings, skills_list, THRESHOLD), axis=1)
THRESHOLD = 0.6
# Apply the top skills with different THRESHOLD for comparison
df[f'top_skills_with_scores_th_{THRESHOLD}'] = df.apply(lambda row: find_top_skills(job_desc_embeddings[df.index.get_loc(row.name)], skill_embeddings, skills_list, THRESHOLD), axis=1)
THRESHOLD = 0.65
# Apply the top skills with different THRESHOLD for comparison
df[f'top_skills_with_scores_th_{THRESHOLD}'] = df.apply(lambda row: find_top_skills(job_desc_embeddings[df.index.get_loc(row.name)], skill_embeddings, skills_list, THRESHOLD), axis=1)
THRESHOLD = 0.7
# Apply the top skills with different THRESHOLD for comparison
df[f'top_skills_with_scores_th_{THRESHOLD}'] = df.apply(lambda row: find_top_skills(job_desc_embeddings[df.index.get_loc(row.name)], skill_embeddings, skills_list, THRESHOLD), axis=1)


In [65]:
df[['model_required_skills_processed_cleaned', 'top_skills_with_scores_th_0.6']].head()

Unnamed: 0,model_required_skills_processed_cleaned,top_skills_with_scores_th_0.6
0,bachelors degree or equivalent experience experience managing enterprise saas accounts and sales cycles,"[(mobile-prototyping--interaction-design-tools, 0.62559235)]"
1,a passion to help people understand how apple products can enrich their livesexcellent communication skills allowing you to be as comfortable in front of a small group as you are speaking with individuals years preferred working in a dynamic sales andor results driven environment as well as proven success developing customer loyaltyability to encourage a partner team and grow apple business,"[(mobile-prototyping--interaction-design-tools, 0.60625976)]"
2,2 years experience in preferably outbound licensing understanding of category manufacturing and sales cycles for toysfoodbeverage preferred experience with entertainmentlifestyle brands self starter proactive flexible thrives under pressure superb organizational and multitasking skills excellent communication skills,[]
3,2 years experience in web design proficiency with adobe creative cloud photoshop illustrator indesign strong html css javascript skills familiarity with content management systems,[]
4,2 years of experience with html and csssass 2 years of experience with programming php applications and lamp stack development experience with es6 javascript and jquery thorough understanding of relational databases and security relating to phpmysql expert knowledge with content management systems either from your own design or from mvc frameworks such as zend laravel etc ability to build and consume custom soap and rest apis 2 years of experience writing unit tests detailing procedures self motivated requiring minimal supervision exceptional organization and communication skills,"[(mobile-prototyping--interaction-design-tools, 0.6078938)]"


In [67]:
df[df['top_skills_with_scores_th_0.55'].apply(lambda x: len(x) == 0)].shape[0]

191

In [66]:
df[df['top_skills_with_scores_th_0.6'].apply(lambda x: len(x) == 0)].shape[0]

709

In [68]:
df[df['top_skills_with_scores_th_0.65'].apply(lambda x: len(x) == 0)].shape[0]

834

In [41]:
len(df)

853

In [47]:
pd.set_option('display.max_colwidth', None)

In [69]:
#Split the list of skills and scores for each set
df['top_skills_th_0.55'] = df['top_skills_with_scores_th_0.55'].apply(lambda x: [skill for skill, score in x])
df['top_scores_th_0.55'] = df['top_skills_with_scores_th_0.55'].apply(lambda x: [score for skill, score in x])

#format the skills without the -
df['top_skills_th_0.55'] = df['top_skills_th_0.55'].apply(lambda skills: [skill.replace('-', ' ') for skill in skills])

#Split the list of skills and scores for each set
df['top_skills_th_0.6'] = df['top_skills_with_scores_th_0.6'].apply(lambda x: [skill for skill, score in x])
df['top_scores_th_0.6'] = df['top_skills_with_scores_th_0.6'].apply(lambda x: [score for skill, score in x])

#format the skills without the -
df['top_skills_th_0.6'] = df['top_skills_th_0.6'].apply(lambda skills: [skill.replace('-', ' ') for skill in skills])

#Split the list of skills and scores for each set
df['top_skills_th_0.65'] = df['top_skills_with_scores_th_0.65'].apply(lambda x: [skill for skill, score in x])
df['top_scores_th_0.65'] = df['top_skills_with_scores_th_0.65'].apply(lambda x: [score for skill, score in x])

#format the skills without the -
df['top_skills_th_0.65'] = df['top_skills_th_0.65'].apply(lambda skills: [skill.replace('-', ' ') for skill in skills])


In [71]:
df[['top_skills_th_0.6', 'skills_matched']].head()

Unnamed: 0,top_skills_th_0.6,skills_matched
0,[mobile prototyping interaction design tools],"[business, content management, google, mobile, chrome, collaboration]"
1,[mobile prototyping interaction design tools],[business]
2,[],"[support, business, languages, swift, marketing, workflow, schedule, play]"
3,[],"[support, content management, accounting, marketing, adobe photoshop, javascript, mobile, finance, advertising, design, landing pages, testing]"
4,[mobile prototyping interaction design tools],"[support, content management, databases, jquery, javascript, laravel, software, security, design]"


In [57]:
df['skills_matched'] = df['skills_matched'].apply(lambda x: list(x))

In [72]:
df['total_skills_th_0.55'] = df.apply(lambda row: list(set(row['top_skills_th_0.55'] + row['skills_matched'])), axis=1)
df['total_skills_th_0.6'] = df.apply(lambda row: list(set(row['top_skills_th_0.6'] + row['skills_matched'])), axis=1)
df['total_skills_th_0.65'] = df.apply(lambda row: list(set(row['top_skills_th_0.65'] + row['skills_matched'])), axis=1)

In [74]:
df[['total_skills_th_0.6']].iloc[3]

Unnamed: 0,3
total_skills_th_0.6,"[support, content management, accounting, marketing, adobe photoshop, javascript, mobile, finance, advertising, design, landing pages, testing]"


# **Evaluation**

In [76]:
df.to_csv('processed_df.csv',sep=';')

In [77]:
#spacy

from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc


In [78]:
# Function to extract skills from text using the custom entity ruler
def get_skills(text):
    doc = nlp(text)
    skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
    return skills

# Ensure unique skills
def unique_skills(skill_list):
    return list(set(skill_list))


In [79]:

# Create an EntityRuler component using add_pipe with its name
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Load the patterns from a file (assuming 'patterns' is a valid path to the JSONL file)
ruler.from_disk('/content/jz_skill_patterns.jsonl')

# Check the pipeline components to verify the entity_ruler is added
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'entity_ruler', 'ner']


In [80]:

# Apply the function to the 'Extracted_Text' column
df["model_response_skills_matched"] = df["model_required_skills_processed_cleaned"].apply(get_skills)

# Show the result
df[['model_required_skills_processed_cleaned','model_response_skills_matched', 'skills_matched']].head()

Unnamed: 0,model_required_skills_processed_cleaned,model_response_skills_matched,skills_matched
0,bachelors degree or equivalent experience experience managing enterprise saas accounts and sales cycles,[],"[business, content management, google, mobile, chrome, collaboration]"
1,a passion to help people understand how apple products can enrich their livesexcellent communication skills allowing you to be as comfortable in front of a small group as you are speaking with individuals years preferred working in a dynamic sales andor results driven environment as well as proven success developing customer loyaltyability to encourage a partner team and grow apple business,[business],[business]
2,2 years experience in preferably outbound licensing understanding of category manufacturing and sales cycles for toysfoodbeverage preferred experience with entertainmentlifestyle brands self starter proactive flexible thrives under pressure superb organizational and multitasking skills excellent communication skills,[],"[support, business, languages, swift, marketing, workflow, schedule, play]"
3,2 years experience in web design proficiency with adobe creative cloud photoshop illustrator indesign strong html css javascript skills familiarity with content management systems,"[design, javascript, content management]","[support, content management, accounting, marketing, adobe photoshop, javascript, mobile, finance, advertising, design, landing pages, testing]"
4,2 years of experience with html and csssass 2 years of experience with programming php applications and lamp stack development experience with es6 javascript and jquery thorough understanding of relational databases and security relating to phpmysql expert knowledge with content management systems either from your own design or from mvc frameworks such as zend laravel etc ability to build and consume custom soap and rest apis 2 years of experience writing unit tests detailing procedures self motivated requiring minimal supervision exceptional organization and communication skills,"[javascript, jquery, databases, security, content management, design, laravel]","[support, content management, databases, jquery, javascript, laravel, software, security, design]"


In [82]:
# get the skills comparison
def compare_skills(top_skills, model_skills):
    top_skills_set = set(top_skills)
    model_skills_set = set(model_skills)

    # Find intersection (skills present in both)
    common_skills = top_skills_set & model_skills_set

    # Find skills in top_skills but not in model_skills (False Positives)
    false_positives = top_skills_set - model_skills_set

    # Find skills in model_skills but not in top_skills (False Negatives)
    false_negatives = model_skills_set - top_skills_set

    return common_skills, false_positives, false_negatives

# Apply comparison to each row
df['common_skills_06'], df['false_positives_06'], df['false_negatives_06'] = zip(*df.apply(lambda row: compare_skills(row['total_skills_th_0.6'], row['model_response_skills_matched']), axis=1))

# Show the results
df[['total_skills_th_0.6', 'model_response_skills_matched', 'common_skills_06', 'false_positives_06', 'false_negatives_06']].head()

Unnamed: 0,total_skills_th_0.6,model_response_skills_matched,common_skills_06,false_positives_06,false_negatives_06
0,"[content management, business, google, mobile, mobile prototyping interaction design tools, chrome, collaboration]",[],{},"{google, mobile, business, content management, mobile prototyping interaction design tools, chrome, collaboration}",{}
1,"[business, mobile prototyping interaction design tools]",[business],{business},{mobile prototyping interaction design tools},{}
2,"[support, business, languages, swift, marketing, workflow, schedule, play]",[],{},"{support, business, languages, swift, marketing, workflow, schedule, play}",{}
3,"[support, content management, accounting, marketing, adobe photoshop, javascript, mobile, finance, advertising, design, landing pages, testing]","[design, javascript, content management]","{javascript, design, content management}","{support, accounting, marketing, adobe photoshop, mobile, finance, advertising, landing pages, testing}",{}
4,"[support, content management, databases, jquery, javascript, mobile prototyping interaction design tools, laravel, software, security, design]","[javascript, jquery, databases, security, content management, design, laravel]","{content management, databases, jquery, javascript, laravel, security, design}","{support, software, mobile prototyping interaction design tools}",{}


In [90]:
# Function to evaluate the model
def evaluate_skills(predicted_skills, required_skills):
    predicted_set = set(predicted_skills)  # Predicted skills
    required_set = set(required_skills)  # Required skills

    # Calculate True Positives, False Positives, and False Negatives
    true_positives = len(predicted_set & required_set)
    false_positives = len(predicted_set - required_set)
    false_negatives = len(required_set - predicted_set)

    # Precision, Recall, F1-score
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Accuracy
    accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0

    return precision, recall, f1, accuracy

# Apply evaluation to each row in the DataFrame
df['precision_06'], df['recall_06'], df['f1_score_06'], df['accuracy_06'] = zip(*df.apply(lambda row: evaluate_skills(row['total_skills_th_0.6'], row['model_response_skills_matched']), axis=1))



In [91]:
# Check the evaluation scores
print(df[['accuracy_06', 'precision_06', 'recall_06', 'f1_score_06']].head())
# Calculate the average of each evaluation score column
precision_avg = df['precision_06'].mean()
recall_avg = df['recall_06'].mean()
f1_score_avg = df['f1_score_06'].mean()
acc_avg = df['accuracy_06'].mean()

# Print the averages
print(f"Average Accuracy: {acc_avg}")
print(f"Average Precision: {precision_avg}")
print(f"Average Recall: {recall_avg}")
print(f"Average F1-Score: {f1_score_avg}")

   accuracy_06  precision_06  recall_06  f1_score_06
0         0.00          0.00        0.0     0.000000
1         0.50          0.50        1.0     0.666667
2         0.00          0.00        0.0     0.000000
3         0.25          0.25        1.0     0.400000
4         0.70          0.70        1.0     0.823529
Average Accuracy: 0.16039715498481977
Average Precision: 0.16369593501531418
Average Recall: 0.4651146788602826
Average F1-Score: 0.2252726667907449


In [93]:
# Apply evaluation to each row in the DataFrame
df['precision_055'], df['recall_055'], df['f1_score_055'],df['accuracy_055'] = zip(*df.apply(lambda row: evaluate_skills(row['total_skills_th_0.55'], row['model_response_skills_matched']), axis=1))
# Check the evaluation scores
print(df[['accuracy_055','precision_055', 'recall_055', 'f1_score_055']].head())
# Calculate the average of each evaluation score column
precision_avg = df['precision_055'].mean()
recall_avg = df['recall_055'].mean()
f1_score_avg = df['f1_score_055'].mean()
acc_avg = df['accuracy_055'].mean()

# Print the averages
print(f"Average Accuracy: {acc_avg}")
print(f"Average Precision: {precision_avg}")
print(f"Average Recall: {recall_avg}")
print(f"Average F1-Score: {f1_score_avg}")

   accuracy_055  precision_055  recall_055  f1_score_055
0      0.000000       0.000000         0.0      0.000000
1      0.071429       0.071429         1.0      0.133333
2      0.000000       0.000000         0.0      0.000000
3      0.150000       0.150000         1.0      0.260870
4      0.280000       0.280000         1.0      0.437500
Average Accuracy: 0.09596089359059554
Average Precision: 0.09692865312679977
Average Recall: 0.4651146788602826
Average F1-Score: 0.149968204808918


In [94]:
# Apply evaluation to each row in the DataFrame
df['precision_065'], df['recall_065'], df['f1_score_065'],df['accuracy_065'] = zip(*df.apply(lambda row: evaluate_skills(row['total_skills_th_0.55'], row['model_response_skills_matched']), axis=1))
# Check the evaluation scores
print(df[['accuracy_065','precision_065', 'recall_065', 'f1_score_065']].head())
# Calculate the average of each evaluation score column
precision_avg = df['precision_065'].mean()
recall_avg = df['recall_065'].mean()
f1_score_avg = df['f1_score_065'].mean()
acc_avg = df['accuracy_065'].mean()

# Print the averages
print(f"Average Accuracy: {acc_avg}")
print(f"Average Precision: {precision_avg}")
print(f"Average Recall: {recall_avg}")
print(f"Average F1-Score: {f1_score_avg}")

   accuracy_065  precision_065  recall_065  f1_score_065
0      0.000000       0.000000         0.0      0.000000
1      0.071429       0.071429         1.0      0.133333
2      0.000000       0.000000         0.0      0.000000
3      0.150000       0.150000         1.0      0.260870
4      0.280000       0.280000         1.0      0.437500
Average Accuracy: 0.09596089359059554
Average Precision: 0.09692865312679977
Average Recall: 0.4651146788602826
Average F1-Score: 0.149968204808918


In [95]:
df.to_csv('processed_df_with_scores.csv',sep=';')