In [1]:
import pandas as pd
import numpy as np

#File manipulation
import os

#Text processing and cleaning
import contractions # To include english contractions
import re #regex
import string #used to include punctuation during text processing
from collections import Counter #count strings in texts

#Natural Language Tool Kit NLK package
import nltk
from nltk.corpus import stopwords #Stopwords
from nltk.tokenize  import sent_tokenize ,  word_tokenize # Word and sentence tokenizer
from nltk import pos_tag, ngrams #N-grams analysis
from nltk.stem import PorterStemmer, WordNetLemmatizer #Lemmatizer and Stemmer
from nltk.text import Text #for concordance
from nltk.collocations import * 
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder #collocations
from nltk import BigramAssocMeasures, TrigramAssocMeasures  # Measures for evaluating bigram associations
from nltk import bigrams # Generate bigrams from text data


#Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx #Used for network graph

#Topic modeling/ Clustering
from sklearn.feature_extraction.text import CountVectorizer # Bag-of-Words model
from sklearn.decomposition import LatentDirichletAllocation #LDA for topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer #term-frequency inverse document frequency vectorizer
from scipy.cluster.hierarchy import dendrogram, linkage #Hierarchical clustering
from sklearn.metrics import homogeneity_score, completeness_score #for quality of cluestering



In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodzaraya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rodzaraya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rodzaraya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_csv('../data/job_descriptions/training_Data.csv')
df

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
...,...,...,...,...,...
848,Menards,job description\n\nparttime\n\nmake big money ...,Management Internship,1122,"{\n ""Core Responsibilities"": ""Responsibiliti..."
849,Parker,responsibilities\nparkers internship program w...,Human Resources Internship - Corporate (Year-...,3840,"{\n ""Core Responsibilities"": ""Assist in gene..."
850,Borgen Project,the borgen project is an innovative national ...,Writer / Journalist Internship,897,"{\n ""Core Responsibilities"": ""Write one arti..."
851,Wyndham Destinations,put the world on vacation\n\nat wyndham destin...,Inbound Customer Service / Sales (Remote),4604,"{\n ""Core Responsibilities"": ""Answer inbound..."


In [4]:
df.model_response[0]

' {\n  "Core Responsibilities": "Responsible for expanding Google Workspace product adoption across an assigned territory. Build relationships with customers to understand needs and provide Google Workspace solutions. Partner with account teams to construct solutions and grow business for Google Workspace.",\n  "Required Skills": "Bachelor\'s degree or equivalent experience. Experience managing enterprise SaaS accounts and sales cycles.", \n  "Educational Requirements": "Bachelor\'s degree or equivalent experience.",\n  "Experience Level": "Experience managing enterprise SaaS accounts and sales cycles.",\n  "Preferred Qualifications": "Experience building strategic partnerships with enterprise customers. Ability to work through a reseller ecosystem. Excellent communication and strategic thinking skills.",\n  "Compensation and Benefits": "N/A"\n}'

In [5]:
class TextPreprocessor:
    def __init__(self, processing_mode='none', custom_punctuation=None, custom_stopwords=None, sentence_analysis=False):
        """
            Initialization considers Custom punctuation, Stop words, and Lemmatizer or Stemmer.
            Updates custom punctuation and custom stop words set with additional ones if provided.
            The processing mode to standardise variants can be choose between none, Stem and Lemma. Each mode is stored in a different
            column of the dataframe.
            Sentence analysis parameter is used to keep the punctuation symbols required for sentence analysis.

            Parameters:
            - processing_mode: String to decide whether to use 'lemma', 'stem', or 'none' for text processing.
            - custom_punctuation: Additional punctuation characters to remove from text.
            - custom_stopwords: Additional stopwords to remove from text.
            - sentence_analysis: Boolean indicating sentence analysis cleaning steps. This mode will keep the punctuation symbols. 
            
            """
        
        self.punctuation = string.punctuation #Init with all punctuation characters
        
        if custom_punctuation:
            self.punctuation += custom_punctuation #add custom punctuation

        self.stop_words = set(stopwords.words('english'))
        if custom_stopwords:
            self.stop_words.update(custom_stopwords) #add custom stopwords
        
        # Determine which text processing mode to use
        self.processing_mode = processing_mode.lower()
        
        # Set the sentence analysis mode
        self.sentence_analysis = sentence_analysis
        
        #Set the variant standardization mode
        if self.processing_mode == 'lemma':
            self.lemmatizer = WordNetLemmatizer()
        elif self.processing_mode == 'stem':
            self.stemmer = PorterStemmer()

    # Expand contractions using the contractions library
    def expand_contractions(self, text):
        return contractions.fix(text)

    # Split hyphenated words into separate words, like phone numbers or radio fm, age, etc.
    def split_hyphenated_words(self, text):
        return re.sub(r'-', ' ', text)

    def remove_punctuation(self, text):
        return ''.join([char for char in text if char not in self.punctuation])

    def add_space_after_parenthesis(self, text):
        return re.sub(r'\)', ') ', text)

    def to_lowercase(self, text):
        return text.lower()

    def remove_stopwords(self, text):
        words = word_tokenize(text)
        return ' '.join([word for word in words if word not in self.stop_words])

    def remove_extra_whitespace(self, text):
        return re.sub(r'\s+', ' ', text).strip()

    def stem_words(self, text):
        words = word_tokenize(text)
        return ' '.join([self.stemmer.stem(word) for word in words])

    def lemmatize_words(self, text):
        words = word_tokenize(text)
        return ' '.join([self.lemmatizer.lemmatize(word) for word in words])

    # Order matters
    def preprocess(self, text):
        text = self.expand_contractions(text)
        text = self.split_hyphenated_words(text)
        text = self.add_space_after_parenthesis(text)
        
        #In case we need to analyse sentences, we will need the punctuations
        if not self.sentence_analysis:
            text = self.remove_punctuation(text)
        text = self.to_lowercase(text)
        #The stopwords are removed if the users wants to standardise variants.
        #If none is selected, the ouput will just perform previous cleaning steps
        if self.processing_mode != 'none':
            text = self.remove_stopwords(text)
            
        text = self.remove_extra_whitespace(text)
        
        #Select the processing mode for variants
        if self.processing_mode == 'lemma':
            text = self.lemmatize_words(text)
        elif self.processing_mode == 'stem':
            text = self.stem_words(text)
        
        return text

    #Apply preprocessing steps to daframe and create a column base on the processing mode
    def preprocess_dataframe(self, df, column_name):
        if not self.sentence_analysis:
            if self.processing_mode == 'lemma':
                df['processed_lemma'] = df[column_name].apply(self.preprocess)
            elif self.processing_mode == 'stem':
                df['processed_stem'] = df[column_name].apply(self.preprocess)
            else:  # If 'none', apply preprocessing without lemma or stem
                df['processed_cleaned'] = df[column_name].apply(self.preprocess)
        else: # Add different processed columns for sentences
            if self.processing_mode == 'lemma':
                df['processed_lemma_sent'] = df[column_name].apply(self.preprocess)
            elif self.processing_mode == 'stem':
                df['processed_stem_sent'] = df[column_name].apply(self.preprocess)
            else:  # If 'none', apply preprocessing without lemma or stem
                df['processed_cleaned_sent'] = df[column_name].apply(self.preprocess)
        return df

In [6]:
text_preprocessor = TextPreprocessor(processing_mode='none')
df = text_preprocessor.preprocess_dataframe(df, 'job_description')
df

Unnamed: 0,company_name,job_description,position_title,description_length,model_response,processed_cleaned
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo...",minimum qualifications bachelors degree or equ...
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ...",description as an asc you will be highly influ...
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus...",its an amazing time to be joining netflix as w...
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs...",description web designers looking to expand yo...
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo...",at trackfive we have got big goals were on a m...
...,...,...,...,...,...,...
848,Menards,job description\n\nparttime\n\nmake big money ...,Management Internship,1122,"{\n ""Core Responsibilities"": ""Responsibiliti...",job description parttime make big money at men...
849,Parker,responsibilities\nparkers internship program w...,Human Resources Internship - Corporate (Year-...,3840,"{\n ""Core Responsibilities"": ""Assist in gene...",responsibilities parkers internship program wa...
850,Borgen Project,the borgen project is an innovative national ...,Writer / Journalist Internship,897,"{\n ""Core Responsibilities"": ""Write one arti...",the borgen project is an innovative national c...
851,Wyndham Destinations,put the world on vacation\n\nat wyndham destin...,Inbound Customer Service / Sales (Remote),4604,"{\n ""Core Responsibilities"": ""Answer inbound...",put the world on vacation at wyndham destinati...


In [7]:
print(df.processed_cleaned[0])

minimum qualifications bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles preferred qualifications years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills about the job as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizations more producti

In [9]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    sentences = sent_tokenize(text)
    features = {'feature': ""}
    stop_words = set(stopwords.words("english"))
    for sent in sentences:
        if any(criteria in sent for criteria in ['skills', 'education']):
            words = word_tokenize(sent)
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words) # Part of speech
            filtered_words = [word for word, tag in tagged_words if tag not in ['DT', 'IN', 'TO', 'PRP', 'WP']]
            features['feature'] += " ".join(filtered_words)
    return features

In [10]:
df['Features'] = df['job_description'].apply(lambda x : preprocess_text(x)['feature'])
df['Features']

0      minimum qualifications bachelors degree equiva...
1      description asc highly influential growing min...
2      amazing time joining netflix continue transfor...
3      description web designers looking expand profe...
4      trackfive weve got big goals mission revolutio...
                             ...                        
848    job description parttime make big money menard...
849    responsibilities parkers internship program es...
850    borgen project innovative national campaign wo...
851                                                     
852    job handles customer inquiries telephone andor...
Name: Features, Length: 853, dtype: object

In [7]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def get_embeddings(text, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(str(text), return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
    return embeddings

# Select device (MPS for Mac, CUDA for NVIDIA GPUs, CPU as a fallback)
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")   
    
#device = torch.device("cpu")  
print(device)

mps


In [9]:
job_description = df[["job_description", "position_title",'processed_cleaned']][:15]


In [10]:
job_description

Unnamed: 0,job_description,position_title,processed_cleaned
0,minimum qualifications\nbachelors degree or eq...,Sales Specialist,minimum qualifications bachelors degree or equ...
1,description\nas an asc you will be highly infl...,Apple Solutions Consultant,description as an asc you will be highly influ...
2,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,its an amazing time to be joining netflix as w...
3,description\n\nweb designers looking to expand...,Web Designer,description web designers looking to expand yo...
4,at trackfive weve got big goals were on a miss...,Web Developer,at trackfive we have got big goals were on a m...
5,designups is a nashville based design and inte...,Frontend Web Developer,designups is a nashville based design and inte...
6,about the position\n\nthe web designer is resp...,Remote Website Designer,about the position the web designer is respons...
7,job description\n\nzander insurance group is o...,Web Designer,job description zander insurance group is one ...
8,tuff is a growth marketing team working with c...,Web Designer,tuff is a growth marketing team working with c...
9,type of requisition regular\n\nclearance level...,SR. Web Designer,type of requisition regular clearance level mu...


In [None]:
model_name = "bert-base-uncased"
#model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:

model_name = "bert-base-uncased"


model.to(device)

job_desc_embeddings = np.array([get_embeddings(desc, model_name) for desc in job_description['processed_cleaned']]).squeeze()

In [8]:
import spacy
from spacy.matcher import Matcher
import json

# Load the spaCy model (small English model is sufficient for matching)
nlp = spacy.load("en_core_web_sm")



In [10]:


# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Load patterns from the JSONL file
patterns = []
with open('../data/jz_skill_patterns.jsonl', 'r') as f:
    for line in f:
        patterns.append(json.loads(line))

# Add patterns to the matcher
for pattern in patterns:
    matcher.add(pattern['label'], [pattern['pattern']])

# Define a function to apply the matcher and find skills in the text
def find_skills(text):
    doc = nlp(text)
    matches = matcher(doc)
    skills = set()  # To store found skills
    for match_id, start, end in matches:
        skill = doc[start:end].text
        skills.add(skill)
    return skills

# Apply the find_skills function to the 'Features' column of the DataFrame
df['Skills Found'] = df['processed_cleaned'].apply(find_skills)

# Display the new column with the found skills
df[['processed_cleaned', 'Skills Found']]


Unnamed: 0,processed_cleaned,Skills Found
0,minimum qualifications bachelors degree or equ...,"{business, content management, google, mobile,..."
1,description as an asc you will be highly influ...,{business}
2,its an amazing time to be joining netflix as w...,"{play, support, swift, business, schedule, wor..."
3,description web designers looking to expand yo...,"{javascript, support, landing pages, finance, ..."
4,at trackfive we have got big goals were on a m...,"{software, javascript, support, security, desi..."
...,...,...
848,job description parttime make big money at men...,{business}
849,responsibilities parkers internship program wa...,"{software, support, engineering, testing, busi..."
850,the borgen project is an innovative national c...,{schedule}
851,put the world on vacation at wyndham destinati...,"{schedule, router, business}"


In [11]:
def extract_skills_from_json(json_data):
    # Parse the model_response (JSON format in string form)
    parsed = json.loads(json_data)
    
    # Extract relevant fields: Required Skills, Educational Requirements, etc.
    required_skills = parsed.get("Required Skills", "")
    education_requirements = parsed.get("Educational Requirements", "")
    experience_level = parsed.get("Experience Level", "")
    
      # Convert any list fields to strings (join lists with spaces)
    if isinstance(required_skills, list):
        required_skills = " ".join(required_skills)
    
    if isinstance(education_requirements, list):
        education_requirements = " ".join(education_requirements)
    
    if isinstance(experience_level, list):
        experience_level = " ".join(experience_level)
    
    # Combine the relevant sections into one string for training
    combined_text = required_skills + " " + education_requirements + " " + experience_level
    
    return combined_text.strip()
    

# Apply the function to the 'model_response' column to extract text
df['Extracted_Text'] = df['model_response'].apply(extract_skills_from_json)

# Display the extracted text
df[['model_response', 'Extracted_Text']].head()

Unnamed: 0,model_response,Extracted_Text
0,"{\n ""Core Responsibilities"": ""Responsible fo...",Bachelor's degree or equivalent experience. Ex...
1,"{\n ""Core Responsibilities"": ""as an asc you ...",a passion to help people understand how apple ...
2,"{\n ""Core Responsibilities"": ""Help drive bus...",2+ years experience in preferably outbound lic...
3,"{\n ""Core Responsibilities"": ""Designing webs...",2+ years experience in web design. Proficiency...
4,"{\n ""Core Responsibilities"": ""Build and layo...","2+ years of experience with HTML and CSS/SASS,..."


In [12]:
#spacy

from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc


In [13]:
# Function to extract skills from text using the custom entity ruler
def get_skills(text):
    doc = nlp(text)
    skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
    return skills

# Ensure unique skills
def unique_skills(skill_list):
    return list(set(skill_list))


In [14]:
# Create an EntityRuler component explicitly using create_pipe and add it to the pipeline
ruler = nlp.create_pipe("entity_ruler")
nlp.add_pipe(ruler, before="ner")

# Load the patterns from a file (assuming 'patterns' is a valid path to the JSONL file)
ruler.from_disk('../data/jz_skill_patterns.jsonl')

# Check the pipeline components
print(nlp.pipe_names)


# Apply the function to the 'Extracted_Text' column
df["skills"] = df["processed_cleaned"].str.lower().apply(get_skills)


# Show the result
df[['processed_cleaned', 'skills']].head()

['tagger', 'parser', 'entity_ruler', 'ner']


Unnamed: 0,processed_cleaned,skills
0,minimum qualifications bachelors degree or equ...,"[business, business, google, google, google, c..."
1,description as an asc you will be highly influ...,[business]
2,its an amazing time to be joining netflix as w...,"[languages, play, support, support, workflow, ..."
3,description web designers looking to expand yo...,"[marketing, mobile, landing pages, testing, de..."
4,at trackfive we have got big goals were on a m...,"[support, software, databases, javascript, jqu..."


In [15]:
df[['skills','Skills Found']].iloc[0]

skills          [business, business, google, google, google, c...
Skills Found    {business, content management, google, mobile,...
Name: 0, dtype: object

In [12]:
# Load skills from the patterns file (assuming it’s a JSONL file)
# Load skills from the provided JSON file
def load_skills_from_json(skills_file):
    skills_list = []
    with open(skills_file, 'r') as f:
        data = json.load(f)  # Load the entire JSON file
        for skill in data.keys():  # The keys at the top level are the skill names
            skills_list.append(skill)  # Add the skill to the list
    return skills_list

# Load the skills from your JSON file
skills_list = load_skills_from_json('../data/skills.json')

# Print the loaded skills
print("Skills loaded from JSON:", skills_list)

NameError: name 'json' is not defined

In [27]:
print(len(skills_list))

1991


In [28]:
# Step 3: Generate skill embeddings
def get_skill_embeddings(skills, model, tokenizer):
    skill_embeddings = []
    for skill in skills:
        inputs = tokenizer(skill, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
        skill_embeddings.append(embeddings)
    return np.array(skill_embeddings)

# Generate embeddings for the skills
skill_embeddings = get_skill_embeddings(skills_list, model, tokenizer)

# Step 4: Compute similarity and find top skills for each job description
def find_top_skills(job_desc_embedding, skill_embeddings, skills_list, top_n=5):
    similarities = cosine_similarity(job_desc_embedding, skill_embeddings).flatten()
    top_n_indices = similarities.argsort()[-top_n:][::-1]  # Sort and get top N in descending order
    top_skills = [skills_list[i] for i in top_n_indices]
    return top_skills

# Apply the top skills function to each job description in your DataFrame
df['top_skills'] = df['Features'].apply(lambda x: find_top_skills(get_embeddings(x, model_name), skill_embeddings, skills_list))

# Show the results
print(df[['Features', 'top_skills']].head())

NameError: name 'get_skill_embeddings' is not defined