# Skills embedding
The goal of this notebook is to embed skills in the Saudi Indeed dataset for future skills normalization. The model used for embedding will be "text-embedding-3-large", as it is the same used in the USA branch of the project. 

In [243]:
# Import relevant packages
import pandas as pd
import numpy as np

# Haystack imports
from haystack.components.embedders import OpenAITextEmbedder
from haystack import Pipeline
from haystack import component


In [244]:
# Add helper function to load env variables.

import os
from dotenv import load_dotenv, find_dotenv

# This function expect to find a .env file at the directory above the current directory                                                                                                                     # the format for that file is (without the comment)                                                                                                                                       #API_KEYNAME=AStringThatIsTheLongAPIKeyFromSomeService                                                                                                                                     
def load_env():
    _ = load_dotenv(find_dotenv())


load_env()


# Dataset
This notebook will only use a small sample of the data to test an embedding process. If it is sucesfull, the same process can be used for the complete dataset later on.

In [245]:
# Load data

data = pd.read_csv("test_data_llm_skill_extaction.csv")

In [246]:
# Inspect target rows in the data

data[['key', 'date', 'LLM_tools', 'LLM_soft_skills', 'LLM_technical_skills']]

Unnamed: 0,key,date,LLM_tools,LLM_soft_skills,LLM_technical_skills
0,bb0787efdbbe84b7,2025-03-24,"['Cisco', 'AI', 'Machine Learning', 'Kubernete...","['communication', 'teamwork', 'problem-solving...","['networking', 'digital transformation', 'auto..."
1,8f7f57f4dcbc94d1,2025-03-20,"['Excel', 'Google Sheets', 'Power BI', 'Google...","['communication', 'problem-solving', 'critical...","['data analysis', 'forecasting', 'predictive a..."
2,48d65bcc10545a3e,2025-03-20,"['remote diagnostic tools', 'remote assistance...","['communication', 'teamwork', 'problem-solving...","['technical support', 'data governance', 'data..."
3,0ecff04544a49fed,2025-03-20,"['SQL', 'Tableau', 'Power BI']","['communication', 'teamwork', 'problem-solving...","['data analysis', 'data visualization', 'stati..."
4,c20878f33a483cbc,2025-03-20,"['SQL', 'Oracle', 'SQL Server', 'MySQL', 'Info...","['communication', 'teamwork', 'problem-solving...","['data warehousing', 'ETL', 'data integration'..."
5,81b6c9a7ba1e1c9c,2025-03-20,"['Informatica MDM', 'Informatica MDM Hub', 'In...","['communication', 'collaboration', 'problem-so...","['data governance', 'data quality management',..."
6,4e2a25fec2ac8cb2,2025-03-24,"['Python', 'R', 'Tableau', 'Power BI', 'Azure'...","['leadership', 'mentorship', 'teamwork', 'comm...","['data analysis', 'machine learning', 'statist..."
7,212e5fbdefc45fc1,2025-03-24,"['Python', 'Git', 'PostgreSQL', 'MySQL', 'Apac...","['communication', 'teamwork', 'problem-solving...","['Machine Learning', 'data management', 'distr..."
8,a79f6bc84b521ef5,2025-03-24,"['AI', 'Machine Learning', 'SaaS']","['communication', 'problem-solving', 'teamwork...","['product management', 'product strategy', 'da..."
9,0e7e177cbffb1ff8,2025-03-24,"['Python', 'R', 'Hadoop', 'Spark', 'Tableau', ...","['communication', 'teamwork', 'leadership', 'p...","['Data Science', 'Machine Learning', 'Statisti..."


# Skills embedding

In [None]:
# Build a haystack component that can be used to fetch skills Dataframe

@component
class SkillsFetcher:

    '''
    Fetch information from a row in the dataset regarding key, publication date, llm extracted tool, 
    llm extracted soft skill, and llm extracted technical skills.
    This information can be subsequently embedded by a llm model.
    '''

    @component.output_types(job_info=str)
    def run(self, df, row_number: int): # The component requires a DataFrame a row number (integer position based)
        key = df.iloc[row_number]["key"]
        date = df.iloc[row_number]["date"]
        tools = df.iloc[row_number]["LLM_tools"]
        if isinstance(tools, str):
            tools = tools.replace("'", "").replace("[", "").replace("]", "").split(", ")
        soft_skills = df.iloc[row_number]["LLM_soft_skills"]
        if isinstance(soft_skills, str):
            soft_skills = soft_skills.replace("'", "").replace("[", "").replace("]", "").split(", ")
        technical_skills = df.iloc[row_number]["LLM_technical_skills"]
        if isinstance(technical_skills, str):
            technical_skills = technical_skills.replace("'", "").replace("[", "").replace("]", "").split(", ")
        return {"key": key,
                "date": date,
                "LLM_extracted_tools": tools,
                "LLM_extracted_soft_skills": soft_skills,
                "LLM_extracted_technical_skills": technical_skills}

In [248]:
# Test SkillsFetcher

# Instatiate the SkillsFetcher
fetcher = SkillsFetcher()

# Make a test run
response = fetcher.run(df=data, row_number=0)
response

{'key': 'bb0787efdbbe84b7',
 'date': '2025-03-24',
 'LLM_extracted_tools': ['Cisco',
  'AI',
  'Machine Learning',
  'Kubernetes',
  'Docker',
  'SD-WAN',
  'ACI',
  'DNA',
  'Security',
  'SASE',
  'Cloud'],
 'LLM_extracted_soft_skills': ['communication',
  'teamwork',
  'problem-solving',
  'leadership',
  'adaptability',
  'credibility',
  'relationship building',
  'consultative skills',
  'technical consulting',
  'presentation skills'],
 'LLM_extracted_technical_skills': ['networking',
  'digital transformation',
  'automation',
  'cloud-native applications',
  'MLOps',
  'AIOps',
  'CI/CD pipelines',
  'routing and switching',
  'data center',
  'security',
  'pre-sales strategy',
  'business modeling',
  'technical presentations']}

In [249]:
# Define text embedder component

text_embedder = OpenAITextEmbedder(model="text-embedding-3-large")

In [250]:
# Run the skills fetcher and pass its response for the text embedder

# Create a empty list to hold data
embeddings = []

for i in range(len(data)):
    response = fetcher.run(df=data, row_number=i)

    # Embed tools
    for tool in response['LLM_extracted_tools']:
        try:
           embbeding = text_embedder.run(skill)
        except Exception as e:
            print(f"Error embedding tools for job key {response['key']}: {e}")
            embbeding = {"embedding": []}
            continue
        embeddings.append(pd.DataFrame({"job_key": response["key"], "skill": tool, "skill_type": "tools", "embedding": [embbeding['embedding']], "pub_date": response["date"]}))

    # Embed soft skills
    for soft_skill in response['LLM_extracted_soft_skills']:
        try:
            embbeding = text_embedder.run(soft_skill)
        except Exception as e:
            print(f"Error embedding soft skills for job key {response['key']}: {e}")
            embbeding = {"embedding": []}
            continue
        embeddings.append(pd.DataFrame({"job_key": response["key"], "skill": soft_skill, "skill_type": "soft_skills", "embedding": [embbeding['embedding']], "pub_date": response["date"]}))

    # Embed technical skills
    for technical_skill in response['LLM_extracted_technical_skills']:
        try:
            embbeding = text_embedder.run(technical_skill)
        except Exception as e:
            print(f"Error embedding technical skills for job key {response['key']}: {e}")
            embbeding = {"embedding": []}
            continue
        embeddings.append(pd.DataFrame({"job_key": response["key"], "skill": technical_skill, "skill_type": "technical_skills", "embedding": [embbeding['embedding']], "pub_date": response["date"]}))

In [251]:
# Concatenate all the dataframes into embeddings list a single dataframe

embedded_skills_df = pd.concat(embeddings)

In [252]:
# Inspect final result

embedded_skills_df.reset_index(drop=True).head()

Unnamed: 0,job_key,skill,skill_type,embedding,pub_date
0,bb0787efdbbe84b7,Cisco,tools,"[0.01550822239369154, 0.00821499153971672, -0....",2025-03-24
1,bb0787efdbbe84b7,AI,tools,"[0.01550822239369154, 0.00821499153971672, -0....",2025-03-24
2,bb0787efdbbe84b7,Machine Learning,tools,"[0.01550822239369154, 0.00821499153971672, -0....",2025-03-24
3,bb0787efdbbe84b7,Kubernetes,tools,"[0.01550822239369154, 0.00821499153971672, -0....",2025-03-24
4,bb0787efdbbe84b7,Docker,tools,"[0.01550822239369154, 0.00821499153971672, -0....",2025-03-24


In [253]:
# Save the embedded skills to a CSV file
embedded_skills_df.to_csv("embedded_skills_test.csv", index=False)

### Final remarks
The process depicted here produces a similar output to the one generated by the USA branch of the Job Trends Project. Therefore, it is adequate and can be applied over the entire dataset to produce embedding for skills.