In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
%pip install skillNer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

GLOVE_PATH = "/kaggle/input/skill-extractor/glove.6B.100d.txt"  # Path to GloVe file
SKILLS_EXCEL = "/kaggle/input/skill-extractor/Merged_Unique_Skills.csv"      # Excel with 'Skill' column
INPUT_CSV = "/kaggle/input/skill-extractor/Linkedin_data_no_duplicates v1.0.csv"      # Input CSV file with a column "job_desc"
OUTPUT_EXCEL = "ngram_skills.xlsx"
OUTPUT_CSV = "job_posting_with_skills.csv"
OUTPUT_GLOVE_EXCEL = "matched_skills_glove.xlsx"
OUTPUT_NER_EXCEL = "matched_skills_ner.xlsx"
EMBEDDING_DIM = 100
SIMILARITY_THRESHOLD = 0.89

def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            word = parts[0]
            vector = list(map(float, parts[1:]))
            embeddings[word] = vector
    return embeddings




Collecting skillNer
  Downloading skillNer-1.0.3.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: skillNer
  Building wheel for skillNer (setup.py) ... [?25l[?25hdone
  Created wheel for skillNer: filename=skillNer-1.0.3-py3-none-any.whl size=25625 sha256=4e94c84de05134b24976f92990d5b678950f7e98c639d14ece83a2cdc3a2534f
  Stored in directory: /root/.cache/pip/wheels/62/01/98/b823d6086aacca94c7d9083081aee3effca467bedb621410e9
Successfully built skillNer
Installing collected packages: skillNer
Successfully installed skillNer-1.0.3
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
glove = load_glove_embeddings(GLOVE_PATH)

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from numpy.linalg import norm

# For NER-based extraction with GPU support
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from skillNer.cleaner import Cleaner
import spacy
import torch

# For progress bar
from tqdm import tqdm

# Uncomment these if needed:
# nltk.download('punkt')
# nltk.download('stopwords')

# ---------------------------------------------
#        GloVe-Based Extraction Functions
# ---------------------------------------------
ps = PorterStemmer()

def get_stem_set(text):
    """
    Returns a set of stemmed tokens for a given text.
    Uses strict normalization (removes punctuation but keeps stopwords) and stems each token.
    """
    text = re.sub(r'[-]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text.lower().strip())
    return set(ps.stem(token) for token in tokens)

def load_glove_embeddings(file_path):
    """Loads GloVe embeddings from a file."""
    print("Loading GloVe embeddings...")
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = list(map(float, parts[1:]))
            embeddings[word] = vector
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

stop_words = set(stopwords.words('english'))

def preprocess(text):
    """
    Remove hyphens (convert to space), then remove non-alphabetic characters,
    lowercase, tokenize, and remove stopwords.
    Returns a list of tokens.
    """
    text = re.sub(r'[-]', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t not in stop_words]

def normalize_text(text):
    """
    Normalizes text for matching by preprocessing and then joining tokens.
    (Stopwords are removed.)
    """
    tokens = preprocess(text)
    return ' '.join(tokens)

def strict_normalize(text):
    """
    Strict normalization: replace hyphens with space, remove punctuation,
    lowercase the text, but do not remove stopwords.
    """
    text = re.sub(r'[-]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()

def average_vector(tokens, glove, dim=100):
    """
    Computes an average vector for a list of tokens using the provided GloVe embeddings.
    """
    vectors = [glove[t] for t in tokens if t in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

def cosine_similarity(vec1, vec2):
    """
    Returns cosine similarity between two vectors.
    """
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-8)

def generate_ngrams(tokens, n_range=(1, 3)):
    """
    Generates 1–3 word n-grams from a list of tokens.
    """
    ngrams = []
    for n in range(n_range[0], n_range[1] + 1):
        ngrams += [' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
    return ngrams

def match_phrases_to_skills(job_desc, skill_list, glove, threshold=0.6, dim=100):
    """
    Matches phrases in the job description to a list of skills using cosine similarity of GloVe embeddings.
    Returns a sorted list of tuples: (Skill, Similarity, Matched Phrase)
    """
    tokens = preprocess(job_desc)
    phrases = generate_ngrams(tokens, n_range=(1, 3))
    
    normalized_skill_dict = {}
    for skill in skill_list:
        norm_skill = normalize_text(skill)
        if 1 <= len(norm_skill.split()) <= 3:
            normalized_skill_dict[norm_skill] = skill

    matched_phrases = []
    
    for phrase in phrases:
        norm_phrase = normalize_text(phrase)
        if norm_phrase in normalized_skill_dict:
            candidate = normalized_skill_dict[norm_phrase]
            extracted_stems = get_stem_set(phrase)
            candidate_stems = get_stem_set(candidate)
            if len(extracted_stems) < len(candidate_stems) and extracted_stems.issubset(candidate_stems):
                continue
            matched_phrases.append((candidate, 1.0, phrase))
        else:
            best_match = None
            best_score = 0
            for norm_skill, orig_skill in normalized_skill_dict.items():
                skill_tokens = preprocess(orig_skill)
                skill_vec = average_vector(skill_tokens, glove, dim)
                phrase_tokens = phrase.split()
                phrase_vec = average_vector(phrase_tokens, glove, dim)
                sim = cosine_similarity(skill_vec, phrase_vec)
                if sim > best_score:
                    best_score = sim
                    best_match = orig_skill
            if best_score >= threshold:
                extracted_stems = get_stem_set(phrase)
                candidate_stems = get_stem_set(best_match)
                if len(extracted_stems) < len(candidate_stems) and extracted_stems.issubset(candidate_stems):
                    continue
                matched_phrases.append((best_match, best_score, phrase))
    
    return sorted(matched_phrases, key=lambda x: -x[1])

def extract_glove_skills(job_desc, glove, skill_list, similarity_threshold=0.6, emb_dim=100):
    """
    Extracts skills from a job description using the GloVe-based matching method.
    Returns a semicolon-separated string of "skill:score" pairs.
    """
    glove_results = match_phrases_to_skills(job_desc, skill_list, glove, threshold=similarity_threshold, dim=emb_dim)
    unique_matches = {}
    for skill, score, _ in glove_results:
        if skill not in unique_matches or score > unique_matches[skill]:
            unique_matches[skill] = score
    return "; ".join([f"{skill}:{unique_matches[skill]:.2f}" for skill in unique_matches])

# ---------------------------------------------
#          NER-Based Extraction Functions
# ---------------------------------------------
def extract_ner_skills(job_desc, ner_pipeline, cleaner):
    """
    Extracts skills from a job description using a Hugging Face NER pipeline.
    Returns a semicolon-separated string of "skill:score" pairs.
    """
    job_desc_clean = cleaner(job_desc)
    ner_results = ner_pipeline(job_desc_clean)
    unique_skills_ner = {}
    for ent in ner_results:
        skill = ent['word']
        score = ent['score']
        if skill in unique_skills_ner:
            unique_skills_ner[skill] = max(unique_skills_ner[skill], score)
        else:
            unique_skills_ner[skill] = score
    return "; ".join([f"{skill}:{unique_skills_ner[skill]:.2f}" for skill in unique_skills_ner])

# ---------------------------------------------
#            Main Processing Script
# ---------------------------------------------
if __name__ == "__main__":
    print("========== Starting Processing ==========")
    # ---------------------------
    # 1. Set Paths and Parameters
    # ---------------------------                     # GloVe embedding dimensions

    # ---------------------------
    # 2. Load GloVe Embeddings & Skill List
    # ---------------------------    
    print("Loading skill list...")
    df_skills = pd.read_csv(SKILLS_EXCEL)
    full_skill_list = df_skills['Skill'].dropna().tolist()
    filtered_skill_list = [skill for skill in full_skill_list if 1 <= len(normalize_text(skill).split()) <= 3]
    print(f"Filtered skill list: {len(filtered_skill_list)} skills retained.")

    # ---------------------------
    # 3. Set Up NER Pipeline with GPU Optimization
    # ---------------------------
    device = 0 if torch.cuda.is_available() else -1
    print(f"Setting up NER pipeline (using {'GPU' if device==0 else 'CPU'})...")
    nlp = spacy.load("en_core_web_sm")
    cleaner = Cleaner(
        to_lowercase=True,
        include_cleaning_functions=[
            "remove_punctuation", "remove_extra_space", "remove_redundant", "lem_text(nlp)"
        ]
    )
    model_name = "algiraldohe/lm-ner-linkedin-skills-recognition"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device, aggregation_strategy="simple")
    
    # ---------------------------
    # 4. Read Input CSV and Process Each Job Description
    # ---------------------------
    print("Reading input CSV file...")
    headers = pd.read_csv(INPUT_CSV, nrows=0).columns

    # Then read from line 250 onwards, keeping the headers manually
    df_jobs = pd.read_csv(INPUT_CSV, skiprows=range(1, 249), header=None, names=headers)

    # df_jobs = pd.read_csv(INPUT_CSV, skiprows=250)
    
    if "description" not in df_jobs.columns:
        raise ValueError("Input CSV must contain a column named 'description'")
    
    # Initialize new columns
    df_jobs["Glove_Skills"] = ""
    df_jobs["NER_Skills"] = ""
    
    print("Extracting skills for each job description (this may take a while)...")
    # Process rows one at a time and save progress every 50 jobs.
    for idx in tqdm(df_jobs.index, desc="Processing jobs"):
        job_description = df_jobs.at[idx, "description"]
        glove_skills = extract_glove_skills(job_description, glove, filtered_skill_list, SIMILARITY_THRESHOLD, EMBEDDING_DIM)
        ner_skills = extract_ner_skills(job_description, ner_pipeline, cleaner)
        df_jobs.at[idx, "Glove_Skills"] = glove_skills
        df_jobs.at[idx, "NER_Skills"] = ner_skills
        
        # Save progress every 50 jobs
        if (idx + 1) % 50 == 0:
            df_jobs.to_excel(OUTPUT_EXCEL, index=False)
            print(f"Saved progress after processing {idx + 1} job descriptions.")
    
    # Final save after processing all job descriptions.
    df_jobs.to_excel(OUTPUT_EXCEL, index=False)
    print(f"Processing completed. Enriched Excel saved as: {OUTPUT_EXCEL}")
    print("========== All Done ==========")


Loading skill list...
Filtered skill list: 1233 skills retained.
Setting up NER pipeline (using GPU)...


Device set to use cuda:0


Reading input CSV file...
Extracting skills for each job description (this may take a while)...


Processing jobs:   3%|▎         | 10/356 [29:22<13:53:54, 144.61s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing jobs:  14%|█▍        | 50/356 [2:38:29<20:28:15, 240.84s/it]

Saved progress after processing 50 job descriptions.


Processing jobs:  28%|██▊       | 100/356 [5:35:57<10:10:26, 143.07s/it]

Saved progress after processing 100 job descriptions.


Processing jobs:  36%|███▌      | 127/356 [6:54:56<11:07:40, 174.94s/it]

In [6]:
ssasa

Unnamed: 0.1,Unnamed: 0,title,company,location,job_link,description,skills
0,Unnamed: 0,title,company,location,job_link,description,skills
1,1085,Assistant Director - Energy Innovation (Denver...,State of Colorado,"Denver, CO",https://www.linkedin.com/jobs/view/assistant-d...,Department Information\nOPEN ONLY TO CURRENT R...,"renewable energy, geothermal, sustainability, ..."
2,1086,Field Landman,Scout Energy Partners,"Ulysses, KS",https://www.linkedin.com/jobs/view/field-landm...,Duties & Responsibilities\n Landowner Relation...,ai
3,1087,"Energy Consultant - Dallas, TX",Suntria,"Dallas, TX",https://www.linkedin.com/jobs/view/energy-cons...,Suntria is searching for a passionate and know...,"renewable energy, energy efficiency, sustainab..."
4,1088,"Manager, Clean Energy Asset Management",Meta,United States,https://www.linkedin.com/jobs/view/manager-cle...,The Manager of Clean Energy Asset Management w...,"renewable energy, solar, wind, sustainability,..."
