# Job Skill Analysis Notebook

This notebook allows you to scrape job descriptions for specific roles and analyze the most common skills and keywords. 
It uses `scraper.py` from the JobSniper project.

**Prerequisites**:
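- `scraper.py` must be importable from this notebook, since it provides `scrape_job_boards`.
- Optional: customize the stopword list used by `clean_and_tokenize` if you want to filter a different set of words.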

In [None]:
# Install visualization libraries if missing
%pip install matplotlib seaborn wordcloud

In [None]:
from scraper import scrape_job_boards
import pandas as pd
import re
from collections import Counter
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Generic English and job-posting filler words excluded from the keyword counts.
# Built once at definition time instead of on every call; edit this set to
# change which words are filtered.
_STOPWORDS = frozenset({
    'and', 'or', 'the', 'a', 'an', 'in', 'to', 'of', 'for', 'with', 'on', 'at', 'by', 'from', 'as',
    'is', 'are', 'was', 'were', 'be', 'been', 'has', 'have', 'had', 'that', 'this', 'it', 'not',
    'we', 'you', 'your', 'our', 'will', 'can', 'may', 'should', 'would', 'if', 'but', 'so',
    'experience', 'years', 'work', 'job', 'role', 'team', 'skills', 'requirements', 'qualifications',
    'description', 'looking', 'hiring', 'opportunity', 'company', 'business', 'development', 'software',
    'engineer', 'engineering', 'developer', 'systems', 'solutions', 'services', 'technical', 'technology',
    'strong', 'knowledge', 'understanding', 'proficiency', 'ability', 'preferred', 'plus', 'degree',
    'bachelor', 'master', 'computer', 'science', 'related', 'field', 'working', 'environment', 'support',
    'design', 'create', 'build', 'maintain', 'using', 'based', 'across', 'within', 'other', 'new',
    'best', 'practices', 'full', 'time', 'part', 'contract', 'remote', 'location', 'us', 'application',
    'project', 'projects', 'ensure', 'help', 'need', 'seeking', 'join', 'candidates', 'must', 'responsibilities',
    'learning', 'data', 'analysis', 'analytics', 'management', 'product', 'production', 'code', 'coding',
    'platform', 'infrastructure', 'tools', 'technologies', 'needs', 'highly', 'various', 'excellent',
    'communication', 'written', 'verbal', 'collaborate', 'stakeholders', 'solve', 'complex', 'problems',
    'performance', 'quality', 'standards', 'process', 'processes', 'improve', 'improvement', 'grow',
    'growth', 'innovation', 'innovative', 'deliver', 'delivery', 'client', 'clients',
})

# Separator characters that are turned into spaces before splitting, so that
# e.g. "python/sql" yields two tokens.
_SEPARATOR_TABLE = str.maketrans('/,()', '    ')

# Punctuation trimmed from both ends of every token (never from the middle).
_STRIP_CHARS = '.,;:!?()[]{}"\''


def clean_and_tokenize(text):
    """Turn a job description into a list of lowercase keyword tokens.

    The text is lowercased, the separators ``/ , ( )`` are replaced by
    spaces, and the result is split on whitespace. Each token then has
    leading/trailing punctuation stripped; tokens that end up one character
    or shorter, or that appear in ``_STOPWORDS``, are discarded.

    Args:
        text: The description to tokenize. Anything that is not a ``str``
            (for example ``None`` or a NaN float) is treated as empty.

    Returns:
        The surviving tokens in their original order, duplicates included.
    """
    if not isinstance(text, str):
        return []

    raw_tokens = text.lower().translate(_SEPARATOR_TABLE).split()
    stripped = (tok.strip(_STRIP_CHARS) for tok in raw_tokens)

    # The length check also drops tokens that were pure punctuation.
    return [tok for tok in stripped if len(tok) > 1 and tok not in _STOPWORDS]

In [None]:
# Configuration
# Job titles to analyze; each title is scraped separately.
roles = [
    "Machine Learning Engineer",
    "Generative AI Engineer",
    "Data Scientist",
    "MLOps Engineer",
    "Software Engineer",
]

# Search parameters passed to scrape_job_boards for every role.
location = "Remote"
num_jobs = 25   # jobs requested per role
hours_old = 72  # presumably the maximum posting age in hours -- confirm in scraper.py

In [None]:
# Scrape every configured role and pool the non-null job descriptions into
# `all_descriptions` for the analysis cells below.
print("--- Starting Skill Analysis ---")
print(f"Roles: {roles}")
print(f"Jobs per role: {num_jobs}")
print("-------------------------------")

all_descriptions = []

for role in roles:
    print(f"Scraping {num_jobs} jobs for '{role}'...")
    try:
        # enable linkedin_fetch_description for better data
        df = scrape_job_boards(
            role=role,
            location=location,
            num_jobs=num_jobs,
            hours_old=hours_old,
            linkedin_fetch_description=True,
        )

        if not df.empty and 'description' in df.columns:
            descs = df['description'].dropna().tolist()
            print(f"  -> Found {len(df)} jobs. Collected {len(descs)} descriptions.")
            all_descriptions.extend(descs)
        else:
            print(f"  -> No jobs found for {role}.")

    except Exception as e:
        # Best effort: one failing role must not abort the whole run, so the
        # error is reported and the loop continues with the next role.
        print(f"  -> Error scraping {role}: {e}")

    # Be nice to APIs: pause after every attempt, including failed ones, so a
    # failed request is not immediately followed by another one.
    time.sleep(2)

In [None]:
# Count token frequencies over all collected descriptions and list the 30
# most frequent ones.
print(f"Total Descriptions collected: {len(all_descriptions)}")

if not all_descriptions:
    print("No data to analyze.")
else:
    # One Counter fed with every token of every description, in order.
    word_counts = Counter(
        token
        for description in all_descriptions
        for token in clean_and_tokenize(description)
    )

    # (word, count) pairs, most frequent first; read by the plotting cell.
    top_30 = word_counts.most_common(30)

    print("\n--- Top 30 Skills / Keywords ---")
    for rank, (word, count) in enumerate(top_30, start=1):
        print(f"{rank}. {word}: {count}")

In [None]:
# Visualization: horizontal bar chart of the most frequent keywords.
# `top_30` only exists when descriptions were collected, so it is tested
# second and the short-circuit keeps an empty run from raising NameError.
if all_descriptions and top_30:
    labels = [word for word, _ in top_30]
    frequencies = [count for _, count in top_30]
    chart_title = f'Top 30 Skills for {len(roles)} Roles (n={len(all_descriptions)} jobs)'

    plt.figure(figsize=(12, 8))
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue` -- confirm against the installed version before changing.
    sns.barplot(x=frequencies, y=labels, palette='viridis')
    plt.title(chart_title)
    plt.xlabel('Frequency')
    plt.ylabel('Skill / Keyword')
    plt.show()