In [None]:
import json
from datetime import datetime


def detect_domain(role):
    """Map a job title to a domain name using ROLE_DOMAINS.

    The title is lower-cased and every ROLE_DOMAINS key is tested as a plain
    substring, in the dict's insertion order. The domain of the first key
    found is returned; a title matching no key maps to "general".
    """
    lowered = role.lower()
    matches = (domain for key, domain in ROLE_DOMAINS.items() if key in lowered)
    return next(matches, "general")   # fallback


def filter_keywords_by_domain(role, raw_keywords):
    """Keep the keywords that belong to the role's domain, padded to six.

    The domain comes from detect_domain(role); its allowed keyword list is
    looked up in DOMAIN_KEYWORDS (falling back to the "general" list).
    Keywords from ``raw_keywords`` are kept, in their original spelling and
    order, when their lower-cased form is in the allowed list and has not
    been kept already. If fewer than six survive, the result is topped up
    with unused allowed keywords (title-cased) in list order. At most six
    keywords are returned.
    """
    allowed = DOMAIN_KEYWORDS.get(detect_domain(role), DOMAIN_KEYWORDS["general"])

    kept = []
    used = set()

    for keyword in raw_keywords:
        lowered = keyword.lower()
        if lowered in used or lowered not in allowed:
            continue
        used.add(lowered)
        kept.append(keyword)

    # Top up with domain defaults until six keywords are available.
    for candidate in allowed:
        if len(kept) >= 6:
            break
        if candidate not in used:
            used.add(candidate)
            kept.append(candidate.title())

    return kept[:6]






def load_resume(line_number, path="master_resumes.jsonl"):
    """Load the resume stored on one line of a JSON Lines file.

    Args:
        line_number: 1-based index of the line to read (1 is the first line).
        path: Path of the JSONL file.

    Returns:
        The object decoded from the selected line with ``json.loads``.

    Raises:
        IndexError: If ``line_number`` is smaller than 1 or the file has
            fewer than ``line_number`` lines.
        json.JSONDecodeError: If the selected line is not valid JSON.
    """
    if line_number < 1:
        # Previously this fell through to "File has fewer than 0 lines".
        raise IndexError(f"line_number must be >= 1, got {line_number}.")

    with open(path, "r", encoding="utf-8") as file:
        for current, line in enumerate(file, start=1):
            if current == line_number:
                return json.loads(line)

    raise IndexError(f"File has fewer than {line_number} lines.")



def parse_date(date_str):
    """Convert a resume date value into a ``datetime``.

    Args:
        date_str: Date value from the resume; it is converted with ``str``,
            stripped and lower-cased before matching.

    Returns:
        ``None`` for falsy input, for the placeholders "unknown", "n/a",
        "none" or a blank string, and for text matching none of the
        supported formats. ``datetime.now()`` for "present", "current",
        "now" or "today". Otherwise the ``datetime`` parsed with the first
        matching format of ``%Y-%m-%d``, ``%Y-%m``, ``%Y``.
    """
    if not date_str:
        return None

    cleaned = str(date_str).strip().lower()

    if cleaned in ("unknown", "n/a", "", "none"):
        return None

    if cleaned in ("present", "current", "now", "today"):
        return datetime.now()

    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next one.
            continue

    return None



def is_valid_tech_role(role):
    """Return True for a non-empty title that does not contain "advocate".

    The check is case-insensitive; empty or missing titles are rejected.
    """
    return bool(role) and "advocate" not in role.lower()



def normalize_token(tok):
    """Return ``tok`` as trimmed text, or None when nothing is left.

    The value is converted with ``str``, surrounding whitespace is removed,
    then leading/trailing spaces, dots and semicolons are removed.
    """
    if tok:
        trimmed = str(tok).strip().strip(' .;')
        if trimmed:
            return trimmed
    return None

def split_commas(tok):
    """Split a comma-separated string into its non-empty, stripped parts.

    Falsy input gives an empty list. Anything that is not a string
    containing a comma is returned unchanged inside a one-element list.
    """
    if not tok:
        return []
    if not (isinstance(tok, str) and "," in tok):
        return [tok]
    pieces = (piece.strip() for piece in tok.split(","))
    return [piece for piece in pieces if piece]


def extract_keywords(resume, limit=6):
    """Collect up to ``limit`` distinct technology keywords from a resume.

    Sources are read in this order: the "programming_languages",
    "frameworks", "databases" and "cloud" groups under
    ``resume["skills"]["technical"]`` (entries may be plain values or dicts
    with a "name" key), the "technologies" and "tools" of the first
    experience entry's "technical_environment", and every project's
    "technologies". Values are split on commas, normalized with
    normalize_token, and de-duplicated case-insensitively; the first
    spelling seen is kept.

    Args:
        resume: Resume dict; missing sections are treated as empty.
        limit: Maximum number of keywords to return.

    Returns:
        List of keywords in first-seen order, truncated to ``limit``.
    """
    seen = set()
    ordered = []

    def add_token(tok):
        # Register each comma-separated piece once, compared case-insensitively.
        if not tok:
            return

        for part in split_commas(tok):
            clean = normalize_token(part)
            if not clean:
                continue

            key = clean.lower()
            if key not in seen:
                seen.add(key)
                ordered.append(clean)

    skills = resume.get("skills", {}).get("technical", {})
    for group in ["programming_languages", "frameworks", "databases", "cloud"]:
        for item in skills.get(group, []):
            if isinstance(item, dict):
                add_token(item.get("name"))
            else:
                add_token(item)

    # Only the first experience entry contributes its technical environment.
    exps = resume.get("experience", [])
    if exps:
        exp0 = exps[0]
        tech_env = exp0.get("technical_environment", {})
        for group in ["technologies", "tools"]:
            for item in tech_env.get(group, []):
                add_token(item)

    for proj in resume.get("projects", []):
        for tech in proj.get("technologies", []):
            add_token(tech)

    return ordered[:limit]



def build_example_from_resume(resume):
    """Build one {"input", "output"} training example from a resume.

    The role is the title of the first experience entry ("Unknown Role"
    when the title is missing or null). Years of experience are derived
    from that entry's start/end dates (days / 365, rounded, at least 1);
    when either date cannot be parsed, the entry's "level" is used instead
    (junior -> 1, mid -> 3, senior -> 6, anything else -> 1). Keywords come
    from extract_keywords and are filtered with filter_keywords_by_domain.

    Args:
        resume: Resume dict as loaded from the JSONL file.

    Returns:
        ``{"input": str, "output": {"role", "experience", "keywords"}}``,
        or ``None`` when the resume is empty, has no experience entries, or
        its role is rejected by is_valid_tech_role.
    """
    if not resume or not resume.get("experience"):
        return None

    exp0 = resume["experience"][0]
    title = exp0.get("title", "Unknown Role")
    # A JSON null title is treated like a missing one instead of crashing.
    role = "Unknown Role" if title is None else title.strip()
    if not is_valid_tech_role(role):
        return None

    # `or` also covers an explicit null, which .get()'s default does not.
    dates = exp0.get("dates") or {}
    start_date = parse_date(dates.get("start"))
    end_date = parse_date(dates.get("end"))

    if start_date and end_date:
        days = (end_date - start_date).days
        years = max(1, round(days / 365))
    else:
        level = (exp0.get("level") or "").lower()
        if level == "junior":
            years = 1
        elif level == "mid":
            years = 3
        elif level == "senior":
            years = 6
        else:
            years = 1

    experience_str = f"{years} years"
    input_str = f"{role} with {years} years experience"

    # extract_keywords defaults to limit=6, so one call is enough.
    raw_keywords = extract_keywords(resume)
    clean_keywords = filter_keywords_by_domain(role, raw_keywords)

    output = {
        "role": role,
        "experience": experience_str,
        "keywords": clean_keywords,
    }

    return {"input": input_str, "output": output}


# Substring -> domain table used by detect_domain.
# detect_domain lower-cases the job title and returns the domain of the FIRST
# key (in the order written here) that occurs anywhere in the title, so the
# order of the entries is significant: specific keys must come before generic
# ones. Matching is by plain substring, so short keys such as "ai", "ml" or
# "qa" also match inside longer words.
ROLE_DOMAINS = {
    "android": "mobile",
    "ios": "mobile",
    "mobile": "mobile",

    # NOTE(review): "backend" maps to "web" although DOMAIN_KEYWORDS also has
    # a "backend" domain -- confirm this is intended.
    "frontend": "web",
    "backend": "web",
    "full stack": "web",
    "web": "web",
    "react": "web",
    "angular": "web",
    "vue": "web",
    "javascript": "web",
    "node": "web",

    "python": "backend",
    "java": "backend",
    "dotnet": "backend",
    "c#": "backend",
    ".net": "backend",

    # "database" never decides the result: any title containing it also
    # contains the earlier key "data", which maps to the same domain.
    "data": "data",
    "database": "data",
    "sql": "data",
    "analyst": "data",

    "ml": "ml",
    "machine learning": "ml",
    "deep learning": "ml",
    "ai": "ml",
    "nlp": "ml",

    "security": "security",
    "cyber": "security",
    "network": "security",

    "qa": "testing",
    "tester": "testing",
    "quality": "testing",

    "devops": "devops",
    "cloud": "cloud",

    # Generic catch-alls; kept last so the more specific keys above win.
    "architect": "backend",
    "engineer": "backend",
}



# Lower-case keywords accepted for each domain returned by detect_domain.
# filter_keywords_by_domain keeps only extracted keywords whose lower-cased
# form appears in the role's list and, when fewer than six remain, pads the
# result with unused entries taken from the START of the list (title-cased).
# List order therefore decides which keywords are used as filler.
# "general" is the list used when a domain has no entry of its own.
DOMAIN_KEYWORDS = {
    "web": [
        "javascript", "typescript", "react", "redux", "angular", "vue",
        "html", "css", "sass", "bootstrap",
        "node", "express", "django", "flask",
        "webpack", "git", "next.js", "nuxt", "rest", "graphql"
    ],

    "mobile": [
        "java", "kotlin", "android", "android studio", "gradle", "xml",
        "retrofit", "sqlite", "jetpack", "compose"
    ],

    "backend": [
        "python", "java", "c#", ".net", "golang", "node",
        "flask", "django", "spring", "postgresql", "mysql",
        "rest", "graphql", "docker"
    ],

    "data": [
        "sql", "mysql", "postgresql", "pandas", "numpy",
        "etl", "data pipelines", "tableau", "powerbi"
    ],

    "ml": [
        "tensorflow", "pytorch", "scikit", "keras",
        "nlp", "transformers", "opencv", "huggingface",
        "deep learning", "machine learning"
    ],

    "security": [
        "networking", "vlan", "acl", "acls", "ids", "ips",
        "firewall", "penetration testing", "pentest",
        "vpn", "nat", "subnetting", "supernetting", "wireshark"
    ],

    "testing": [
        "selenium", "pytest", "unittest",
        "cypress", "jenkins", "jmeter",
        "test automation", "qa", "ci/cd"
    ],

    "devops": [
        "docker", "kubernetes", "helm",
        "jenkins", "gitlab", "ci/cd",
        "terraform", "ansible", "prometheus", "grafana"
    ],

    "cloud": [
        "aws", "azure", "gcp",
        "lambda", "s3", "cloudwatch", "ec2"
    ],

    "general": [
        "git", "linux", "shell", "api", "rest",
        "scrum", "agile"
    ]
}

def get_raw_resume_info(resume):
    """Return the raw resume fields that the example builder reads.

    The result has the keys "role", "experience_raw", "skills_raw",
    "projects_raw" and "tech_env_raw", in that order. The role, date range
    and technical environment come from the first experience entry and are
    None when the resume has no experience entries.
    """
    experiences = resume.get("experience")
    if experiences:
        first = experiences[0]
        role = first.get("title", None)
        dates = first.get("dates", {})
        date_range = {"start": dates.get("start"), "end": dates.get("end")}
        tech_env = first.get("technical_environment", {})
    else:
        role = date_range = tech_env = None

    return {
        "role": role,
        "experience_raw": date_range,
        "skills_raw": resume.get("skills", {}),
        "projects_raw": resume.get("projects", []),
        "tech_env_raw": tech_env,
    }




if __name__ == "__main__":
    # 1-based line number of the resume to inspect.
    resume_line = 4782
    r = load_resume(resume_line)
    example = build_example_from_resume(r)
    if example:
        print("INPUT:", example["input"])
        print("OUTPUT:", json.dumps(example["output"], indent=2))
    else:
        # Report the line actually loaded (the message used to say "#42").
        print(f"Resume #{resume_line} skipped (non-tech or incomplete).")

    # Dump the raw fields the example was derived from, for comparison.
    raw_info = get_raw_resume_info(r)

    print(json.dumps(raw_info, indent=2))


INPUT: Solutions Architect with 6 years experience
OUTPUT: {
  "role": "Solutions Architect",
  "experience": "6 years",
  "keywords": [
    "Python",
    "Java",
    "C#",
    ".Net",
    "Golang",
    "Node"
  ]
}
{
  "role": "Solutions Architect",
  "experience_raw": {
    "start": "2018-04-03",
    "end": "2024-04-03"
  },
  "skills_raw": {
    "technical": {
      "programming_languages": [
        {
          "name": "C++",
          "level": "beginner"
        },
        {
          "name": "Python",
          "level": "intermediate"
        }
      ],
      "frameworks": [
        {
          "name": "Vue",
          "level": "beginner"
        },
        {
          "name": "React",
          "level": "intermediate"
        }
      ],
      "databases": [
        {
          "name": "MongoDB",
          "level": "intermediate"
        },
        {
          "name": "Redis",
          "level": "expert"
        }
      ],
      "cloud": [
        {
          "name": "Azure",
   

In [None]:
import json

# Collect the distinct, stripped titles of each resume's first experience
# entry, then print them in sorted order.
roles = set()

with open("master_resumes.jsonl", "r") as file:
    for line in file:
        if line.strip():
            resume = json.loads(line)
            experience = resume.get("experience", [])
            if experience:
                title = experience[0].get("title", "").strip()
                if title:
                    roles.add(title)

for r in sorted(roles):
    print(r)


Adjunct Faculty & Data Scientist
Advocate
Ai Engineer
Android Developer
Angular Developer
Automation Engineer
Backend Developer
Blockchain Developer
Business Analyst
Cloud Engineer
Cloud Operations Architect (DevOps)
Computer Vision Engineer
Cybersecurity Engineer
Data Engineer
Data Science Consultant
Data Scientist
Database Administrator
Database Engineer
Deep Learning Engineer
DevOps Engineer
Devops Engineer
Electrical Engineer
Embedded Systems Engineer
Flutter Developer
Frontend Developer
Full Stack Developer
Information Security Analyst
Infrastructure Engineer
Ios Developer
Java Developer
Java Web Developer
Javascript Developer
Jr. Java Developer
Kubernetes Engineer
Machine Learning Engineer
Machine Learning Engineer Intern
Mlops Engineer
Mobile Developer
Network Security Engineer
Network and Security Engineer
Nlp Engineer
Node.Js Developer
Nosql Developer
Operations Manager
Penetration Tester
Platform Engineer
Project Manager
Python API Developer
Python Developer
Python Developer/

In [None]:
import json

def load_all_resumes(path):
    """Load every parseable resume from a JSON Lines file.

    Blank lines and lines that are not valid JSON are skipped, so the load
    is best-effort.

    Args:
        path: Path of the JSONL file.

    Returns:
        List of decoded objects, in file order.
    """
    resumes = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            try:
                resumes.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed lines only; other errors still surface.
                continue
    return resumes


# Load the whole dataset and report how many resumes were parsed.
resumes = load_all_resumes("master_resumes.jsonl")

print(f"Total resumes loaded: {len(resumes)}")


Total resumes loaded: 4817


In [None]:
import json
from datetime import datetime

def load_resume(line_number, path="master_resumes.jsonl"):
    """Load the resume stored on one line of a JSON Lines file.

    Args:
        line_number: 1-based index of the line to read (1 is the first line).
        path: Path of the JSONL file.

    Returns:
        The object decoded from the selected line with ``json.loads``.

    Raises:
        IndexError: If ``line_number`` is smaller than 1 or the file has
            fewer than ``line_number`` lines.
        json.JSONDecodeError: If the selected line is not valid JSON.
    """
    if line_number < 1:
        # Previously this fell through to "File has fewer than 0 lines".
        raise IndexError(f"line_number must be >= 1, got {line_number}.")

    with open(path, "r", encoding="utf-8") as file:
        for current, line in enumerate(file, start=1):
            if current == line_number:
                return json.loads(line)

    raise IndexError(f"File has fewer than {line_number} lines.")



def parse_date(date_str):
    """Convert a resume date value into a ``datetime``.

    Args:
        date_str: Date value from the resume; it is converted with ``str``,
            stripped and lower-cased before matching.

    Returns:
        ``None`` for falsy input, for the placeholders "unknown", "n/a",
        "none" or a blank string, and for text matching none of the
        supported formats. ``datetime.now()`` for "present", "current",
        "now" or "today". Otherwise the ``datetime`` parsed with the first
        matching format of ``%Y-%m-%d``, ``%Y-%m``, ``%Y``.
    """
    if not date_str:
        return None

    cleaned = str(date_str).strip().lower()

    if cleaned in ("unknown", "n/a", "", "none"):
        return None

    if cleaned in ("present", "current", "now", "today"):
        return datetime.now()

    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            # Only a format mismatch is expected here; try the next format.
            continue

    return None



def is_valid_tech_role(role):
    """Accept any non-empty title unless it contains "advocate" (any case)."""
    if role and "advocate" not in role.lower():
        return True
    return False



def normalize_token(tok):
    """Trim a token and return it, or None if it is empty afterwards.

    Surrounding whitespace is removed first, then leading/trailing spaces,
    dots and semicolons.
    """
    if not tok:
        return None
    stripped = str(tok).strip()
    cleaned = stripped.strip(' .;')
    return cleaned or None

def split_commas(tok):
    """Break a comma-separated string into stripped, non-empty parts.

    Falsy input yields ``[]``; a value that is not a string with a comma is
    wrapped unchanged in a single-element list.
    """
    if not tok:
        return []
    if isinstance(tok, str) and "," in tok:
        stripped = map(str.strip, tok.split(","))
        return [piece for piece in stripped if piece]
    return [tok]



def extract_keywords(resume):
    """Collect every distinct technology keyword mentioned in a resume.

    Sources, in order: the "programming_languages", "frameworks",
    "databases" and "cloud" groups under ``resume["skills"]["technical"]``
    (plain values or dicts with a "name" key), the "technologies" and
    "tools" of every experience entry's "technical_environment", and every
    project's "technologies". Values are split on commas, normalized with
    normalize_token and de-duplicated case-insensitively, keeping the
    first spelling seen.
    """
    def raw_tokens():
        # Yield the unprocessed values from each source, in reading order.
        technical = resume.get("skills", {}).get("technical", {})
        for group in ("programming_languages", "frameworks", "databases", "cloud"):
            for entry in technical.get(group, []):
                yield entry.get("name") if isinstance(entry, dict) else entry

        for job in resume.get("experience", []):
            environment = job.get("technical_environment", {})
            for group in ("technologies", "tools"):
                yield from environment.get(group, [])

        for project in resume.get("projects", []):
            yield from project.get("technologies", [])

    keywords = []
    seen_keys = set()
    for token in raw_tokens():
        if not token:
            continue
        for part in split_commas(token):
            clean = normalize_token(part)
            if clean and clean.lower() not in seen_keys:
                seen_keys.add(clean.lower())
                keywords.append(clean)

    return keywords





def build_example_from_resume(resume):
    """Build one {"input", "output"} training example from a resume.

    The role is the title of the first experience entry ("Unknown Role"
    when the title is missing or null). Years of experience are derived
    from that entry's start/end dates (days / 365, rounded, at least 1);
    when either date cannot be parsed, the entry's "level" is used instead
    (junior -> 1, mid -> 3, senior -> 6, anything else -> 1). Keywords are
    everything returned by extract_keywords.

    Args:
        resume: Resume dict as loaded from the JSONL file.

    Returns:
        ``{"input": str, "output": {"role", "experience", "keywords"}}``,
        or ``None`` when the resume is empty, has no experience entries, or
        its role is rejected by is_valid_tech_role.
    """
    if not resume or not resume.get("experience"):
        return None

    exp0 = resume["experience"][0]
    title = exp0.get("title", "Unknown Role")
    # A JSON null title is treated like a missing one instead of crashing.
    role = "Unknown Role" if title is None else title.strip()
    if not is_valid_tech_role(role):
        return None

    # `or` also covers an explicit null, which .get()'s default does not.
    dates = exp0.get("dates") or {}
    start_date = parse_date(dates.get("start"))
    end_date = parse_date(dates.get("end"))

    if start_date and end_date:
        days = (end_date - start_date).days
        years = max(1, round(days / 365))
    else:
        level = (exp0.get("level") or "").lower()
        if level == "junior":
            years = 1
        elif level == "mid":
            years = 3
        elif level == "senior":
            years = 6
        else:
            years = 1

    experience_str = f"{years} years"
    input_str = f"{role} with {years} years experience"

    keywords = extract_keywords(resume)

    output = {
        "role": role,
        "experience": experience_str,
        "keywords": keywords
    }

    return {"input": input_str, "output": output}



if __name__ == "__main__":
    # 1-based: 1 => first line
    resume_line = 1569
    r = load_resume(resume_line)
    example = build_example_from_resume(r)
    if example:
        print("INPUT:", example["input"])
        print("OUTPUT:", json.dumps(example["output"], indent=2))
    else:
        # Report the line actually loaded (the message used to say "#42").
        print(f"Resume #{resume_line} skipped (non-tech or incomplete).")


INPUT: Web Developer with 4 years experience
OUTPUT: {
  "role": "Web Developer",
  "experience": "4 years",
  "keywords": [
    "Ruby",
    "JavaScript",
    "Django",
    "MongoDB",
    "Redis",
    "Google Cloud",
    "Kubernetes",
    "Git",
    "Jenkins",
    "Python",
    "CI/CD"
  ]
}


In [None]:
import json
from datetime import datetime


def load_resume(line_number, path="master_resumes.jsonl"):
    """Load the resume stored on one line of a JSON Lines file.

    Args:
        line_number: 1-based index of the line to read (1 is the first line).
        path: Path of the JSONL file.

    Returns:
        The object decoded from the selected line with ``json.loads``.

    Raises:
        IndexError: If ``line_number`` is smaller than 1 or the file has
            fewer than ``line_number`` lines.
        json.JSONDecodeError: If the selected line is not valid JSON.
    """
    if line_number < 1:
        # Previously this fell through to "File has fewer than 0 lines".
        raise IndexError(f"line_number must be >= 1, got {line_number}.")

    with open(path, "r", encoding="utf-8") as file:
        for current, line in enumerate(file, start=1):
            if current == line_number:
                return json.loads(line)

    raise IndexError(f"File has fewer than {line_number} lines.")



def load_all_resumes(path="master_resumes.jsonl"):
    """Load every parseable resume from a JSON Lines file.

    Blank lines and lines that are not valid JSON are skipped, so the load
    is best-effort.

    Args:
        path: Path of the JSONL file.

    Returns:
        List of decoded objects, in file order.
    """
    resumes = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue
            try:
                resumes.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed lines only; a bare except would also hide
                # KeyboardInterrupt and programming errors.
                continue
    return resumes



def parse_date(date_str):
    """Convert a resume date value into a ``datetime``.

    Args:
        date_str: Date value from the resume; it is converted with ``str``,
            stripped and lower-cased before matching.

    Returns:
        ``None`` for falsy input, for the placeholders "unknown", "n/a",
        "none" or a blank string, and for text matching none of the
        supported formats. ``datetime.now()`` for "present", "current",
        "now" or "today". Otherwise the ``datetime`` parsed with the first
        matching format of ``%Y-%m-%d``, ``%Y-%m``, ``%Y``.
    """
    if not date_str:
        return None
    cleaned = str(date_str).strip().lower()
    if cleaned in ("unknown", "n/a", "", "none"):
        return None
    if cleaned in ("present", "current", "now", "today"):
        return datetime.now()
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            # A bare except would also swallow KeyboardInterrupt; strptime
            # reports a format mismatch with ValueError.
            continue
    return None



def is_valid_tech_role(role):
    """Tell whether a title is usable: non-empty and without "advocate"."""
    is_empty = not role
    return not (is_empty or "advocate" in role.lower())


def normalize_token(tok):
    """Return the token as trimmed text, or None when it ends up empty.

    Whitespace is stripped first, then leading/trailing spaces, dots and
    semicolons.
    """
    cleaned = str(tok).strip().strip(" .;") if tok else ""
    return cleaned if cleaned else None


def split_commas(tok):
    """Split on commas, dropping blank pieces and surrounding whitespace.

    Falsy input gives ``[]``; any value that is not a string containing a
    comma is returned as-is in a one-element list.
    """
    if not tok:
        return []
    has_comma = isinstance(tok, str) and "," in tok
    if not has_comma:
        return [tok]
    parts = []
    for chunk in tok.split(","):
        chunk = chunk.strip()
        if chunk:
            parts.append(chunk)
    return parts



def extract_keywords(resume):
    """Collect every distinct technology keyword mentioned in a resume.

    Reads the technical skill groups, the "technologies"/"tools" of every
    experience entry's "technical_environment", and each project's
    "technologies", in that order. Values are split on commas, normalized
    with normalize_token and de-duplicated case-insensitively; the first
    spelling seen is the one returned.
    """
    # lower-cased keyword -> first spelling seen (dicts keep insertion order)
    found = {}

    def add(tok):
        for part in (split_commas(tok) if tok else ()):
            clean = normalize_token(part)
            if clean:
                found.setdefault(clean.lower(), clean)

    technical = resume.get("skills", {}).get("technical", {})
    for group in ("programming_languages", "frameworks", "databases", "cloud"):
        for entry in technical.get(group, []):
            add(entry.get("name") if isinstance(entry, dict) else entry)

    for job in resume.get("experience", []):
        environment = job.get("technical_environment", {})
        for group in ("technologies", "tools"):
            for entry in environment.get(group, []):
                add(entry)

    for project in resume.get("projects", []):
        for entry in project.get("technologies", []):
            add(entry)

    return list(found.values())



def build_example_from_resume(resume):
    """Build one {"input", "output"} training example from a resume.

    The role is the stripped title of the first experience entry. Years of
    experience are derived from that entry's start/end dates (days / 365,
    rounded, at least 1); when either date cannot be parsed, the entry's
    "level" is used instead (junior -> 1, mid -> 3, senior -> 6, anything
    else -> 1). Keywords are everything returned by extract_keywords.

    Args:
        resume: Resume dict as loaded from the JSONL file.

    Returns:
        ``{"input": str, "output": {"role", "experience", "keywords"}}``,
        or ``None`` when the resume is empty, has no experience entries, or
        its role is rejected by is_valid_tech_role (which includes a
        missing, null or blank title).
    """
    if not resume or not resume.get("experience"):
        return None

    exp0 = resume["experience"][0]
    # `or` also covers an explicit JSON null, which .get()'s default does not.
    role = (exp0.get("title") or "").strip()

    if not is_valid_tech_role(role):
        return None

    date_info = exp0.get("dates") or {}
    start = parse_date(date_info.get("start"))
    end = parse_date(date_info.get("end"))

    if start and end:
        delta_days = (end - start).days
        years = max(1, round(delta_days / 365))
    else:
        level = (exp0.get("level") or "").lower()
        years = {"junior": 1, "mid": 3, "senior": 6}.get(level, 1)

    experience_str = f"{years} years"
    input_str = f"{role} with {years} years experience"

    keywords = extract_keywords(resume)

    return {
        "input": input_str,
        "output": {
            "role": role,
            "experience": experience_str,
            "keywords": keywords
        }
    }



def normalize_role(role):
    """Lower-case a role title and collapse runs of spaces to one space.

    Surrounding whitespace is stripped first. Only the space character is
    collapsed; other interior whitespace is left untouched.
    """
    words = role.strip().lower().split(" ")
    # Consecutive spaces produce empty pieces; dropping them collapses the run.
    return " ".join(word for word in words if word)


def extract_years(exp_str):
    """Return the first all-digit whitespace-separated token as an int.

    Returns None when the string contains no such token.
    """
    numbers = (int(token) for token in exp_str.split() if token.isdigit())
    return next(numbers, None)


def make_key(example):
    """Build the (normalized role, years) key used to group examples."""
    output = example["output"]
    return (normalize_role(output["role"]), extract_years(output["experience"]))


def merge_keywords(list1, list2):
    """Concatenate two keyword lists, dropping case-insensitive duplicates.

    Items are compared after stripping and lower-casing; the first
    occurrence is kept with its original spelling and position.
    """
    # dicts preserve insertion order, so the values come out in first-seen order
    unique = {}
    for keyword in list1 + list2:
        unique.setdefault(keyword.strip().lower(), keyword)
    return list(unique.values())


def dedupe_and_merge(examples):
    """Merge examples that share the same make_key() value.

    The first example seen for a key is kept (and updated in place); the
    keywords of later examples with that key are merged into it with
    merge_keywords. Returns the kept examples in first-seen order.
    """
    merged = {}
    for example in examples:
        key = make_key(example)
        kept = merged.get(key)
        if kept is None:
            merged[key] = example
            continue
        kept_output = kept["output"]
        kept_output["keywords"] = merge_keywords(
            kept_output["keywords"], example["output"]["keywords"]
        )
    return list(merged.values())



if __name__ == "__main__":
    resumes = load_all_resumes("master_resumes.jsonl")

    # Resumes that cannot be turned into an example come back falsy; drop them.
    all_examples = [ex for ex in map(build_example_from_resume, resumes) if ex]

    final_dataset = dedupe_and_merge(all_examples)

    # One JSON object per line (JSON Lines).
    with open("final_clean_dataset.jsonl", "w") as f:
        f.writelines(json.dumps(item) + "\n" for item in final_dataset)

    print("DONE — final_clean_dataset.jsonl created successfully!")


DONE — final_clean_dataset.jsonl created successfully!


In [None]:
import json

def extract_all_roles(path="final_clean_dataset.jsonl"):
    """Return the sorted, distinct roles found in a dataset file.

    Each line is decoded as JSON and its ``["output"]["role"]`` value is
    stripped and lower-cased. Lines that are not valid JSON, or that lack
    a string role at that location, are skipped.

    Args:
        path: Path of the JSONL dataset file.

    Returns:
        Sorted list of unique lower-cased role strings.
    """
    roles = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                role = obj["output"]["role"].strip().lower()
            except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
                # Malformed line or unexpected shape: skip this record only.
                continue
            roles.add(role)
    return sorted(roles)

# Print every distinct role in the cleaned dataset, one per line
# (extract_all_roles returns them lower-cased and sorted).
roles = extract_all_roles("final_clean_dataset.jsonl")

for r in roles:
    print(r)


adjunct faculty & data scientist
ai engineer
android developer
angular developer
automation engineer
backend developer
blockchain developer
cloud engineer
cloud operations architect (devops)
computer vision engineer
cybersecurity engineer
data engineer
data science consultant
data scientist
database administrator
database engineer
deep learning engineer
devops engineer
electrical engineer
embedded systems engineer
flutter developer
frontend developer
full stack developer
information security analyst
infrastructure engineer
ios developer
java developer
java web developer
javascript developer
jr. java developer
kubernetes engineer
machine learning engineer
machine learning engineer intern
mlops engineer
mobile developer
network and security engineer
network security engineer
nlp engineer
node.js developer
nosql developer
operations manager
penetration tester
platform engineer
project manager
python api developer
python developer
python developer/analyst
python restful api developer
qa en

In [None]:
import json

MASTER_SKILLS = {
    "adjunct faculty & data scientist": {
        "junior": ["python", "pandas", "matplotlib", "sql"],
        "mid": ["scikit-learn", "feature engineering", "numpy", "data preprocessing"],
        "senior": ["machine learning pipelines", "model deployment", "data warehousing", "deep learning"]
    },

    "ai engineer": {
        "junior": ["python", "numpy", "tensorflow basics", "data preprocessing"],
        "mid": ["pytorch", "model optimization", "opencv", "transformers"],
        "senior": ["distributed training", "model quantization", "ml pipelines", "reinforcement learning"]
    },

    "android developer": {
        "junior": ["kotlin", "android studio", "xml layouts", "retrofit"],
        "mid": ["room database", "coroutines", "jetpack components", "firebase"],
        "senior": ["android architecture components", "jetpack compose", "modularization", "performance tuning"]
    },

    "angular developer": {
        "junior": ["angular basics", "typescript", "html/css", "rxjs"],
        "mid": ["angular routing", "ngrx", "rest integration", "form builder"],
        "senior": ["state management", "lazy loading", "architecture patterns", "micro frontends"]
    },

    "automation engineer": {
        "junior": ["python", "bash scripting", "webdriver", "selenium basics"],
        "mid": ["jenkins pipelines", "api automation", "pytest", "docker"],
        "senior": ["automation frameworks", "infrastructure automation", "kubernetes", "ci pipeline design"]
    },

    "backend developer": {
        "junior": ["python", "django basics", "sql", "rest apis"],
        "mid": ["postgresql", "redis", "api authentication", "fastapi"],
        "senior": ["system design", "scalability patterns", "microservices", "docker orchestration"]
    },

    "blockchain developer": {
        "junior": ["solidity", "ethereum basics", "web3.js", "smart contracts"],
        "mid": ["nft standards", "truffle", "ganache", "contract testing"],
        "senior": ["layer 2 scaling", "tokenomics", "protocol design", "security auditing"]
    },

    "cloud engineer": {
        "junior": ["aws ec2", "s3", "linux", "iam basics"],
        "mid": ["lambda", "cloudformation", "load balancers", "vpc networking"],
        "senior": ["multi-cloud design", "infrastructure automation", "kubernetes clusters", "cost optimization"]
    },

    "cloud operations architect (devops)": {
        "junior": ["docker", "linux", "git", "basic ci/cd"],
        "mid": ["kubernetes", "jenkins", "aws services", "terraform"],
        "senior": ["infrastructure architecture", "scalable pipelines", "multi-region deployments", "monitoring systems"]
    },

    "computer vision engineer": {
        "junior": ["opencv", "numpy", "python", "image preprocessing"],
        "mid": ["cnn models", "pytorch", "augmentation pipelines", "segmentation"],
        "senior": ["large vision models", "3d vision", "real-time inference", "edge deployment"]
    },

    "cybersecurity engineer": {
        "junior": ["wireshark", "linux security", "network scanning", "firewall basics"],
        "mid": ["ids/ips", "penetration testing", "log analysis", "threat modeling"],
        "senior": ["siem systems", "zero trust", "incident response", "security automation"]
    },

    "data engineer": {
        "junior": ["python", "sql", "pandas", "etl basics"],
        "mid": ["airflow", "spark", "bigquery", "data modeling"],
        "senior": ["distributed systems", "data lake design", "pipeline optimization", "cloud data platforms"]
    },

    "data science consultant": {
        "junior": ["python", "numpy", "eda", "sql"],
        "mid": ["feature engineering", "model selection", "pandas", "time series"],
        "senior": ["ml deployment", "business ml strategy", "optimization models", "large datasets"]
    },

    "data scientist": {
        "junior": ["python", "pandas", "scikit-learn", "data cleaning"],
        "mid": ["ml algorithms", "model tuning", "feature importance", "xgboost"],
        "senior": ["ml architecture", "deep learning", "pipeline automation", "generative ai"]
    },

    "database administrator": {
        "junior": ["mysql", "backup procedures", "sql queries", "indexes"],
        "mid": ["postgresql", "performance tuning", "replication", "monitoring tools"],
        "senior": ["sharding", "db security", "disaster recovery", "high availability"]
    },

    "database engineer": {
        "junior": ["sql", "mysql basics", "schema design", "stored procedures"],
        "mid": ["postgresql", "query optimization", "etl pipeline", "indexing strategies"],
        "senior": ["distributed databases", "replication design", "data modeling", "scalable storage"]
    },

    "deep learning engineer": {
        "junior": ["tensorflow", "numpy", "cnn basics", "python"],
        "mid": ["pytorch", "transformers", "training optimization", "rnn models"],
        "senior": ["distributed training", "large model fine-tuning", "mlops integration", "model optimization"]
    },

    "devops engineer": {
        "junior": ["docker", "linux", "git", "basic ci/cd"],
        "mid": ["kubernetes", "aws ec2", "jenkins", "terraform"],
        "senior": ["scalable ci pipelines", "infrastructure design", "multi-cloud orchestration", "monitoring setup"]
    },

    "electrical engineer": {
        "junior": ["matlab", "c programming", "circuit design", "pcb basics"],
        "mid": ["fpga", "embedded c", "signal processing", "simulation tools"],
        "senior": ["system integration", "hardware optimization", "power electronics", "embedded architecture"]
    },

    "embedded systems engineer": {
        "junior": ["c", "embedded c", "microcontrollers", "uart/spi/i2c"],
        "mid": ["rtos", "firmware debugging", "bootloaders", "embedded linux"],
        "senior": ["system architecture", "board bring-up", "performance tuning", "hardware/software integration"]
    },

    "flutter developer": {
        "junior": ["dart", "flutter widgets", "http client", "stateful widgets"],
        "mid": ["provider", "bloc", "api integration", "firebase"],
        "senior": ["custom renderers", "app architecture", "isolate optimization", "advanced state management"]
    },

    "frontend developer": {
        "junior": ["html", "css", "javascript", "basic react"],
        "mid": ["redux", "typescript", "api integration", "component architecture"],
        "senior": ["frontend optimization", "micro frontends", "scalable design systems", "performance profiling"]
    },

    "full stack developer": {
        "junior": ["javascript", "node.js", "html/css", "sql"],
        "mid": ["express", "react", "rest apis", "postgresql"],
        "senior": ["system design", "microservices", "cloud deployments", "scalable backend"]
    },

    "information security analyst": {
        "junior": ["network basics", "firewall rules", "vulnerability scanning", "log analysis"],
        "mid": ["ids/ips", "malware analysis", "incident triage", "forensics tools"],
        "senior": ["security architecture", "threat hunting", "siem management", "zero trust design"]
    },

    "infrastructure engineer": {
        "junior": ["linux", "bash scripting", "network basics", "docker"],
        "mid": ["kubernetes", "load balancers", "terraform", "monitoring tools"],
        "senior": ["infrastructure design", "multi-region networking", "system scalability", "cloud provisioning"]
    },

    "ios developer": {
        "junior": ["swift", "xcode", "uikit", "storyboards"],
        "mid": ["swiftui", "combine", "core data", "api integration"],
        "senior": ["modular architecture", "performance profiling", "concurrency", "framework development"]
    },

    "java developer": {
        "junior": ["java", "spring basics", "hibernate", "sql"],
        "mid": ["spring boot", "rest apis", "junit", "kafka"],
        "senior": ["microservices", "distributed systems", "system design", "cloud deployment"]
    },

    "java web developer": {
        "junior": ["java", "servlets", "jsp", "mysql"],
        "mid": ["spring mvc", "api development", "tomcat", "rest"],
        "senior": ["spring boot microservices", "distributed design", "cloud packaging", "performance optimization"]
    },

    "javascript developer": {
        "junior": ["javascript", "html/css", "dom manipulation", "fetch api"],
        "mid": ["react", "typescript", "webpack", "api integration"],
        "senior": ["frontend architecture", "performance tuning", "micro frontends", "state management"]
    },

    "jr. java developer": {
        "junior": ["java", "junit", "jdbc", "basic spring"],
        "mid": ["spring boot", "rest apis", "postgresql", "maven"],
        "senior": ["microservices", "system design", "distributed architecture", "cloud deployment"]
    },

    "kubernetes engineer": {
        "junior": ["docker", "kubernetes basics", "pods/services", "yaml configs"],
        "mid": ["helm charts", "ingress", "k8s monitoring", "statefulsets"],
        "senior": ["cluster architecture", "autoscaling", "service mesh", "advanced networking"]
    },

    "machine learning engineer": {
        "junior": ["python", "numpy", "scikit-learn", "data preprocessing"],
        "mid": ["pytorch", "tensorflow", "feature engineering", "ml pipelines"],
        "senior": ["distributed training", "model optimization", "mlops", "large model tuning"]
    },

    "machine learning engineer intern": {
        "junior": ["python", "numpy", "pandas", "basic ml models"],
        "mid": ["data cleaning", "feature engineering", "scikit-learn", "model evaluation"],
        "senior": ["advanced ml workflows", "deployment basics", "pytorch", "tensorflow"]
    },

    "mlops engineer": {
        "junior": ["docker", "python", "linux", "git"],
        "mid": ["airflow", "kubernetes", "mlflow", "jenkins"],
        "senior": ["ml pipelines", "distributed training", "cloud ml platforms", "model monitoring"]
    },

    "mobile developer": {
        "junior": ["kotlin", "swift", "rest apis", "ui layouts"],
        "mid": ["flutter", "react native", "local databases", "firebase"],
        "senior": ["multi-platform optimization", "app architecture", "performance profiling", "security hardening"]
    },

    "network and security engineer": {
        "junior": ["subnetting", "firewalls", "routing basics", "linux networking"],
        "mid": ["vpn", "ids/ips", "fortinet", "packet inspection"],
        "senior": ["network architecture", "zero trust", "high availability design", "cloud networking"]
    },

    "network security engineer": {
        "junior": ["vlan", "access lists", "tcp/ip basics", "wireshark"],
        "mid": ["firewall rules", "ids/ips", "vpn configuration", "routing protocols"],
        "senior": ["network hardening", "siem integration", "advanced firewalling", "security architecture"]
    },

    "nlp engineer": {
        "junior": ["python", "nltk", "regex", "text preprocessing"],
        "mid": ["transformers", "huggingface", "tokenization", "embeddings"],
        "senior": ["llm fine-tuning", "retrieval pipelines", "model optimization", "deployment systems"]
    },

    "node.js developer": {
        "junior": ["node.js", "express basics", "javascript", "mongo basics"],
        "mid": ["jwt auth", "redis caching", "rest apis", "typescript"],
        "senior": ["microservices", "message queues", "scalable server design", "cloud packaging"]
    },

    "nosql developer": {
        "junior": ["mongodb", "redis basics", "document modeling", "queries"],
        "mid": ["replication", "sharding basics", "cassandra", "indexing"],
        "senior": ["distributed databases", "performance tuning", "cluster design", "storage optimization"]
    },

    "operations manager": {
        "junior": ["excel", "sql basics", "dashboards", "report automation"],
        "mid": ["data workflows", "api integrations", "python scripts", "etl basics"],
        "senior": ["system automation", "infrastructure monitoring", "cloud tools", "process optimization"]
    },

    "penetration tester": {
        "junior": ["nmap", "burpsuite basics", "linux", "recon tools"],
        "mid": ["metasploit", "web exploitation", "network attacks", "password auditing"],
        "senior": ["red teaming", "post exploitation", "exploit writing", "advanced security testing"]
    },

    "platform engineer": {
        "junior": ["docker", "linux", "github actions", "rest apis"],
        "mid": ["kubernetes", "terraform", "monitoring tools", "network configs"],
        "senior": ["platform architecture", "scalable infra", "multi-cloud design", "advanced automation"]
    },

    "project manager": {
        "junior": ["jira setup", "sql basics", "api understanding", "excel"],
        "mid": ["kpi tracking", "workflow automation", "integration tools", "dashboards"],
        "senior": ["system oversight", "architecture understanding", "deployment planning", "release pipelines"]
    },

    "python api developer": {
        "junior": ["python", "flask basics", "rest apis", "json handling"],
        "mid": ["fastapi", "api authentication", "postgresql", "redis"],
        "senior": ["microservices", "scalable apis", "cloud deployments", "performance tuning"]
    },

    "python developer": {
        "junior": ["python", "django basics", "sqlite", "rest integration"],
        "mid": ["django orm", "asgi", "postgresql", "celery"],
        "senior": ["distributed tasks", "cloud packaging", "scalable backend", "system design"]
    },

    "python developer/analyst": {
        "junior": ["python", "pandas", "matplotlib", "basic sql"],
        "mid": ["etl scripts", "api consumption", "data modeling", "numpy"],
        "senior": ["pipeline automation", "cloud analytics", "large dataset handling", "ml integration"]
    },

    "python restful api developer": {
        "junior": ["python", "flask", "json serialization", "sqlite"],
        "mid": ["fastapi", "api routing", "postgresql", "jwt auth"],
        "senior": ["microservices", "load balancing", "cloud deployments", "api optimization"]
    },

    "qa engineer": {
        "junior": ["selenium", "webdriver", "test cases", "pytest basics"],
        "mid": ["api testing", "automation frameworks", "jenkins", "postman"],
        "senior": ["performance testing", "test architecture", "load testing", "ci automation"]
    },

    "react developer": {
        "junior": ["react basics", "javascript", "html/css", "axios"],
        "mid": ["redux", "component architecture", "typescript", "api integration"],
        "senior": ["scalable frontend", "performance tuning", "micro frontends", "state optimization"]
    },

    "react native developer": {
        "junior": ["react native basics", "javascript", "components", "api usage"],
        "mid": ["navigation", "state management", "typescript", "local storage"],
        "senior": ["native modules", "performance tuning", "app architecture", "multi-platform support"]
    },

    "sap technical architect": {
        "junior": ["abap basics", "sap gui", "sql", "report creation"],
        "mid": ["sap hana", "data modeling", "integration", "bapi"],
        "senior": ["sap landscape design", "performance tuning", "cloud integration", "architecture planning"]
    },

    "security engineer": {
        "junior": ["linux", "tcp/ip", "firewalls", "wireshark"],
        "mid": ["ids/ips", "threat detection", "log analysis", "network monitoring"],
        "senior": ["security design", "siem systems", "incident response", "zero trust"]
    },

    "senior business analyst - rpa": {
        "junior": ["excel", "sql basics", "process mapping", "automation basics"],
        "mid": ["uipath", "api consumption", "workflow rules", "bot design"],
        "senior": ["automation architecture", "rpa scaling", "cloud rpa", "advanced workflows"]
    },

    "site reliability engineer": {
        "junior": ["linux", "docker", "basic monitoring", "bash"],
        "mid": ["kubernetes", "prometheus", "grafana", "ci/cd"],
        "senior": ["scalable systems", "incident automation", "observability design", "chaos testing"]
    },

    "software engineer": {
        "junior": ["python", "rest apis", "sql", "javascript"],
        "mid": ["react", "django", "node.js", "postgresql"],
        "senior": ["system design", "microservices", "cloud deployments", "optimization"]
    },

    "software testing & automation engineer": {
        "junior": ["selenium", "python", "test cases", "webdriver"],
        "mid": ["api automation", "pytest", "jenkins", "postman"],
        "senior": ["framework design", "performance testing", "ci automation", "scalable tests"]
    },

    "solutions architect": {
        "junior": ["api basics", "databases", "python", "cloud fundamentals"],
        "mid": ["system integration", "distributed apps", "rest apis", "deployment"],
        "senior": ["system design", "microservices", "cloud architecture", "scalability planning"]
    },

    "sql developer": {
        "junior": ["mysql", "joins", "indexes", "queries"],
        "mid": ["postgresql", "procedures", "optimization", "data modeling"],
        "senior": ["distributed sql", "query tuning", "warehouse design", "pipeline optimization"]
    },

    "systems engineer": {
        "junior": ["linux", "bash", "network basics", "docker"],
        "mid": ["kubernetes", "vmware", "monitoring tools", "load balancers"],
        "senior": ["system design", "cloud infra", "scalable architectures", "security hardening"]
    },

    "technical architect": {
        "junior": ["basic design patterns", "api concepts", "databases", "python"],
        "mid": ["system design", "microservices", "api architecture", "cloud basics"],
        "senior": ["enterprise architecture", "cloud systems", "scalable design", "integration design"]
    },

    "vue developer": {
        "junior": ["vue basics", "javascript", "html/css", "vuex"],
        "mid": ["component design", "routing", "api integration", "vuetify"],
        "senior": ["scalable apps", "performance tuning", "micro frontends", "architecture patterns"]
    },

    "web developer": {
        "junior": ["html", "css", "javascript", "bootstrap"],
        "mid": ["rest apis", "react", "node.js", "api integration"],
        "senior": ["frontend optimization", "backend integration", "scalable systems", "deployment pipelines"]
    }
}


def get_level_from_experience(exp_str):
    """Map an experience description to a seniority level.

    The number at the start of ``exp_str`` is read as a count of years.
    Decimals ("2.5 years"), trailing markers ("5+ years", "3years") and
    plain ``int``/``float`` values are accepted.

    Args:
        exp_str: Experience text such as "3 years", or a bare number.

    Returns:
        "junior" for up to 2 years, "mid" for more than 2 and up to 5
        years, and "senior" otherwise.

    Raises:
        ValueError: If ``exp_str`` does not start with a number.
    """
    import re  # local import so the block does not rely on file-level imports

    # Anchor at the start: the leading number is the year count, any later
    # digits (e.g. a calendar year further in the text) are ignored.
    match = re.match(r"\s*([+-]?\d+(?:\.\d+)?)", str(exp_str))
    if match is None:
        raise ValueError(
            f"Cannot parse years of experience from {exp_str!r}"
        )

    years = float(match.group(1))

    if years <= 2:
        return "junior"
    if years <= 5:
        return "mid"
    return "senior"



def _dedupe_case_insensitive(keywords):
    """Return keywords in order, keeping the first of each case-insensitive duplicate."""
    seen = set()
    unique = []
    for kw in keywords:
        key = kw.lower()
        if key not in seen:
            seen.add(key)
            unique.append(kw)
    return unique


def filter_and_supplement(role, level, raw_keywords):
    """Restrict keywords to the role's master skills and top them up.

    Args:
        role: Role name; matched against ``MASTER_SKILLS`` keys after
            lowercasing and stripping surrounding whitespace.
        level: Key into the role's entry in ``MASTER_SKILLS`` (the value
            returned by ``get_level_from_experience``).
        raw_keywords: Candidate keyword strings.

    Returns:
        At most 10 keywords without case-insensitive duplicates. For a
        known role: the raw keywords that appear in the role/level master
        list (original spelling kept), followed by up to 4 master skills
        that were not already present. For an unknown role: the raw
        keywords themselves, de-duplicated.

    Raises:
        KeyError: If ``level`` is not defined for a known role.
    """
    role_key = role.lower().strip()

    if role_key not in MASTER_SKILLS:
        # No reference list for this role: keep what we were given, but
        # apply the same de-duplication as the known-role path.
        return _dedupe_case_insensitive(raw_keywords)[:10]

    master = MASTER_SKILLS[role_key][level]
    # Sets give O(1) membership tests; build each one once.
    master_lower = {m.lower() for m in master}

    filtered = _dedupe_case_insensitive(
        kw for kw in raw_keywords if kw.lower() in master_lower
    )
    present = {kw.lower() for kw in filtered}

    # Top up with master skills the raw keywords did not cover.
    supplement = [kw for kw in master if kw.lower() not in present][:4]

    return _dedupe_case_insensitive(filtered + supplement)[:10]



def build_final_dataset(input_path="final_clean_dataset.jsonl",
                        output_path="final_lora_dataset.jsonl"):
    """Rewrite a JSONL dataset with level-aware, cleaned keyword lists.

    Each non-blank input line must be a JSON object with an ``"input"``
    field and an ``"output"`` object holding ``"role"``, ``"experience"``
    and ``"keywords"``. The keywords are replaced by the result of
    ``filter_and_supplement`` for the level derived from the experience
    string; role and experience are copied through unchanged.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected fields.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:

        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            obj = json.loads(line)
            output = obj["output"]

            role = output["role"]
            exp = output["experience"]
            raw_keywords = output["keywords"]

            level = get_level_from_experience(exp)
            final_keywords = filter_and_supplement(role, level, raw_keywords)

            cleaned = {
                "input": obj["input"],
                "output": {
                    "role": role,
                    "experience": exp,
                    "keywords": final_keywords
                }
            }

            outfile.write(json.dumps(cleaned) + "\n")

    print("Final LoRA dataset saved →", output_path)



# Entry point: build the dataset with the default paths when run directly.
if __name__ == "__main__":
    build_final_dataset()


✅ Final LoRA dataset saved → final_lora_dataset.jsonl


In [None]:
# Sanity check: every line of the file must parse as standalone JSON.
# Reports the first offending line and stops.
with open("final_lora_dataset.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error on line {i}: {e}")
            print("Line content:", line)
            break


Error on line 1: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
Line content: {



In [None]:
import json

# File read in full by convert_dataset() and split into JSON objects.
INPUT_PATH = "final_lora_dataset.jsonl"
# File written by convert_dataset(): one chat-format JSON record per line.
OUTPUT_PATH = "lora_messages_dataset.jsonl"


def extract_json_objects(text):
    """Split text into the top-level ``{...}`` JSON object substrings it contains.

    Objects may span several lines and may be separated by arbitrary text.
    Braces that occur inside JSON string values are not counted, and a
    stray closing brace outside any object is ignored.

    Args:
        text: Text containing zero or more concatenated JSON objects.

    Returns:
        A list of substrings, one per balanced top-level object, in order
        of appearance. The substrings are not parsed or validated here.
    """
    objs = []
    depth = 0
    start = None
    in_string = False
    escaped = False

    for i, ch in enumerate(text):
        if depth == 0:
            # Outside any object only an opening brace is meaningful; this
            # also keeps a stray '}' from driving the depth negative.
            if ch == '{':
                depth = 1
                start = i
            continue

        if in_string:
            if escaped:
                # Character after a backslash is literal (covers \" and \\).
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                objs.append(text[start:i + 1])
                start = None

    return objs


def build_assistant_text(output_obj):
    """Render an output record as the three-line assistant reply.

    Args:
        output_obj: Mapping that may contain ``"role"``, ``"experience"``
            and ``"keywords"``; missing fields render as empty.

    Returns:
        A string of the form
        ``"role: ...\\nexperience: ...\\nkeywords: a, b, c"``.
    """
    role = output_obj.get("role", "")
    experience = output_obj.get("experience", "")

    # ``or []`` also covers an explicit null value, which .get() would
    # otherwise pass through as None.
    keywords = output_obj.get("keywords") or []
    if isinstance(keywords, str):
        # A bare string is one keyword; joining it would split it into
        # individual characters.
        keywords = [keywords]

    # str() keeps join() from raising on non-string items such as numbers.
    keyword_text = ', '.join(str(kw) for kw in keywords)

    return (
        f"role: {role}\n"
        f"experience: {experience}\n"
        f"keywords: {keyword_text}"
    )


def convert_dataset():
    """Convert the dataset at INPUT_PATH into chat-format JSONL at OUTPUT_PATH.

    The input is read as a single string and split into JSON objects with
    ``extract_json_objects``, so objects may span multiple lines. Each
    object that parses becomes one output line of the form
    ``{"messages": [user message, assistant message]}``, where the user
    content is the object's ``"input"`` and the assistant content is built
    from its ``"output"`` by ``build_assistant_text``. Objects that fail
    to parse are counted and skipped. A summary is printed at the end.
    """
    print("Reading dataset...")
    with open(INPUT_PATH, "r", encoding="utf-8") as f:
        raw = f.read()

    print("Extracting JSON objects (brace counting)...")
    objs = extract_json_objects(raw)
    print(f"Found {len(objs)} JSON objects.")

    count = 0
    skipped = 0

    # Context manager guarantees the output file is closed even if a
    # record raises part-way through the loop.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for obj_text in objs:
            try:
                obj = json.loads(obj_text)
            except ValueError:
                # json.JSONDecodeError is a ValueError: malformed candidate.
                skipped += 1
                continue

            user_input = obj.get("input", "")
            output_obj = obj.get("output", {})
            assistant_text = build_assistant_text(output_obj)

            chat = {
                "messages": [
                    {"role": "user", "content": user_input},
                    {"role": "assistant", "content": assistant_text}
                ]
            }

            out.write(json.dumps(chat) + "\n")
            count += 1

    print("\nConversion complete!")
    print(f"✔ Valid objects converted: {count}")
    print(f"✖ Invalid objects skipped: {skipped}")
    print(f"Output file created: {OUTPUT_PATH}")


# Entry point: run the conversion when this cell/script is executed directly.
if __name__ == "__main__":
    convert_dataset()


Reading dataset...
Extracting JSON objects (brace counting)...
Found 324 JSON objects.

Conversion complete!
✔ Valid objects converted: 324
✖ Invalid objects skipped: 0
Output file created: lora_messages_dataset.jsonl
