In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv(r"C:\Users\Priyansh Singh\Desktop\Job Market\data\processed\jobs_for_analysis.csv")

In [3]:
cols_to_drop = ["job_id", "source", "url", "job_numeric_id"]
df = df.drop(columns=cols_to_drop, errors="ignore")

In [4]:
df

Unnamed: 0,title,company,posted_date,employment_type,description,salary_avg,city,seniority_level,clean_role
0,data analyst predictive analytics india x d...,Boston Consulting Group,2025-12-01,full-time,Locations: Bengaluru | Gurgaon\n\nWho We Are\n...,3.0,Gurugram,mid,"analytics engineer, data analyst"
1,data analyst trainee,SparkNovaTechSolutions,2025-12-02,full-time,ABOUT THE ROLE\n\nWe are seeking a highly moti...,7.0,Jaipur,mid,"ai engineer, analytics engineer, data analyst,..."
2,sr. data analyst adtech/omnichannel remote ...,Jobgether,2025-12-01,full-time,"Sr. Data Analyst, AdTech/Omnichannel (Remote, ...",1.0,IN,senior,"analytics engineer, data analyst"
3,data analyst ecommerce & d2c insights,Wholsum Foods (Slurrp Farm and Mille),2025-12-02,full-time,Job Description: Data Analyst - Ecommerce & D2...,4.5,Gurugram,mid,"analytics engineer, data analyst"
4,senior data analyst payments,Roku,,full-time,Teamwork makes the stream work.\n\nRoku is cha...,1.0,Bengaluru,senior,"analytics engineer, data analyst"
...,...,...,...,...,...,...,...,...,...
1382,data engineer python pyspark panda,LTIMindtree,2025-12-01,full-time,**Role Overview:**\n\nAs a Data Engineer with ...,7.5,Hyderabad,mid,data engineer
1383,lead tech python infra,D. E. Shaw India,2025-12-03,full-time,We are looking for a Software Developer to joi...,7.0,Hyderabad,lead,other
1384,senior python developer,Luxoft,2025-11-30,full-time,Project description\n\nWe're seeking a strong ...,6.0,Hyderabad,senior,python developer
1385,python developer with dagster,Staffington Global,2025-11-25,full-time,Job Description:-\n\nJob Role: Python Develope...,5.0,Hyderabad,mid,python developer


In [5]:
def clean_title_func(text: str) -> str:
    if pd.isna(text):
        return ""
    t = str(text).lower()
    # normalize common seniority abbreviations
    t = t.replace("sr.", " senior ")
    t = t.replace("sr ", " senior ")
    t = t.replace("jr.", " junior ")
    t = t.replace("jr ", " junior ")
    
    # remove common noise words that look like location / channel / misc
    remove_words = [
        "india", "remote", "hybrid", "x delivery", "delivery",
        "adtech", "omnichannel", "ecommerce", "insights"
    ]
    for w in remove_words:
        t = t.replace(w, " ")
    
    # collapse spaces
    t = " ".join(t.split())
    # Title case
    return t.title()

df["clean_title"] = df["title"].astype(str).apply(clean_title_func)

In [6]:
df

Unnamed: 0,title,company,posted_date,employment_type,description,salary_avg,city,seniority_level,clean_role,clean_title
0,data analyst predictive analytics india x d...,Boston Consulting Group,2025-12-01,full-time,Locations: Bengaluru | Gurgaon\n\nWho We Are\n...,3.0,Gurugram,mid,"analytics engineer, data analyst",Data Analyst Predictive Analytics
1,data analyst trainee,SparkNovaTechSolutions,2025-12-02,full-time,ABOUT THE ROLE\n\nWe are seeking a highly moti...,7.0,Jaipur,mid,"ai engineer, analytics engineer, data analyst,...",Data Analyst Trainee
2,sr. data analyst adtech/omnichannel remote ...,Jobgether,2025-12-01,full-time,"Sr. Data Analyst, AdTech/Omnichannel (Remote, ...",1.0,IN,senior,"analytics engineer, data analyst",Senior Data Analyst /
3,data analyst ecommerce & d2c insights,Wholsum Foods (Slurrp Farm and Mille),2025-12-02,full-time,Job Description: Data Analyst - Ecommerce & D2...,4.5,Gurugram,mid,"analytics engineer, data analyst",Data Analyst & D2C
4,senior data analyst payments,Roku,,full-time,Teamwork makes the stream work.\n\nRoku is cha...,1.0,Bengaluru,senior,"analytics engineer, data analyst",Senior Data Analyst Payments
...,...,...,...,...,...,...,...,...,...,...
1382,data engineer python pyspark panda,LTIMindtree,2025-12-01,full-time,**Role Overview:**\n\nAs a Data Engineer with ...,7.5,Hyderabad,mid,data engineer,Data Engineer Python Pyspark Panda
1383,lead tech python infra,D. E. Shaw India,2025-12-03,full-time,We are looking for a Software Developer to joi...,7.0,Hyderabad,lead,other,Lead Tech Python Infra
1384,senior python developer,Luxoft,2025-11-30,full-time,Project description\n\nWe're seeking a strong ...,6.0,Hyderabad,senior,python developer,Senior Python Developer
1385,python developer with dagster,Staffington Global,2025-11-25,full-time,Job Description:-\n\nJob Role: Python Develope...,5.0,Hyderabad,mid,python developer,Python Developer With Dagster


In [7]:
def clean_location_func(text: str) -> str:
    if pd.isna(text):
        return ""
    t = str(text).strip()
    t_low = t.lower()
    # normalize remote formats
    if "remote" in t_low:
        return "Remote"
    return t.title()

df["clean_location"] = df["city"].astype(str).apply(clean_location_func)


In [8]:
df

Unnamed: 0,title,company,posted_date,employment_type,description,salary_avg,city,seniority_level,clean_role,clean_title,clean_location
0,data analyst predictive analytics india x d...,Boston Consulting Group,2025-12-01,full-time,Locations: Bengaluru | Gurgaon\n\nWho We Are\n...,3.0,Gurugram,mid,"analytics engineer, data analyst",Data Analyst Predictive Analytics,Gurugram
1,data analyst trainee,SparkNovaTechSolutions,2025-12-02,full-time,ABOUT THE ROLE\n\nWe are seeking a highly moti...,7.0,Jaipur,mid,"ai engineer, analytics engineer, data analyst,...",Data Analyst Trainee,Jaipur
2,sr. data analyst adtech/omnichannel remote ...,Jobgether,2025-12-01,full-time,"Sr. Data Analyst, AdTech/Omnichannel (Remote, ...",1.0,IN,senior,"analytics engineer, data analyst",Senior Data Analyst /,In
3,data analyst ecommerce & d2c insights,Wholsum Foods (Slurrp Farm and Mille),2025-12-02,full-time,Job Description: Data Analyst - Ecommerce & D2...,4.5,Gurugram,mid,"analytics engineer, data analyst",Data Analyst & D2C,Gurugram
4,senior data analyst payments,Roku,,full-time,Teamwork makes the stream work.\n\nRoku is cha...,1.0,Bengaluru,senior,"analytics engineer, data analyst",Senior Data Analyst Payments,Bengaluru
...,...,...,...,...,...,...,...,...,...,...,...
1382,data engineer python pyspark panda,LTIMindtree,2025-12-01,full-time,**Role Overview:**\n\nAs a Data Engineer with ...,7.5,Hyderabad,mid,data engineer,Data Engineer Python Pyspark Panda,Hyderabad
1383,lead tech python infra,D. E. Shaw India,2025-12-03,full-time,We are looking for a Software Developer to joi...,7.0,Hyderabad,lead,other,Lead Tech Python Infra,Hyderabad
1384,senior python developer,Luxoft,2025-11-30,full-time,Project description\n\nWe're seeking a strong ...,6.0,Hyderabad,senior,python developer,Senior Python Developer,Hyderabad
1385,python developer with dagster,Staffington Global,2025-11-25,full-time,Job Description:-\n\nJob Role: Python Develope...,5.0,Hyderabad,mid,python developer,Python Developer With Dagster,Hyderabad


In [9]:
def infer_seniority(row) -> str:
    s = str(row.get("seniority_level", "")).lower()
    t = str(row.get("title", "")).lower()
    combined = s + " " + t
    
    if any(x in combined for x in ["lead", "principal", "senior"]):
        return "senior"
    if any(x in combined for x in ["junior", "intern", "trainee", "fresher"]):
        return "junior"
    return "mid"

df["clean_seniority"] = df.apply(infer_seniority, axis=1)

In [10]:
df

Unnamed: 0,title,company,posted_date,employment_type,description,salary_avg,city,seniority_level,clean_role,clean_title,clean_location,clean_seniority
0,data analyst predictive analytics india x d...,Boston Consulting Group,2025-12-01,full-time,Locations: Bengaluru | Gurgaon\n\nWho We Are\n...,3.0,Gurugram,mid,"analytics engineer, data analyst",Data Analyst Predictive Analytics,Gurugram,mid
1,data analyst trainee,SparkNovaTechSolutions,2025-12-02,full-time,ABOUT THE ROLE\n\nWe are seeking a highly moti...,7.0,Jaipur,mid,"ai engineer, analytics engineer, data analyst,...",Data Analyst Trainee,Jaipur,junior
2,sr. data analyst adtech/omnichannel remote ...,Jobgether,2025-12-01,full-time,"Sr. Data Analyst, AdTech/Omnichannel (Remote, ...",1.0,IN,senior,"analytics engineer, data analyst",Senior Data Analyst /,In,senior
3,data analyst ecommerce & d2c insights,Wholsum Foods (Slurrp Farm and Mille),2025-12-02,full-time,Job Description: Data Analyst - Ecommerce & D2...,4.5,Gurugram,mid,"analytics engineer, data analyst",Data Analyst & D2C,Gurugram,mid
4,senior data analyst payments,Roku,,full-time,Teamwork makes the stream work.\n\nRoku is cha...,1.0,Bengaluru,senior,"analytics engineer, data analyst",Senior Data Analyst Payments,Bengaluru,senior
...,...,...,...,...,...,...,...,...,...,...,...,...
1382,data engineer python pyspark panda,LTIMindtree,2025-12-01,full-time,**Role Overview:**\n\nAs a Data Engineer with ...,7.5,Hyderabad,mid,data engineer,Data Engineer Python Pyspark Panda,Hyderabad,mid
1383,lead tech python infra,D. E. Shaw India,2025-12-03,full-time,We are looking for a Software Developer to joi...,7.0,Hyderabad,lead,other,Lead Tech Python Infra,Hyderabad,senior
1384,senior python developer,Luxoft,2025-11-30,full-time,Project description\n\nWe're seeking a strong ...,6.0,Hyderabad,senior,python developer,Senior Python Developer,Hyderabad,senior
1385,python developer with dagster,Staffington Global,2025-11-25,full-time,Job Description:-\n\nJob Role: Python Develope...,5.0,Hyderabad,mid,python developer,Python Developer With Dagster,Hyderabad,mid


In [13]:
df.to_csv("jobs_for_model.csv", index=False)
print("Saved jobs_for_model.csv")

Saved jobs_for_model.csv
