In [4]:
import pandas as pd
import re

# Load the scraped Glassdoor listings; point this at your local copy of the CSV.
df = pd.read_csv("glassdoor_jobs.csv")

# Keep only rows that carry a usable 'Salary Estimate': the value must be
# non-missing and must not be the literal placeholder 'N/A'. The filtered rows
# are returned as a new frame with a fresh 0..n-1 index.
# NOTE(review): read_csv's default NA parsing may already turn 'N/A' cells into
# NaN, in which case the second condition is redundant but harmless — confirm.
df_cleaned = (
    df.loc[lambda frame: frame['Salary Estimate'].notna() & frame['Salary Estimate'].ne('N/A')]
    .reset_index(drop=True)
)



In [5]:
# Display the cleaned frame to confirm that rows without a salary estimate are gone.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,
...,...,...,...,...,...,...
791,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,
792,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,
793,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,
794,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,


In [8]:
def extract_numeric_salary(text):
    """Reduce a raw salary string to its bare numbers and pay period.

    The text is lower-cased and searched first for a range ("₹3L – ₹4L/yr"),
    then for a single amount ("₹20K/mo"). The currency sign and the L/K
    magnitude suffix are dropped; only the digits and the period that follows
    the slash are kept.

    Parameters
    ----------
    text : str
        Raw salary text, e.g. "₹3L – ₹4L/yr (Glassdoor Est.)".

    Returns
    -------
    str or None
        "<min> – <max>/<period>" for a range, "<value>/<period>" for a single
        amount, or None when `text` is not a string or holds no recognisable
        salary.
    """
    # Missing cells arrive as None or float NaN; .lower() would raise on them,
    # so report them the same way as "no salary found".
    if not isinstance(text, str):
        return None

    text = text.lower()

    # Range first: the single-value pattern would otherwise match only the
    # upper bound of a range.
    match_range = re.search(r'₹?(\d+)[lk]?\s*–\s*₹?(\d+)[lk]?/([a-z]+)', text)
    if match_range:
        low, high, period = match_range.groups()
        return f"{low} – {high}/{period}"

    # Single salary value
    match_single = re.search(r'₹?(\d+)[lk]?/([a-z]+)', text)
    if match_single:
        amount, period = match_single.groups()
        return f"{amount}/{period}"

    return None

# Map every raw salary estimate to its simplified "<numbers>/<period>" form
df_cleaned['Cleaned Salary'] = df_cleaned['Salary Estimate'].map(extract_numeric_salary)



In [9]:
# Display the frame again to inspect the new 'Cleaned Salary' column.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr
...,...,...,...,...,...,...,...
791,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr
792,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr
793,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr
794,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr


In [10]:
def is_monthly(salary_str):
    """Flag salaries quoted per month.

    Returns 1 when `salary_str` is non-empty and contains '/mo', otherwise 0
    (including for None or an empty string).
    """
    return int(bool(salary_str) and '/mo' in salary_str)

# 1 for rows whose cleaned salary is quoted per month, 0 otherwise
df_cleaned['monthly'] = df_cleaned['Cleaned Salary'].map(is_monthly)

In [16]:
# Preview up to 20 rows to spot-check the 'monthly' flag against 'Cleaned Salary'.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr,0
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr,0
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0
5,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr,0
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1
8,Data Scientist,,India,₹20K – ₹80K/mo (Employer provided),,,20 – 80/mo,1
9,Data Scientist,,Remote,₹4L – ₹11L/yr (Employer provided),,,4 – 11/yr,0


In [20]:
def extract_min_max_in_rupees(text):
    """Parse a salary string into (min, max) amounts, scaled by their L/K suffix.

    Recognises a range ("₹3L – ₹4L/yr") or a single amount ("₹20K/mo"). A
    trailing "L" multiplies the number by 100000 and "K" by 1000,
    case-insensitively. A number with no suffix is returned unscaled, so the
    suffix must still be present in `text` for the result to be in rupees.
    The pay period after the slash is required for a match but is not used to
    rescale the amounts.

    Parameters
    ----------
    text : str
        Salary text such as "₹3L – ₹4L/yr (Glassdoor Est.)" or "₹20K/mo".

    Returns
    -------
    tuple
        (min, max) as ints; both equal for a single amount. (None, None) when
        `text` is empty, not a string, or holds no recognisable salary.
    """
    # None, NaN and "" all mean "nothing to parse".
    if not isinstance(text, str) or not text:
        return None, None

    # Matching is done on lower-cased text, so the unit letters in the patterns
    # and in this table must be lower-case as well.
    multipliers = {'l': 100000, 'k': 1000}

    def to_number(value, unit):
        # `unit` is None when the amount had no L/K suffix.
        return int(value) * multipliers.get(unit, 1)

    lowered = text.lower()

    # Range first: the single-value pattern would otherwise match only the
    # upper bound of a range.
    match_range = re.search(r'₹?(\d+)([lk])?\s*–\s*₹?(\d+)([lk])?/([a-z]+)', lowered)
    if match_range:
        min_val = to_number(match_range.group(1), match_range.group(2))
        max_val = to_number(match_range.group(3), match_range.group(4))
        return min_val, max_val

    match_single = re.search(r'₹?(\d+)([lk])?/([a-z]+)', lowered)
    if match_single:
        val = to_number(match_single.group(1), match_single.group(2))
        return val, val

    return None, None

# 6. Apply to get min, max, and average salary.
# Each row's (min, max) tuple is wrapped in a pd.Series so that apply() expands
# it into two columns, which are assigned to the two new column names.
# NOTE(review): the input is 'Cleaned Salary', from which extract_numeric_salary
# has already dropped the L/K suffix (e.g. "3 – 4/yr"), so no lakh/thousand
# scaling can happen on this column — confirm whether 'Salary Estimate' was the
# intended input.
# NOTE(review): the saved preview of this frame shows the three *_rupees columns
# empty — re-run from a fresh kernel to confirm what this cell produces.
df_cleaned[['min_salary_rupees', 'max_salary_rupees']] = df_cleaned['Cleaned Salary'].apply(
    lambda x: pd.Series(extract_min_max_in_rupees(x))
)

# Midpoint of the range; equals the single value when min and max coincide.
df_cleaned['avg_salary_rupees'] = (df_cleaned['min_salary_rupees'] + df_cleaned['max_salary_rupees']) / 2


In [21]:
# Preview up to 20 rows to check the new *_rupees salary columns.
# NOTE(review): the saved output also lists min_salary / max_salary / avg_salary,
# which no visible cell creates — presumably from cells not shown here; verify
# the notebook reproduces them under Restart Kernel -> Run All.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary,avg_salary,min_salary_rupees,max_salary_rupees,avg_salary_rupees
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0,3,4,3.5,,,
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0,7,9,8.0,,,
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr,0,4,4,4.0,,,
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr,0,4,4,4.0,,,
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0,4,6,5.0,,,
5,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr,0,5,10,7.5,,,
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0,6,19,12.5,,,
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1,20,20,20.0,,,
8,Data Scientist,,India,₹20K – ₹80K/mo (Employer provided),,,20 – 80/mo,1,20,80,50.0,,,
9,Data Scientist,,Remote,₹4L – ₹11L/yr (Employer provided),,,4 – 11/yr,0,4,11,7.5,,,
