In [1]:
import pandas as pd
import re

df = pd.read_csv("glassdoor_jobs.csv")  # Replace with your actual file path

# Keep only rows with a usable 'Salary Estimate': the value must be present
# (not NaN) and must not be the literal string 'N/A'. The surviving rows are
# renumbered from 0 so the index has no gaps.
df_cleaned = df[
    df['Salary Estimate'].notna() & (df['Salary Estimate'] != 'N/A')
].reset_index(drop=True)



In [2]:
# Show the filtered frame to check which rows remain after the salary filter.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,
...,...,...,...,...,...,...
791,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,
792,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,
793,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,
794,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,


In [3]:
def extract_numeric_salary(text):
    """Reduce a raw salary string to its numeric bounds and pay period.

    Parameters
    ----------
    text : str
        Raw salary text such as ``"₹3L – ₹4L/yr (Glassdoor Est.)"`` or
        ``"₹20K/mo (Employer provided)"``. Matching is case-insensitive.

    Returns
    -------
    str or None
        ``"<min> – <max>/<period>"`` for a range, ``"<amount>/<period>"`` for
        a single value, or ``None`` when ``text`` is not a string or contains
        no recognizable salary.

    Notes
    -----
    The optional ``L``/``K`` suffix is matched but not captured, so the
    returned string carries no unit. Only whole-number amounts are
    recognized (``\\d+``).
    """
    # Missing values (None, NaN) and other non-strings have no .lower() and
    # cannot hold a salary; report "no match" instead of raising.
    if not isinstance(text, str):
        return None
    text = text.lower()
    # Match salary ranges. The separator may be an en dash, an em dash or a
    # plain hyphen; the result is always normalized to an en dash so that
    # downstream parsing sees a single format.
    match_range = re.search(r'₹?(\d+)[lk]?\s*[–—-]\s*₹?(\d+)[lk]?/([a-z]+)', text)
    if match_range:
        return f"{match_range.group(1)} – {match_range.group(2)}/{match_range.group(3)}"
    # Match single salary value
    match_single = re.search(r'₹?(\d+)[lk]?/([a-z]+)', text)
    if match_single:
        return f"{match_single.group(1)}/{match_single.group(2)}"
    return None

# Derive the 'Cleaned Salary' column by running the extractor on every
# 'Salary Estimate' value.
df_cleaned['Cleaned Salary'] = df_cleaned['Salary Estimate'].map(extract_numeric_salary)



In [4]:
# Show the frame again, now with the 'Cleaned Salary' column added.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr
...,...,...,...,...,...,...,...
791,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr
792,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr
793,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr
794,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr


In [5]:
def is_monthly(salary_str):
    """Flag whether a cleaned salary string is quoted per month.

    Parameters
    ----------
    salary_str : str or None
        Cleaned salary text such as ``"20 – 80/mo"`` or ``"3 – 4/yr"``.

    Returns
    -------
    int
        ``1`` if ``salary_str`` is a string containing ``'/mo'``, otherwise
        ``0`` (including for ``None``, NaN and any other non-string value).
    """
    # Test the type rather than truthiness: NaN is a truthy float, and
    # `'/mo' in nan` would raise TypeError.
    if isinstance(salary_str, str) and '/mo' in salary_str:
        return 1
    return 0

# 1 where the cleaned salary is quoted per month, 0 otherwise.
df_cleaned['monthly'] = df_cleaned['Cleaned Salary'].map(is_monthly)

In [6]:
# Preview the first rows, including the new 'monthly' flag.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr,0
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr,0
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0
5,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr,0
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1
8,Data Scientist,,India,₹20K – ₹80K/mo (Employer provided),,,20 – 80/mo,1
9,Data Scientist,,Remote,₹4L – ₹11L/yr (Employer provided),,,4 – 11/yr,0


In [7]:
def extract_min_max(s):
    """Split a cleaned salary string into integer lower and upper bounds.

    Parameters
    ----------
    s : str or missing
        Cleaned salary text such as ``"3 – 4/yr"`` or ``"20/mo"``. Everything
        after the first ``'/'`` is ignored; the bounds are separated by an
        en dash.

    Returns
    -------
    pandas.Series
        Two elements, ``[min_salary, max_salary]``. A single value is used
        for both bounds. ``[None, None]`` is returned when ``s`` is missing
        or cannot be parsed.
    """
    if pd.isna(s):
        return pd.Series([None, None])
    try:
        # Only the first two pieces are parsed, so any extra separators
        # further along the string are ignored.
        bounds = [int(piece.strip()) for piece in s.split('/')[0].split('–')[:2]]
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: `s` is not a string; ValueError: a piece
        # is not an integer. Other exceptions (e.g. KeyboardInterrupt) are
        # deliberately allowed to propagate.
        return pd.Series([None, None])
    # With a single value, bounds[0] and bounds[-1] are the same element.
    return pd.Series([bounds[0], bounds[-1]])

# extract_min_max returns a two-element Series per row, so apply() yields a
# two-column frame; its columns are assigned to 'min_salary' and 'max_salary'.
df_cleaned[['min_salary', 'max_salary']] = df_cleaned['Cleaned Salary'].apply(extract_min_max)


In [8]:
# Preview the first rows, including the new 'min_salary' / 'max_salary' columns.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0,3,4
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0,7,9
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr,0,4,4
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr,0,4,4
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0,4,6
5,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr,0,5,10
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0,6,19
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1,20,20
8,Data Scientist,,India,₹20K – ₹80K/mo (Employer provided),,,20 – 80/mo,1,20,80
9,Data Scientist,,Remote,₹4L – ₹11L/yr (Employer provided),,,4 – 11/yr,0,4,11


In [9]:
def convert_to_rupees(row):
    """Scale a row's salary bounds by a factor chosen from its 'monthly' flag.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'monthly'``, ``'min_salary'`` and ``'max_salary'``.

    Returns
    -------
    pandas.Series
        ``[min_salary * factor, max_salary * factor]`` where the factor is
        1000 when ``row['monthly'] == 1`` and 100000 otherwise.

    Notes
    -----
    The factor depends only on the ``monthly`` flag. This presumably assumes
    monthly figures are quoted in thousands (K) and all others in lakhs (L);
    the original unit suffix is not consulted here — confirm against the data.
    """
    if row['monthly'] == 1:
        multiplier = 1000
    else:
        multiplier = 100000
    scaled = [row[bound] * multiplier for bound in ('min_salary', 'max_salary')]
    return pd.Series(scaled)

# Scale both salary bounds row by row, then store their midpoint alongside them.
df_cleaned[['min_salary_rupees', 'max_salary_rupees']] = df_cleaned.apply(
    convert_to_rupees, axis=1
)
df_cleaned['avg_salary_rupees'] = (
    df_cleaned['min_salary_rupees'].add(df_cleaned['max_salary_rupees']).div(2)
)


In [10]:
# Preview the first rows, including the scaled bounds and their average.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary,min_salary_rupees,max_salary_rupees,avg_salary_rupees
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0,3,4,300000,400000,350000.0
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0,7,9,700000,900000,800000.0
2,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr,0,4,4,400000,400000,400000.0
3,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr,0,4,4,400000,400000,400000.0
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0,4,6,400000,600000,500000.0
5,Data Scientist,,Remote,₹5L – ₹10L/yr (Employer provided),,,5 – 10/yr,0,5,10,500000,1000000,750000.0
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0,6,19,600000,1900000,1250000.0
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1,20,20,20000,20000,20000.0
8,Data Scientist,,India,₹20K – ₹80K/mo (Employer provided),,,20 – 80/mo,1,20,80,20000,80000,50000.0
9,Data Scientist,,Remote,₹4L – ₹11L/yr (Employer provided),,,4 – 11/yr,0,4,11,400000,1100000,750000.0


In [11]:
# Clean 'Company Name' by removing the newline and anything after it.
# na_action='ignore' leaves missing names as NaN instead of converting them to
# the string 'nan', so they can be detected with notna() directly and a company
# whose name merely spells "Nan" is not discarded by a string comparison.
company_names = df_cleaned['Company Name'].map(
    lambda name: str(name).split('\n')[0].strip(),
    na_action='ignore',
)

# Keep only rows that have a non-missing, non-empty company name.
has_company = company_names.notna() & (company_names != "")
df_cleaned = df_cleaned.assign(company_name_cleaned=company_names).loc[has_company]



In [12]:
# Preview the rows that remain after the company-name filter.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary,min_salary_rupees,max_salary_rupees,avg_salary_rupees,company_name_cleaned
0,Data Scientist - C13 - VP - PUNE,Citi\n3.7,Pune,₹3L – ₹4L/yr (Glassdoor Est.),3.7,,3 – 4/yr,0,3,4,300000,400000,350000.0,Citi
1,Engineer – Marketing Data & AI,Electrolux\n4.0,Bengaluru,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0,7,9,700000,900000,800000.0,Electrolux
4,DATA SCIENTIST : 0-2 Years (FRESHERS),REIZEND PRIVATE LIMITED\n3.6,India,₹4L – ₹6L/yr (Employer provided),3.6,,4 – 6/yr,0,4,6,400000,600000,500000.0,REIZEND PRIVATE LIMITED
6,Data Scientist,Confident\n4.1,Chennai,₹6L – ₹19L/yr (Employer provided),4.1,,6 – 19/yr,0,6,19,600000,1900000,1250000.0,Confident
7,Data Scientist Intern,SoulPage IT Solutions\n4.4,Hyderābād,₹20K/mo (Employer provided),4.4,,20/mo,1,20,20,20000,20000,20000.0,SoulPage IT Solutions
12,Data Scientist,Spotify\n4.0,Mumbai,₹3L – ₹9L/yr (Glassdoor Est.),4.0,,3 – 9/yr,0,3,9,300000,900000,600000.0,Spotify
13,Data Scientist,ITSWS Technologies Pvt. Ltd.\n4.0,Remote,₹60K – ₹70K/mo (Employer provided),4.0,,60 – 70/mo,1,60,70,60000,70000,65000.0,ITSWS Technologies Pvt. Ltd.
14,Junior Data Scientist,LSEG (London Stock Exchange Group)\n3.7,Bengaluru,₹7L – ₹10L/yr (Glassdoor Est.),3.7,,7 – 10/yr,0,7,10,700000,1000000,850000.0,LSEG (London Stock Exchange Group)
16,Data Scientist 1,PayPal\n3.7,Bengaluru,₹2L – ₹4L/yr (Glassdoor Est.),3.7,,2 – 4/yr,0,2,4,200000,400000,300000.0,PayPal
18,Data Scientist,Qualys\n3.6,Pune,₹10L/yr (Glassdoor Est.),3.6,,10/yr,0,10,10,1000000,1000000,1000000.0,Qualys


In [13]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
df_cleaned.to_csv("glassdoor_cleaned_withall_compname.csv", index=False)
