In [1]:
import pandas as pd
import re

# Load the scraped Glassdoor listings (replace the path with your own file).
df = pd.read_csv("glassdoor_jobs_final.csv")

# A row has a usable salary when 'Salary Estimate' is neither a real NaN nor
# the literal placeholder string 'N/A'.
has_salary = df['Salary Estimate'].notna() & (df['Salary Estimate'] != 'N/A')

# Filter once and renumber the rows. reset_index(drop=True) returns a fresh
# frame, so nothing is mutated in place and re-running the cell is idempotent.
df_cleaned = df.loc[has_salary].reset_index(drop=True)



In [2]:
# Inspect the frame after rows without a salary estimate were removed.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,
1,Data Science Trainer,,India,₹25K – ₹35K/mo (Employer provided),,
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,
...,...,...,...,...,...,...
437,CFL Data Scientist,SUSE\n4.0,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.0,
438,Principal Member of technical Staff- AI & Data...,Athenahealth\n4.1,Chennai,₹5L – ₹8L/yr (Glassdoor Est.),4.1,
439,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,
440,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,


In [3]:
def extract_numeric_salary(text):
    """Reduce a raw Glassdoor salary string to its numbers and pay period.

    Examples:
        '₹10L – ₹23L/yr (Employer provided)' -> '10 – 23/yr'
        '₹35K/mo (Employer provided)'        -> '35/mo'

    Args:
        text: Raw 'Salary Estimate' value.

    Returns:
        '<min> – <max>/<period>' for a range, '<value>/<period>' for a single
        figure, or None when `text` is not a string or no salary pattern is
        found.

    Note:
        The L/K unit letter is matched but not captured, so it does not appear
        in the result. Amounts are matched as whole-number digits only;
        decimal figures such as '4.5L' are not supported.
    """
    # Missing values arrive as float NaN or None and carry no salary text.
    if not isinstance(text, str):
        return None
    text = text.lower()

    # Range form. Accept an en dash, em dash or ASCII hyphen as the separator
    # so a hyphenated range is not mistaken for a single value (which would
    # silently keep only its upper bound).
    match_range = re.search(r'₹?(\d+)[lk]?\s*[–—-]\s*₹?(\d+)[lk]?/([a-z]+)', text)
    if match_range:
        low, high, period = match_range.groups()
        # Always emit an en dash: extract_min_max splits on that character.
        return f"{low} – {high}/{period}"

    # Single-value form, e.g. '₹10l/yr'.
    match_single = re.search(r'₹?(\d+)[lk]?/([a-z]+)', text)
    if match_single:
        value, period = match_single.groups()
        return f"{value}/{period}"
    return None

# Apply the extractor to every raw 'Salary Estimate' string and store the
# result in 'Cleaned Salary'; strings with no recognisable salary become None.
df_cleaned['Cleaned Salary'] = df_cleaned['Salary Estimate'].apply(extract_numeric_salary)



In [4]:
# Compare the new 'Cleaned Salary' column with the raw 'Salary Estimate'.
df_cleaned

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,,10 – 23/yr
1,Data Science Trainer,,India,₹25K – ₹35K/mo (Employer provided),,,25 – 35/mo
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,,5 – 10/yr
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,,10/yr
...,...,...,...,...,...,...,...
437,CFL Data Scientist,SUSE\n4.0,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.0,,4 – 10/yr
438,Principal Member of technical Staff- AI & Data...,Athenahealth\n4.1,Chennai,₹5L – ₹8L/yr (Glassdoor Est.),4.1,,5 – 8/yr
439,Python/AI Developer (FastAPI & MongoDB Special...,,Mohali,₹4L/yr (Employer provided),,,4/yr
440,NLP Project Manager,,Coimbatore,₹4L/yr (Employer provided),,,4/yr


In [5]:
def is_monthly(salary_str):
    """Return 1 when a cleaned salary string is quoted per month, else 0.

    Args:
        salary_str: Value from 'Cleaned Salary' such as '25 – 35/mo'. May be
            None or NaN for listings whose salary could not be parsed.

    Returns:
        int: 1 if the string contains '/mo', otherwise 0 (including for
        missing or non-string input).
    """
    # A float NaN is truthy, so a plain truthiness test would reach the `in`
    # check and raise TypeError; only real strings are inspected.
    if isinstance(salary_str, str) and '/mo' in salary_str:
        return 1
    return 0

# Flag each listing: 1 when its cleaned salary is quoted per month, 0 otherwise.
df_cleaned['monthly'] = df_cleaned['Cleaned Salary'].apply(is_monthly)

In [6]:
# Preview the first 20 rows to check the new 'monthly' flag.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,,10 – 23/yr,0
1,Data Science Trainer,,India,₹25K – ₹35K/mo (Employer provided),,,25 – 35/mo,1
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,,5 – 10/yr,0
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,,10/yr,0
5,AI Developer,,Hyderābād,₹6L – ₹8L/yr (Employer provided),,,6 – 8/yr,0
6,Lead Data Analytics Analyst (Vice President),Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0
7,AI Operations and Optimization Manager,Ecolab Inc.\n3.7,Pune,₹5L – ₹6L/yr (Glassdoor Est.),3.7,,5 – 6/yr,0
8,"Data Scientist II, Last Mile Science",ADCI - Karnataka\n3.6,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.6,,5 – 10/yr,0
9,Data Science Specialist - OptimusAI (Mining/Me...,McKinsey & Company\n4.1,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.1,,4 – 10/yr,0


In [7]:
def extract_min_max(s):
    """Split a cleaned salary string into integer lower and upper bounds.

    Args:
        s: '<min> – <max>/<period>' or '<value>/<period>' string, or a
            missing value (None / NaN).

    Returns:
        pd.Series: two elements, [min, max]. A single figure is used for both
        bounds. [None, None] when `s` is missing or cannot be parsed.
    """
    if pd.isna(s):
        return pd.Series([None, None])
    try:
        # Drop the '/<period>' suffix, then split the range on the en dash.
        parts = s.split('/')[0].split('–')
        min_salary = int(parts[0].strip())
        max_salary = int(parts[1].strip()) if len(parts) > 1 else min_salary
    except (AttributeError, TypeError, ValueError):
        # Best effort: non-string or non-numeric input yields missing bounds.
        # Only parsing errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return pd.Series([None, None])
    return pd.Series([min_salary, max_salary])

# extract_min_max returns a two-element Series per row, so apply() yields a
# two-column frame that is assigned to 'min_salary' and 'max_salary'.
df_cleaned[['min_salary', 'max_salary']] = df_cleaned['Cleaned Salary'].apply(extract_min_max)


In [8]:
# Preview the first 20 rows to check the 'min_salary' / 'max_salary' columns.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,,10 – 23/yr,0,10,23
1,Data Science Trainer,,India,₹25K – ₹35K/mo (Employer provided),,,25 – 35/mo,1,25,35
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,,5 – 10/yr,0,5,10
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,,10/yr,0,10,10
5,AI Developer,,Hyderābād,₹6L – ₹8L/yr (Employer provided),,,6 – 8/yr,0,6,8
6,Lead Data Analytics Analyst (Vice President),Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10
7,AI Operations and Optimization Manager,Ecolab Inc.\n3.7,Pune,₹5L – ₹6L/yr (Glassdoor Est.),3.7,,5 – 6/yr,0,5,6
8,"Data Scientist II, Last Mile Science",ADCI - Karnataka\n3.6,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.6,,5 – 10/yr,0,5,10
9,Data Science Specialist - OptimusAI (Mining/Me...,McKinsey & Company\n4.1,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.1,,4 – 10/yr,0,4,10


In [9]:
def convert_to_rupees(row):
    """Scale a row's min/max salary figures to plain rupee amounts.

    Rows with monthly == 1 are multiplied by 1000, every other row by 100000.
    The result keeps the row's own pay period: monthly rows are not
    annualised.

    NOTE(review): the factor is chosen from the 'monthly' flag alone,
    presumably because monthly figures are quoted in thousands (K) and yearly
    figures in lakhs (L). The unit letter of the original text is not
    consulted, so confirm the data has no 'K/yr' or 'L/mo' listings.

    Args:
        row: Mapping or Series providing 'monthly', 'min_salary' and
            'max_salary'.

    Returns:
        pd.Series: two elements, [min_rupees, max_rupees].
    """
    if row['monthly'] == 1:
        scale = 1000
    else:
        scale = 100000
    return pd.Series([row[column] * scale for column in ('min_salary', 'max_salary')])

# Scale both bounds row by row (axis=1 passes each row to convert_to_rupees).
df_cleaned[['min_salary_rupees', 'max_salary_rupees']] = df_cleaned.apply(convert_to_rupees, axis=1)
# The average is the midpoint of the rupee range.
df_cleaned['avg_salary_rupees'] = (df_cleaned['min_salary_rupees'] + df_cleaned['max_salary_rupees']) / 2


In [10]:
# Preview the first 20 rows to check the rupee-denominated columns.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary,min_salary_rupees,max_salary_rupees,avg_salary_rupees
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,,10 – 23/yr,0,10,23,1000000,2300000,1650000.0
1,Data Science Trainer,,India,₹25K – ₹35K/mo (Employer provided),,,25 – 35/mo,1,25,35,25000,35000,30000.0
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10,600000,1000000,800000.0
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,,5 – 10/yr,0,5,10,500000,1000000,750000.0
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,,10/yr,0,10,10,1000000,1000000,1000000.0
5,AI Developer,,Hyderābād,₹6L – ₹8L/yr (Employer provided),,,6 – 8/yr,0,6,8,600000,800000,700000.0
6,Lead Data Analytics Analyst (Vice President),Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10,600000,1000000,800000.0
7,AI Operations and Optimization Manager,Ecolab Inc.\n3.7,Pune,₹5L – ₹6L/yr (Glassdoor Est.),3.7,,5 – 6/yr,0,5,6,500000,600000,550000.0
8,"Data Scientist II, Last Mile Science",ADCI - Karnataka\n3.6,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.6,,5 – 10/yr,0,5,10,500000,1000000,750000.0
9,Data Science Specialist - OptimusAI (Mining/Me...,McKinsey & Company\n4.1,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.1,,4 – 10/yr,0,4,10,400000,1000000,700000.0


In [11]:
# Keep only the first line of 'Company Name' (the text before the newline).
# na_action='ignore' leaves missing names as NaN instead of converting them to
# the string 'nan', so no sentinel-string comparison is needed and a company
# whose name really is "Nan" is not discarded.
df_cleaned['company_name_cleaned'] = df_cleaned['Company Name'].map(
    lambda x: str(x).split('\n')[0].strip(), na_action='ignore'
)

# Drop listings with no usable company name: missing, or blank after stripping.
df_cleaned = df_cleaned[
    df_cleaned['company_name_cleaned'].notna()
    & (df_cleaned['company_name_cleaned'] != "")
]



In [12]:
# Preview the first 20 rows to check 'company_name_cleaned'; the row index now
# has gaps where listings without a company name were removed.
df_cleaned.head(20)

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Cleaned Salary,monthly,min_salary,max_salary,min_salary_rupees,max_salary_rupees,avg_salary_rupees,company_name_cleaned
0,Data Scientist,Nirmalya Labs\n4.3,Bhubaneshwar,₹10L – ₹23L/yr (Employer provided),4.3,,10 – 23/yr,0,10,23,1000000,2300000,1650000.0,Nirmalya Labs
2,Data Analytics Lead Analyst - C13 - BANGALORE,Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10,600000,1000000,800000.0,Citi
3,Business Analyst – Digital Analytics ( Adobe ),Citi\n3.7,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.7,,5 – 10/yr,0,5,10,500000,1000000,750000.0,Citi
4,"Sr. Consultant, Data Science and Analytics",TransUnion\n4.0,Bengaluru,₹10L/yr (Glassdoor Est.),4.0,,10/yr,0,10,10,1000000,1000000,1000000.0,TransUnion
6,Lead Data Analytics Analyst (Vice President),Citi\n3.7,Bengaluru,₹6L – ₹10L/yr (Glassdoor Est.),3.7,,6 – 10/yr,0,6,10,600000,1000000,800000.0,Citi
7,AI Operations and Optimization Manager,Ecolab Inc.\n3.7,Pune,₹5L – ₹6L/yr (Glassdoor Est.),3.7,,5 – 6/yr,0,5,6,500000,600000,550000.0,Ecolab Inc.
8,"Data Scientist II, Last Mile Science",ADCI - Karnataka\n3.6,Bengaluru,₹5L – ₹10L/yr (Glassdoor Est.),3.6,,5 – 10/yr,0,5,10,500000,1000000,750000.0,ADCI - Karnataka
9,Data Science Specialist - OptimusAI (Mining/Me...,McKinsey & Company\n4.1,Bengaluru,₹4L – ₹10L/yr (Glassdoor Est.),4.1,,4 – 10/yr,0,4,10,400000,1000000,700000.0,McKinsey & Company
10,Senior Data Scientist,Gartner\n4.0,Gurgaon,₹7L – ₹9L/yr (Glassdoor Est.),4.0,,7 – 9/yr,0,7,9,700000,900000,800000.0,Gartner
12,Data Analyst,Stratefix Consulting\n3.8,Surat,₹35K/mo (Employer provided),3.8,,35/mo,1,35,35,35000,35000,35000.0,Stratefix Consulting


In [13]:
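# Optional checkpoint (disabled): uncomment to export the frame before the
# raw and intermediate columns are dropped.
# df_cleaned.to_csv("glassdoor_cleaned_withall_compname.csv", index=False)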

In [14]:
# Column dtypes and non-null counts before the helper columns are dropped.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338 entries, 0 to 438
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Job Title             338 non-null    object 
 1   Company Name          338 non-null    object 
 2   Location              323 non-null    object 
 3   Salary Estimate       338 non-null    object 
 4   Rating                324 non-null    float64
 5   Job Description       0 non-null      float64
 6   Cleaned Salary        338 non-null    object 
 7   monthly               338 non-null    int64  
 8   min_salary            338 non-null    int64  
 9   max_salary            338 non-null    int64  
 10  min_salary_rupees     338 non-null    int64  
 11  max_salary_rupees     338 non-null    int64  
 12  avg_salary_rupees     338 non-null    float64
 13  company_name_cleaned  338 non-null    object 
dtypes: float64(3), int64(5), object(6)
memory usage: 39.6+ KB


In [15]:
# Drop the raw and intermediate columns in a single call (one copy of the
# frame instead of six); the derived columns that replace them are kept.
df_cleaned = df_cleaned.drop(columns=[
    'Company Name',
    'Salary Estimate',
    'Job Description',
    'Cleaned Salary',
    'min_salary',
    'max_salary',
])




In [16]:
# Confirm which columns remain after the drop and their non-null counts.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338 entries, 0 to 438
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Job Title             338 non-null    object 
 1   Location              323 non-null    object 
 2   Rating                324 non-null    float64
 3   monthly               338 non-null    int64  
 4   min_salary_rupees     338 non-null    int64  
 5   max_salary_rupees     338 non-null    int64  
 6   avg_salary_rupees     338 non-null    float64
 7   company_name_cleaned  338 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 23.8+ KB


In [17]:
# Preview the first 20 rows of the slimmed-down frame.
df_cleaned.head(20)

Unnamed: 0,Job Title,Location,Rating,monthly,min_salary_rupees,max_salary_rupees,avg_salary_rupees,company_name_cleaned
0,Data Scientist,Bhubaneshwar,4.3,0,1000000,2300000,1650000.0,Nirmalya Labs
2,Data Analytics Lead Analyst - C13 - BANGALORE,Bengaluru,3.7,0,600000,1000000,800000.0,Citi
3,Business Analyst – Digital Analytics ( Adobe ),Bengaluru,3.7,0,500000,1000000,750000.0,Citi
4,"Sr. Consultant, Data Science and Analytics",Bengaluru,4.0,0,1000000,1000000,1000000.0,TransUnion
6,Lead Data Analytics Analyst (Vice President),Bengaluru,3.7,0,600000,1000000,800000.0,Citi
7,AI Operations and Optimization Manager,Pune,3.7,0,500000,600000,550000.0,Ecolab Inc.
8,"Data Scientist II, Last Mile Science",Bengaluru,3.6,0,500000,1000000,750000.0,ADCI - Karnataka
9,Data Science Specialist - OptimusAI (Mining/Me...,Bengaluru,4.1,0,400000,1000000,700000.0,McKinsey & Company
10,Senior Data Scientist,Gurgaon,4.0,0,700000,900000,800000.0,Gartner
12,Data Analyst,Surat,3.8,1,35000,35000,35000.0,Stratefix Consulting


In [19]:
# Remove rows that are exact duplicates across all remaining columns.
df_cleaned = df_cleaned.drop_duplicates()


In [20]:
# (rows, columns) after de-duplication.
df_cleaned.shape

(338, 8)

In [21]:
# Write the cleaned frame to disk; index=False leaves the row index out of the
# file.
df_cleaned.to_csv("glassdoor_cleaned.csv", index=False)
