In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('naukri_jobs.csv')
df.head()

Unnamed: 0,Job Title,Company Name,Rating,Experience,Salary,Location,Skills
0,Software Developer Trainee/Intern,Microcosmworks Llp,3.8,0-2 Yrs,Not disclosed,New Delhi,"Node.Js, React Native, Python Development, Rea..."
1,"Senior Software Developer (C#.net, SQL, ERP So...",Index Infotech India,4.5,4-5 Yrs,Not disclosed,Pune,"SAN, SAP, SAP CRM, SSRS, Data structures"
2,Software Developer - Java,Interface Infosoft Solutions,3.6,7-12 Yrs,Not disclosed,"Nagpur, Roorkee","Hibernate, jQuery, Db2, XML, JSP"
3,Senior Developer Department: Software,Tecknotrove,3.5,3-8 Yrs,Not disclosed,Mumbai,"Computer science, Mining, C++, GIT, Simulation"
4,Software Developer,Siemens,4.1,0-8 Yrs,Not disclosed,Bengaluru,"Automation, Networking protocols, Debugging, C..."


In [3]:
def preprocess_job_title(title):
    
    # Replace '-', '/', '@' ':' with spaces
    title = re.sub(r'[-/@:]', ' ', title)
    
    # Remove anything inside parentheses or square brackets, including the brackets
    title = re.sub(r'[\[\(].*?[\]\)]', '', title)
    
    # Remove extra spaces
    title = re.sub(r'\s+', ' ', title).strip()
    
    # Convert to lowercase
    title = title.lower()
    
    # Limit to 5    
    words = title.split()
    title = ' '.join(words[:5])
    
    # Capitalize the first letter of each word and make the rest lowercase
    words = [word.capitalize() for word in words]
    
    # Join the words back into a single string
    title = ' '.join(words)
    
    return title

In [4]:
df['Job Title'] = df['Job Title'].apply(preprocess_job_title)
df.head()

Unnamed: 0,Job Title,Company Name,Rating,Experience,Salary,Location,Skills
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,0-2 Yrs,Not disclosed,New Delhi,"Node.Js, React Native, Python Development, Rea..."
1,Senior Software Developer,Index Infotech India,4.5,4-5 Yrs,Not disclosed,Pune,"SAN, SAP, SAP CRM, SSRS, Data structures"
2,Software Developer Java,Interface Infosoft Solutions,3.6,7-12 Yrs,Not disclosed,"Nagpur, Roorkee","Hibernate, jQuery, Db2, XML, JSP"
3,Senior Developer Department Software,Tecknotrove,3.5,3-8 Yrs,Not disclosed,Mumbai,"Computer science, Mining, C++, GIT, Simulation"
4,Software Developer,Siemens,4.1,0-8 Yrs,Not disclosed,Bengaluru,"Automation, Networking protocols, Debugging, C..."


In [5]:
def process_experience(experience):
    
    # Ensure experience is a string and handle None/NaN
    if not isinstance(experience, str):
        return 0, 0
    
    # Regex to check if the experience contains a range (e.g., 2-4 yrs)
    match = re.match(r'(\d+)-(\d+)', experience)
    if match:
        # Extract the min and max experience from the range
        min_exp = int(match.group(1))
        max_exp = int(match.group(2))
        return min_exp, max_exp
    else:
        # Check for single experience value (e.g., 3 yrs)
        match = re.match(r'(\d+)', experience)
        if match:
            # If a single value, set both min and max to that value
            exp = int(match.group(1))
            return exp, exp
        else:
            # If experience is empty or invalid, return (0, 0)
            return 0, 0

In [6]:
# Apply the function to the 'Experience' column to create 'Min Experience' and 'Max Experience' columns
df[['Min Experience', 'Max Experience']] = df['Experience'].apply(lambda x: pd.Series(process_experience(x)))

# Drop the original experience column
df = df.drop(columns=['Experience'])

df

Unnamed: 0,Job Title,Company Name,Rating,Salary,Location,Skills,Min Experience,Max Experience
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,Not disclosed,New Delhi,"Node.Js, React Native, Python Development, Rea...",0,2
1,Senior Software Developer,Index Infotech India,4.5,Not disclosed,Pune,"SAN, SAP, SAP CRM, SSRS, Data structures",4,5
2,Software Developer Java,Interface Infosoft Solutions,3.6,Not disclosed,"Nagpur, Roorkee","Hibernate, jQuery, Db2, XML, JSP",7,12
3,Senior Developer Department Software,Tecknotrove,3.5,Not disclosed,Mumbai,"Computer science, Mining, C++, GIT, Simulation",3,8
4,Software Developer,Siemens,4.1,Not disclosed,Bengaluru,"Automation, Networking protocols, Debugging, C...",0,8
...,...,...,...,...,...,...,...,...
1995,Junior Software Developer,Newgen Payment Gateway Pvt. Ltd.,,Not disclosed,Delhi,"Hibernate, XML, MySQL, Debugging, Struts",2,5
1996,Yash Technologies Is Hiring Sr. Software Engin...,Yash Technologies,3.8,Not disclosed,Hybrid - Pune,"Linux, Shell Scripting, Openshift, Openshift D...",4,6
1997,Senior Software Engineer Fullstack Developer,FactSet,4.0,Not disclosed,Hyderabad,"HTML;Javascript, GIT, Coding, Analytical, Agil...",5,10
1998,Senior Application Software Engineer,Oceaneering International,4.1,Not disclosed,Chandigarh,"C++, CPP, CPLUSPLUS, Application, Senior",3,8


In [7]:
def process_salary(salary):
    # Check for missing or undisclosed salary and return NaN
    if not isinstance(salary, str) or 'Not disclosed' in salary.lower():
        return np.nan
    
    # Check for a salary range (e.g., '3-5 Lacs PA')
    match = re.match(r'(\d+)-(\d+)', salary)
    if match:
        # Calculate the average and convert to integer (in lakhs, so multiply by 100,000)
        min_salary = int(match.group(1))
        max_salary = int(match.group(2))
        avg_salary = (min_salary + max_salary) / 2 * 100000
        return avg_salary
    else:
        # Check for a single salary value (e.g., '7 Lacs PA')
        match = re.match(r'(\d+)', salary)
        if match:
            # Convert to integer (in lakhs, so multiply by 100,000)
            avg_salary = int(match.group(1)) * 100000
            return avg_salary
        else:
            # Return NaN if the format is unexpected
            return np.nan

In [8]:
# Apply the function to create the Avg Salary column
df['Avg Salary'] = df['Salary'].apply(process_salary)

# Drop the original salary column
df = df.drop(columns=['Salary'])
df

Unnamed: 0,Job Title,Company Name,Rating,Location,Skills,Min Experience,Max Experience,Avg Salary
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,New Delhi,"Node.Js, React Native, Python Development, Rea...",0,2,
1,Senior Software Developer,Index Infotech India,4.5,Pune,"SAN, SAP, SAP CRM, SSRS, Data structures",4,5,
2,Software Developer Java,Interface Infosoft Solutions,3.6,"Nagpur, Roorkee","Hibernate, jQuery, Db2, XML, JSP",7,12,
3,Senior Developer Department Software,Tecknotrove,3.5,Mumbai,"Computer science, Mining, C++, GIT, Simulation",3,8,
4,Software Developer,Siemens,4.1,Bengaluru,"Automation, Networking protocols, Debugging, C...",0,8,
...,...,...,...,...,...,...,...,...
1995,Junior Software Developer,Newgen Payment Gateway Pvt. Ltd.,,Delhi,"Hibernate, XML, MySQL, Debugging, Struts",2,5,
1996,Yash Technologies Is Hiring Sr. Software Engin...,Yash Technologies,3.8,Hybrid - Pune,"Linux, Shell Scripting, Openshift, Openshift D...",4,6,
1997,Senior Software Engineer Fullstack Developer,FactSet,4.0,Hyderabad,"HTML;Javascript, GIT, Coding, Analytical, Agil...",5,10,
1998,Senior Application Software Engineer,Oceaneering International,4.1,Chandigarh,"C++, CPP, CPLUSPLUS, Application, Senior",3,8,


In [9]:
def process_job_location(location):
    
    # Ensure location is a string; if not, return NaN
    if not isinstance(location, str) or location.strip() == '':
        return np.nan
    
    # Split location by space, comma, slash, or hyphen and get the first part
    first_location = re.split(r'[,\-\/]+', location.strip())[0]
    
    # Capitalize the first letter of each word and make the rest lowercase
    formated_location = ' '.join(word.capitalize() for word in first_location.strip().split())
    
    return formated_location

In [10]:
df['Location'] = df['Location'].apply(process_job_location)
df

Unnamed: 0,Job Title,Company Name,Rating,Location,Skills,Min Experience,Max Experience,Avg Salary
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,New Delhi,"Node.Js, React Native, Python Development, Rea...",0,2,
1,Senior Software Developer,Index Infotech India,4.5,Pune,"SAN, SAP, SAP CRM, SSRS, Data structures",4,5,
2,Software Developer Java,Interface Infosoft Solutions,3.6,Nagpur,"Hibernate, jQuery, Db2, XML, JSP",7,12,
3,Senior Developer Department Software,Tecknotrove,3.5,Mumbai,"Computer science, Mining, C++, GIT, Simulation",3,8,
4,Software Developer,Siemens,4.1,Bengaluru,"Automation, Networking protocols, Debugging, C...",0,8,
...,...,...,...,...,...,...,...,...
1995,Junior Software Developer,Newgen Payment Gateway Pvt. Ltd.,,Delhi,"Hibernate, XML, MySQL, Debugging, Struts",2,5,
1996,Yash Technologies Is Hiring Sr. Software Engin...,Yash Technologies,3.8,Hybrid,"Linux, Shell Scripting, Openshift, Openshift D...",4,6,
1997,Senior Software Engineer Fullstack Developer,FactSet,4.0,Hyderabad,"HTML;Javascript, GIT, Coding, Analytical, Agil...",5,10,
1998,Senior Application Software Engineer,Oceaneering International,4.1,Chandigarh,"C++, CPP, CPLUSPLUS, Application, Senior",3,8,


In [11]:
def process_skills(skills):
    
    # Ensure skills is a string; if not, return NaN
    if not isinstance(skills, str):
        return np.nan
    
    # Remove extra spaces and convert to lowercase
    formated_skills = ' '.join(skills.split()).lower()
    
    return formated_skills

In [12]:
df['Skills'] = df['Skills'].apply(process_skills)
df

Unnamed: 0,Job Title,Company Name,Rating,Location,Skills,Min Experience,Max Experience,Avg Salary
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,New Delhi,"node.js, react native, python development, rea...",0,2,
1,Senior Software Developer,Index Infotech India,4.5,Pune,"san, sap, sap crm, ssrs, data structures",4,5,
2,Software Developer Java,Interface Infosoft Solutions,3.6,Nagpur,"hibernate, jquery, db2, xml, jsp",7,12,
3,Senior Developer Department Software,Tecknotrove,3.5,Mumbai,"computer science, mining, c++, git, simulation",3,8,
4,Software Developer,Siemens,4.1,Bengaluru,"automation, networking protocols, debugging, c...",0,8,
...,...,...,...,...,...,...,...,...
1995,Junior Software Developer,Newgen Payment Gateway Pvt. Ltd.,,Delhi,"hibernate, xml, mysql, debugging, struts",2,5,
1996,Yash Technologies Is Hiring Sr. Software Engin...,Yash Technologies,3.8,Hybrid,"linux, shell scripting, openshift, openshift d...",4,6,
1997,Senior Software Engineer Fullstack Developer,FactSet,4.0,Hyderabad,"html;javascript, git, coding, analytical, agil...",5,10,
1998,Senior Application Software Engineer,Oceaneering International,4.1,Chandigarh,"c++, cpp, cplusplus, application, senior",3,8,


In [13]:
# Specify the desired column order
column_order = ['Job Title', 'Company Name', 'Rating', 'Location', 'Min Experience', 'Max Experience','Skills']

# Reorder the DataFrame columns
df = df[column_order]
df

Unnamed: 0,Job Title,Company Name,Rating,Location,Min Experience,Max Experience,Skills
0,Software Developer Trainee Intern,Microcosmworks Llp,3.8,New Delhi,0,2,"node.js, react native, python development, rea..."
1,Senior Software Developer,Index Infotech India,4.5,Pune,4,5,"san, sap, sap crm, ssrs, data structures"
2,Software Developer Java,Interface Infosoft Solutions,3.6,Nagpur,7,12,"hibernate, jquery, db2, xml, jsp"
3,Senior Developer Department Software,Tecknotrove,3.5,Mumbai,3,8,"computer science, mining, c++, git, simulation"
4,Software Developer,Siemens,4.1,Bengaluru,0,8,"automation, networking protocols, debugging, c..."
...,...,...,...,...,...,...,...
1995,Junior Software Developer,Newgen Payment Gateway Pvt. Ltd.,,Delhi,2,5,"hibernate, xml, mysql, debugging, struts"
1996,Yash Technologies Is Hiring Sr. Software Engin...,Yash Technologies,3.8,Hybrid,4,6,"linux, shell scripting, openshift, openshift d..."
1997,Senior Software Engineer Fullstack Developer,FactSet,4.0,Hyderabad,5,10,"html;javascript, git, coding, analytical, agil..."
1998,Senior Application Software Engineer,Oceaneering International,4.1,Chandigarh,3,8,"c++, cpp, cplusplus, application, senior"


In [14]:
df.to_csv('naukri_jobs_pp.csv', index=False)