# Name: Sandipto Sanyal
# PGID: 12010004

# Requirements
This notebook uses the ChromeDriver build for Chrome version 83.<br>
For other Chrome versions, download the matching driver from: https://chromedriver.chromium.org/downloads

# Packages required
Uncomment and run the cell below if the following packages are not already installed.

In [1]:
#!pip install selenium
#!pip install beautifulsoup4
#!pip install pandas

# Import libraries

In [2]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import bs4
import re
import pandas as pd
from selenium.common import exceptions
import warnings

# User configurable variables
search_criteria, location_criteria and pages_to_scan are user configurable. They denote the job keyword to search for, the job location, and the number of result pages to scan.<br>
To reduce the running time, the number of pages to extract data from is kept at 25.

In [3]:
# Keyword that is typed into the job search box
search_criteria = 'Information Technology'
# Location that is typed into the location search box
location_criteria = 'Hyderabad'
# Number of result pages to scrape
pages_to_scan = 25

# Launch the chrome browser

In [4]:
# Browser flags for the automated Chrome session
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--test-type")
# Path to the ChromeDriver executable, relative to the notebook
locationOfWebdriver = "./chromedriver_win32/chromedriver.exe"

# Hand the options to the driver; without this the flags set above are
# never applied to the launched browser.
driver = webdriver.Chrome(locationOfWebdriver, options=options)

# Visit Naukri.com

In [5]:
# Open the Naukri home page in the automated browser
driver.get('https://www.naukri.com/')
# Fixed pause before the next cell touches the page
# (presumably to let the page finish loading)
time.sleep(5)

# Search for jobs through the search bar

In [6]:
def search_the_job(search_criteria: str, 
                      location_criteria: str):
    '''
    Fill in the keyword and location search boxes of the page currently
    loaded in the module-level driver, submit the search form, and pause
    for a few seconds afterwards.

    search_criteria: text written into the input named "keyword"
    location_criteria: text written into the input named "location"
    '''
    keyword_box = driver.find_element_by_xpath('//input[@name="keyword"]')
    location_box = driver.find_element_by_xpath('//input[@name="location"]')
    # The high-level Selenium API was reported as unable to interact with
    # these inputs, so their values are set through JavaScript instead.
    # The texts are passed as script arguments rather than formatted into
    # the script source, so quotes or backslashes in the search text can
    # neither break nor alter the executed JavaScript.
    js_actions = "arguments[0].value = arguments[2]; arguments[1].value = arguments[3];"
    driver.execute_script(js_actions,
                          keyword_box,
                          location_box,
                          search_criteria,
                          location_criteria
                         )
    keyword_box.submit()
    # Fixed pause after submitting, before the caller continues
    time.sleep(3)

In [7]:
# Run the search with the user-configured keyword and location
search_the_job(search_criteria, location_criteria)

# Go to the next page
Below we define a method to go to the next page

In [8]:
def go_to_next_page():
    '''
    Click the pagination link used as the "next" button and then wait.

    The click is issued through JavaScript on the located element, followed
    by a fixed 10 second pause. Any exception raised while locating or
    clicking the element propagates to the caller.
    '''
    print('going to next page...')
    next_link_xpath = '//a[@class="fright fs14 btn-secondary br2"]'
    driver.execute_script("arguments[0].click()",
                          driver.find_element_by_xpath(next_link_xpath))
    time.sleep(10)

# Loop over the result pages and parse each whole page to extract job-related information
Go to the automated chrome browser to view how selenium is putting the search criteria one by one

## Clean the skills required tag
This is to remove the tag elements of the skills required section of each job to join as strings.

In [9]:
def clean_skills_required(skillset_tag_parent_tag: bs4.element.Tag):
    '''
    Collapse the skill entries found under the given tag into one string.

    skillset_tag_parent_tag: bs4 element, expected to be the
        <ul class="tags has-description"> node of a job
    Returns the text of every matching <li>, joined with commas in the
    order find_all yields them (an empty string when none match).
    '''
    skill_items = skillset_tag_parent_tag.find_all(
        'li', class_='fleft fs12 grey-text lh16 dot'
    )
    return ','.join(item.get_text() for item in skill_items)

## Columns we will be extracting
Since naukri.com provides the following features of a particular job, we are extracting them as it is:<br>
'search_criteria', 'job_title',
'company_name', 
'experience', 
'salary', 
'location', 
'experience_description', 
'skills_required', 
'days_posted'

In [10]:
# One list per output column. Every job appends exactly one value to each
# list, so the columns stay aligned when they are turned into a DataFrame.
data = {'search_criteria':[],
        'job_title':[],
        'company_name':[],
        'experience':[],
        'salary':[],
        'location':[],
        'experience_description':[],
        'skills_required':[],
        'days_posted':[],
       }


def _placeholder_text(job, kind):
    '''Return the text of the <li> placeholder (experience/salary/location) of a job.'''
    summary = job.find('ul', class_='mt-7')
    return summary.find('li', class_='fleft grey-text br2 placeHolderLi ' + kind).span.get_text()


def _value_or_none(getter, job):
    '''
    Return getter(job), or None when a tag on the lookup path is missing.

    BeautifulSoup yields None for a tag it cannot find, so a chained lookup
    on an incomplete job raises AttributeError. Only that error is treated
    as "field not present"; anything else (including Ctrl+C) propagates.
    '''
    try:
        return getter(job)
    except AttributeError:
        return None


# Column name -> function extracting that column from one job <article>
_FIELD_GETTERS = {
    'job_title': lambda job: job.div.div.a.get_text(),
    'company_name': lambda job: job.find('div', class_='mt-7 companyInfo subheading lh16').a.get_text(),
    'experience': lambda job: _placeholder_text(job, 'experience'),
    'salary': lambda job: _placeholder_text(job, 'salary'),
    'location': lambda job: _placeholder_text(job, 'location'),
    'experience_description': lambda job: job.find('div', class_='job-description fs12 grey-text').get_text(),
    'skills_required': lambda job: clean_skills_required(job.find('ul', class_='tags has-description')),
    'days_posted': lambda job: job.find('div', class_='jobTupleFooter mt-20').div.span.get_text(),
}

# How many times to try clicking "next" before giving up on pagination
_MAX_NAVIGATION_ATTEMPTS = 3

for page in range(1, pages_to_scan + 1):
    print('URL scraping: {}'.format(driver.current_url))
    # get the page xml
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # list the jobs
    courses = soup.find_all('article', class_='jobTuple bgWhite br4 mb-8')
    print('Number of jobs found on page {}: {}'.format(page, len(courses)))
    for course in courses:
        data['search_criteria'].append(search_criteria)
        for column, getter in _FIELD_GETTERS.items():
            data[column].append(_value_or_none(getter, course))

    if page == pages_to_scan:
        # The last requested page has been scraped; no need to load another
        break

    # visit next page, retrying a few times on WebDriver errors
    for _attempt in range(_MAX_NAVIGATION_ATTEMPTS):
        try:
            go_to_next_page()
            break
        except exceptions.WebDriverException as e:
            navigation_error = e
    else:
        warnings.warn("Warning........... Couldn\'t navigate to next page: Exception occured: {}".format(navigation_error))
        # Stop here: continuing would scrape the same page again and
        # fill the table with duplicate rows.
        break
print('Search process ended')

URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad
Number of jobs found on page 1: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-2
Number of jobs found on page 2: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-3
Number of jobs found on page 3: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-4
Number of jobs found on page 4: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-5
Number of jobs found on page 5: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-6
Number of jobs found on page 6: 20
going to next page...
URL scraping: https://www.naukri.com/information-technology-jobs-in-hyderabad-secunderabad-7
Number of

In [11]:
# Turn the scraped columns into a table, preview the first rows,
# and save the full table as CSV without the index column.
df = pd.DataFrame(data)
display(df.head(n=5))
df.to_csv('Hyderabad_Sandipto_Sanyal.csv', index=False)

Unnamed: 0,search_criteria,job_title,company_name,experience,salary,location,experience_description,skills_required,days_posted
0,Information Technology,Information Technology (.Net),Cognia Technologies,2-4 Yrs,Not disclosed,Hyderabad,"Information Technology (.Net) B.Tech , BBA , ...","assembly language,Six Sigma,JavaScript,Real Ti...",30+ Days Ago
1,Information Technology,GENERAL MANAGER (INFORMATION TECHNOLOGY),Hyderabad Race Club,10-15 Yrs,Not disclosed,Hyderabad,# Job Code : JOB3090 Job Description Hyderaba...,"INFORMATION TECHNOLOGY,General Manager IT,Debu...",30+ Days Ago
2,Information Technology,GENERAL MANAGER (INFORMATION TECHNOLOGY),Hyderabad Race Club,10-15 Yrs,Not disclosed,Hyderabad,GENERAL MANAGER (INFORMATION TECHNOLOGY) Job ...,"Computer science,management information system...",30+ Days Ago
3,Information Technology,Information Technology,TERRA TECHNOLOGY,1-3 Yrs,Not disclosed,Hyderabad,Knowledge and experience with ticketing tools...,"Unix,Service management,global operations,Linu...",30+ Days Ago
4,Information Technology,Information Technology,TERRA TECHNOLOGY,1-3 Yrs,Not disclosed,Hyderabad,•Knowledge and experience with ticketing tool...,"Unix,Service management,global operations,Data...",30+ Days Ago


# Stop the driver

In [12]:
# Close the automated browser and end the WebDriver session
driver.quit()