# Web Scraping [in.indeed.com](https://in.indeed.com/)

## Importing Required Libraries

In [None]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time

## Testing Code

In [None]:
# Fetch a single results page (query "data analyst", location "Delhi") to
# develop the extractor functions against.
URL = 'https://in.indeed.com/jobs?q=data+analyst&l=Delhi'
# Without a timeout, requests waits indefinitely if the server never answers.
page = requests.get(URL, timeout=30)
soup = BeautifulSoup(page.text, 'html.parser')
# Preview only the beginning of the document: the full page is dominated by
# inline scripts and would bury the rest of the notebook.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script id="polyfill-script-bundle">
   /* Disable minification (remove `.min` from URL path) for more info */

(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError();return e}function SameValueNonNumber(e,n){return e===n}funct

In [None]:
def extract_job_title_from_result(soup):
    """Collect the ``title`` attribute of every job-title link on a results page.

    Args:
        soup: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object in
            this notebook).

    Returns:
        list: One entry per ``<a data-tn-element="jobTitle">`` found inside a
        ``<div class="row">``, in document order.
    """
    result_rows = soup.find_all(name='div', attrs={'class': 'row'})
    return [
        anchor['title']
        for row in result_rows
        for anchor in row.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})
    ]

# Try the extractor on the Delhi results page parsed into `soup`.
extract_job_title_from_result(soup)

['Fraud Analytics, Universal Data Analyst',
 'Risk Advisory - Analyst - A&IC -Internal Audit',
 'Analyst, Reporting and Data',
 'Data Analyst',
 'Lead Analyst / process Lead - Data Management',
 'Senior Data Analyst',
 'Senior Data Analyst',
 'Data Analyst (Quantum)',
 'Data Analyst (Dimension, DP-QA)',
 'Business Analyst, Insurance Analyst, Data Analyst',
 'Data Analyst',
 'Analyst - Data Engineer',
 'DATA ANALYST',
 'Data Analysis & Statistical Modelling Internship',
 'Senior Data Analyst']

In [None]:
def extract_company_from_result(soup):
    """Collect the company name shown for each job posting on a results page.

    Args:
        soup: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object in
            this notebook).

    Returns:
        list: Whitespace-stripped company names in document order.
    """
    names = []
    for row in soup.find_all(name='div', attrs={'class': 'row'}):
        # Prefer the dedicated company span; the link-source span is only
        # consulted when the row has no company span at all.
        spans = (
            row.find_all(name='span', attrs={'class': 'company'})
            or row.find_all(name='span', attrs={'class': 'result-link-source'})
        )
        names.extend(span.text.strip() for span in spans)
    return names
 
# Try the extractor on the Delhi results page parsed into `soup`.
extract_company_from_result(soup)

['NatWest Group',
 'Deloitte',
 'Rockwell Automation',
 'Mind Works Global',
 'TechnipFMC',
 'The Boston Consulting Group',
 'ChargePoint',
 'E2E Research Pvt. Ltd.',
 'E2E Research Pvt. Ltd.',
 'Winning Minds LLP',
 'RELX Group',
 'The Boston Consulting Group',
 'Knowledge Excel Services.',
 'Chainlink Technology Private Limited',
 'SourceGain Consulting Pvt Ltd']

In [None]:
def extract_location_from_result(soup):
    """Collect the text of every ``<span class="location">`` on a results page.

    Args:
        soup: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object in
            this notebook).

    Returns:
        list: Location strings in document order, exactly as they appear in
        the markup (no whitespace stripping is applied).
    """
    # `find_all` is the current Beautiful Soup spelling and matches the other
    # extractors in this notebook; `findAll` is a deprecated alias.
    return [span.text for span in soup.find_all('span', attrs={'class': 'location'})]

# Try the extractor on the Delhi results page parsed into `soup`.
extract_location_from_result(soup)

['Delhi, Delhi',
 'Delhi, Delhi',
 'New Delhi, Delhi',
 'Delhi, Delhi',
 'New Delhi, Delhi',
 'New Delhi, Delhi',
 'Delhi',
 'New Delhi, Delhi',
 'New Delhi, Delhi',
 'Delhi, Delhi',
 'New Delhi, Delhi',
 'New Delhi, Delhi',
 'Delhi, Delhi',
 'Delhi, Delhi',
 'Delhi, Delhi']

In [None]:
def extract_salary_from_result(soup):
    """Collect one salary string per job posting on a results page.

    For every ``<div class="row">`` the lookup order is:

    1. the text of the first ``<nobr>`` tag, returned unstripped;
    2. otherwise the stripped text of the first ``<div>`` nested inside
       ``<div class="sjcl">``;
    3. otherwise the placeholder ``'Nothing_found'``.

    NOTE(review): in the output recorded in this notebook the second lookup
    yields company names and ratings rather than salaries, so the ``sjcl``
    selector probably needs revisiting.

    Args:
        soup: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object in
            this notebook).

    Returns:
        list: Exactly one string per result row, in document order.
    """
    salaries = []
    for row in soup.find_all(name='div', attrs={'class': 'row'}):
        # Explicit None checks replace the former bare `except:` clauses, which
        # also swallowed unrelated errors (including KeyboardInterrupt).
        nobr = row.find('nobr')
        if nobr is not None:
            salaries.append(nobr.text)
            continue
        container = row.find(name='div', attrs={'class': 'sjcl'})
        inner = container.find('div') if container is not None else None
        salaries.append(inner.text.strip() if inner is not None else 'Nothing_found')
    return salaries

# NOTE(review): the output recorded for this call lists company names and
# ratings rather than salaries — the 'sjcl' fallback appears to pick up the
# company block on this page; verify the selector.
extract_salary_from_result(soup)

['NatWest Group\n\n\n\n3.3',
 'Deloitte\n\n\n\n4.0',
 'Rockwell Automation\n\n\n\n3.9',
 'Mind Works Global',
 'TechnipFMC\n\n\n\n4.0',
 'The Boston Consulting Group\n\n\n\n4.2',
 'ChargePoint\n\n\n\n3.5',
 'E2E Research Pvt. Ltd.',
 'E2E Research Pvt. Ltd.',
 'Winning Minds LLP',
 'RELX Group\n\n\n\n4.1',
 'The Boston Consulting Group\n\n\n\n4.2',
 'Knowledge Excel Services.',
 'Chainlink Technology Private Limited',
 'SourceGain Consulting Pvt Ltd']

In [None]:
def extract_summary_from_result(soup):
    """Collect the stripped text of every ``<span class="summary">`` on a results page.

    NOTE(review): the output recorded in this notebook is an empty list, so
    this selector appears not to match the fetched page's markup — verify.

    Args:
        soup: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object in
            this notebook).

    Returns:
        list: Whitespace-stripped summary strings in document order.
    """
    # `find_all` is the current Beautiful Soup spelling and matches the other
    # extractors in this notebook; `findAll` is a deprecated alias.
    return [span.text.strip() for span in soup.find_all('span', attrs={'class': 'summary'})]

# NOTE(review): the output recorded for this call is an empty list, so the
# 'summary' span class appears not to exist on this page; verify the selector.
extract_summary_from_result(soup)

[]

## Main Code [Compiled]

In [None]:
# Upper bound (exclusive) on the `start` offset requested for each state.
max_results_per_state = 50
# Location values for the `l=` query parameter. They are concatenated straight
# into the URL, so spaces are written as '+' and no entry may carry stray
# whitespace (previously "Arunachal+Pradesh " and "Uttar Pradesh" did).
states = ["Andhra+Pradesh", "Arunachal+Pradesh", "Assam", "Bihar",
          "Chhattisgarh", "Goa", "Gujarat", "Haryana", "Himachal+Pradesh",
          "Jammu+and+Kashmir", "Jharkhand", "Karnataka", "Kerala",
          "Madhya+Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram",
          "Nagaland", "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil+Nadu",
          "Telangana", "Tripura", "Uttar+Pradesh", "Uttarakhand", "West+Bengal",
          "Chandigarh", "Delhi", "Puducherry"]
# Column layout of the scraped table.
col = ['state', 'job_title', 'company_name', 'location', 'summary', 'salary']
# Empty frame with the target columns, to be filled by the scraping loop.
sample_df = pd.DataFrame(columns=col)

In [None]:
def _first_text(tag, name, css_class):
    """Return the text of the first `name` tag with class `css_class` under `tag`, or None."""
    match = tag.find(name, attrs={'class': css_class})
    return match.text if match is not None else None


def _parse_job_row(div, state):
    """Turn one result `<div class="row">` into a dict holding exactly one value per column.

    Fields whose tag is absent become None (salary keeps the 'Nothing_found'
    placeholder), so every record lines up with the table's columns even when
    a selector matches nothing.
    """
    title_link = div.find('a', attrs={'data-tn-element': 'jobTitle'})

    # Company: dedicated span first, link-source span as fallback.
    company = _first_text(div, 'span', 'company')
    if company is None:
        company = _first_text(div, 'span', 'result-link-source')

    summary = _first_text(div, 'span', 'summary')

    # Salary: <nobr> text if present, otherwise the first <div> nested inside
    # <div class="sjcl">, otherwise a placeholder.
    nobr = div.find('nobr')
    if nobr is not None:
        salary = nobr.text
    else:
        sjcl = div.find('div', attrs={'class': 'sjcl'})
        inner = sjcl.find('div') if sjcl is not None else None
        salary = inner.text.strip() if inner is not None else 'Nothing_found'

    return {
        'state': state,
        'job_title': title_link.get('title') if title_link is not None else None,
        'company_name': company.strip() if company is not None else None,
        'location': _first_text(div, 'span', 'location'),
        'summary': summary.strip() if summary is not None else None,
        'salary': salary,
    }


# Collect plain records first and build the frame once at the end; assigning
# rows one at a time via `.loc` is slow and fails as soon as a posting yields
# more or fewer values than there are columns.
job_rows = []
for state in states:
    for start in range(0, max_results_per_state, 10):
        # A timeout keeps a single unresponsive request from stalling the whole crawl.
        page = requests.get(
            'https://in.indeed.com/jobs?q=data+analyst&l=' + str(state) + '&start=' + str(start),
            timeout=30,
        )
        soup = BeautifulSoup(page.text, 'html.parser')
        job_rows.extend(
            _parse_job_row(div, state)
            for div in soup.find_all(name='div', attrs={'class': 'row'})
        )

# Rebuilding the frame from scratch makes the cell safe to re-run.
sample_df = pd.DataFrame(job_rows, columns=col)
# Keep the 1-based row labels that the row-by-row version produced.
sample_df.index = pd.RangeIndex(1, len(sample_df) + 1)
sample_df.head()

Unnamed: 0,state,job_title,company_name,location,summary,salary
