In [11]:
import time
from urllib.parse import quote_plus

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [12]:
# Parameters for the Indeed scraper.
jobTitle = '"Software Engineer"' # Double quotes inside the single quotes request an exact match on the title.
location = 'United States'
maxJobs = 200 # makeURLs builds maxJobs / 10 page URLs (remainder truncated), so use a multiple of 10.
# salaryEstimate is disabled for now because so many jobs don't list a salary;
# results can be filtered by salary after the data is collected.
# salaryEstimate = "50" # This number represents thousands of dollars. i.e. 50 = $50,000

# Target URL Format: https://www.indeed.com/jobs?q=Data+Scientist&l=New+York

def makeURLs(jobTitle, location, maxJobs):
    """Build the Indeed search-result page URLs for a job title and location.

    One URL is produced per block of ten jobs. The first page has no
    ``start`` parameter; every following page adds ``&start=<10 * page>``.

    Parameters
    ----------
    jobTitle : str
        Plain-text search query (not pre-encoded). Wrap it in double quotes
        for an exact-title match.
    location : str
        Plain-text location (not pre-encoded).
    maxJobs : int
        Upper bound on the number of jobs to cover. It is divided by ten and
        truncated, so 25 yields two pages; values below ten yield none.

    Returns
    -------
    list of str
        The search-result page URLs, in page order.
    """
    # Whole pages only; a negative request simply yields no pages.
    numPages = max(int(maxJobs) // 10, 0)

    root = 'http://www.indeed.com/jobs?q='

    # quote_plus turns spaces into '+' AND percent-encodes characters such as
    # '&', '#', '+' and '"' that would otherwise corrupt the query string
    # (e.g. "C++", "C#", "R&D").
    query = quote_plus(jobTitle) + '&l=' + quote_plus(location)

    URLs = []
    for pageNum in range(numPages):
        seedURL = root + query
        if pageNum > 0:
            seedURL += '&start=' + str(pageNum * 10)
        URLs.append(seedURL)

    return URLs

# Build the list of search-result page URLs from the parameters defined above.
seedURLs = makeURLs(jobTitle, location, maxJobs)

In [15]:
#Get URL for every job listed on the page. We'll remove duplicates later.

# Get list of links to Listing
def extract_listing_url_from_seed(URLs, timeout=10, delay=1):
    """Collect the job-listing links found on each search-result page.

    Every seed URL is printed, fetched and parsed. Inside each
    ``<div class="row">`` the ``href`` of every
    ``<a data-tn-element="jobTitle">`` is collected, except links containing
    ``/pagead/`` or ``/company/``.

    Parameters
    ----------
    URLs : iterable of str
        Search-result page URLs to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP response, so that a stalled connection
        cannot hang the whole run.
    delay : float, optional
        Seconds to pause after each request.

    Returns
    -------
    list of str
        The collected ``href`` values exactly as they appear in the page, in
        page order. Duplicates are not removed.
    """
    listingURLs = []

    for seedURL in URLs:
        print(seedURL)

        # Get HTML for each seed URL to extract links to job listing pages.
        page = requests.get(seedURL, timeout=timeout)
        time.sleep(delay)  # Pause between requests.

        # An error or block page contains no listings; report it and move on
        # rather than silently parsing it as if it were a result page.
        if not page.ok:
            print('Skipping (HTTP ' + str(page.status_code) + '): ' + seedURL)
            continue

        soup = BeautifulSoup(page.text, 'html.parser')

        # Keep every URL linked to by a job title, dropping the unwanted ones.
        for div in soup.find_all(name='div', attrs={'class': 'row'}):
            for a in div.find_all(name='a', attrs={'data-tn-element': 'jobTitle'}):
                href = a.get('href')  # None when the anchor has no href
                if not href:
                    continue
                if '/pagead/' in href or '/company/' in href:
                    continue
                listingURLs.append(href)

    return listingURLs

# Widen pandas' display limits so the long listing URLs stay readable.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 500)

# Scrape the listing links from every search-result page.
finalURLs = extract_listing_url_from_seed(seedURLs)

# Prefix each scraped link with the site root.
listingURLs = ['http://www.indeed.com' + path for path in finalURLs]

# Start the results table with a single column holding the listing URLs.
listing_urls = pd.Series(listingURLs)
df = pd.DataFrame()
df['listing_url'] = listing_urls.values

df.head()

http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=10
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=20
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=30
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=40
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=50
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=60
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=70
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=80
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=90
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=100
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=110
http://www.indeed.com/jobs?q="Software+Engineer"&l=United+States&start=120
http://www.indeed.com/jobs?q="Software+Engin

Unnamed: 0,listing_url
0,http://www.indeed.com/rc/clk?jk=acec0008c6542da0&fccid=5a31ef999afce663&vjs=3
1,http://www.indeed.com/rc/clk?jk=f68563da3f0597a9&fccid=e9e363d6332bc1cd&vjs=3
2,http://www.indeed.com/rc/clk?jk=4b9111710c4bc473&fccid=aeb15e43a6800b9d&vjs=3
3,http://www.indeed.com/rc/clk?jk=937b1f1f803a9870&fccid=4db2ddf023fb6496&vjs=3
4,http://www.indeed.com/rc/clk?jk=1a61b0a758b3ecf9&fccid=53c1573dc4db9cab&vjs=3


In [16]:
# (rows, columns) of df: one row per collected listing URL.
df.shape

(183, 1)

In [19]:
def _first_text(tags):
    """Return the text of the first tag in ``tags``, or None if there is none."""
    for tag in tags:
        return tag.text
    return None


def _scrape_listing(listing_url, timeout=10):
    """Fetch one listing page and return its details as a dict.

    The returned dict always has the keys ``job_title``, ``company``,
    ``location`` and ``summary``. A field whose element is missing from the
    page is None, so every listing yields exactly one value per column.
    """
    page = requests.get(listing_url, timeout=timeout)
    time.sleep(1)  # One second pause between requests.
    soup = BeautifulSoup(page.text, 'html.parser')

    # Company name lives in a <span class="company"> inside the job header.
    company_spans = [
        span
        for div in soup.find_all(name='div', attrs={'data-tn-component': 'jobHeader'})
        for span in div.find_all(name='span', attrs={'class': 'company'})
    ]

    # Location is read from the value of the page's "where" search box.
    where_input = soup.find('input', {'id': 'where'})

    return {
        'job_title': _first_text(soup.find_all(name='b', attrs={'class': 'jobtitle'})),
        'company': _first_text(company_spans),
        'location': where_input.get('value') if where_input is not None else None,
        'summary': _first_text(soup.find_all(name='span', attrs={'class': 'summary'})),
    }


# Scrape exactly one record per row of df, in row order, using the absolute
# URLs already stored in the listing_url column. This keeps every new column
# aligned with its URL even when a page lacks (or repeats) an element, and it
# does not rebind the global `location` search parameter.
listing_records = [_scrape_listing(listing_url) for listing_url in df['listing_url']]
listing_details = pd.DataFrame(
    listing_records, columns=['job_title', 'company', 'location', 'summary']
)

# Series kept under their original names for any later cells that use them.
job_titles = listing_details['job_title']
companies = listing_details['company']
locations = listing_details['location']
summaries = listing_details['summary']

# Assign column titles and set values
df['job_title'] = job_titles.values
df['company'] = companies.values
df['location'] = locations.values
df['summary'] = summaries.values

# Export to CSV
df.to_csv('SE_US_small.csv')

# Last expression of the cell, so the preview is actually displayed.
df.head(20)

In [20]:
# Preview the first rows of df, now including the scraped detail columns.
df.head()

Unnamed: 0,listing_url,job_title,company,location,summary
0,http://www.indeed.com/rc/clk?jk=acec0008c6542da0&fccid=5a31ef999afce663&vjs=3,Full Stack Software Engineer,Nine Summer,Connecticut,This position is on the product team responsible for a leading SaaS product that is used by orga...
1,http://www.indeed.com/rc/clk?jk=f68563da3f0597a9&fccid=e9e363d6332bc1cd&vjs=3,Software Engineer 1,Juniper Networks,"Sunnyvale, CA",Software Engineer—Backend\nDid you spend four or more years working hard towards your education ...
2,http://www.indeed.com/rc/clk?jk=4b9111710c4bc473&fccid=aeb15e43a6800b9d&vjs=3,Software Engineer Asc,Lockheed Martin,Connecticut,This Lockheed Martin Enterprise Business Services (EBS) Software Engineer position is a Java / C...
3,http://www.indeed.com/rc/clk?jk=937b1f1f803a9870&fccid=4db2ddf023fb6496&vjs=3,Senior Software Engineer,Rebric,"Denver, CO",Rebric is looking for a Senior Software Engineer to help shape and build a team that embraces a ...
4,http://www.indeed.com/rc/clk?jk=1a61b0a758b3ecf9&fccid=53c1573dc4db9cab&vjs=3,Software Engineer,General Electric,"West Melbourne, FL","About Us:\nGE is the world's Digital Industrial Company, transforming industry with software-def..."
