In [1]:
#Import Libraries
from bs4 import BeautifulSoup
from html2text import html2text
from tqdm import tqdm_notebook
import pandas as pd
import pprint as pp
import requests
import time

In [10]:
#Extract Job URL Function
def extract_job_urls_from_result(soup):
    """Collect the job-page URLs linked from one parsed search-results page.

    Every ``<a data-tn-element="jobTitle">`` found inside a ``<div class="row">``
    contributes one URL, in document order.

    Args:
        soup: Parsed results page; only its ``find_all`` method is used.

    Returns:
        A list of URL strings. Each one is ``https://www.indeed.com/viewjob``
        followed by the anchor's ``href`` with its first 7 characters dropped
        (presumably a ``/rc/clk`` prefix -- TODO confirm against live markup).
    """
    viewjob_prefix = "https://www.indeed.com/viewjob"
    return [
        viewjob_prefix + anchor['href'][7:]
        for row in soup.find_all(name = "div", attrs = {"class":"row"})
        for anchor in row.find_all(name = "a", attrs = {"data-tn-element":"jobTitle"})
    ]

In [11]:
#Extract Job Title & Description
def extract_text_from_jobURL(url):
    """Download one job posting and return its cleaned title and description.

    Args:
        url: Address of the job posting page.

    Returns:
        A ``(title, description)`` tuple of strings, or an empty list when
        the page lacks the expected markup: the ``<b class="jobtitle">``
        element, the ``job-content`` table or the ``job_summary`` span.
        Callers compare the result against ``[]`` to skip such pages.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within the timeout.
    """
    #Bound the wait so that one stalled connection cannot hang the whole scrape
    page = requests.get(url, timeout = 30)
    soup = BeautifulSoup(page.text, "html.parser")

    #Retrieve the job title
    b = soup.find('b','jobtitle')
    if b is None:
        return([])

    #Retrieve the job description; either container may be absent on pages
    #with a different layout, in which case the page is reported as unusable
    table = soup.find('table', id = 'job-content')
    span = table.find('span', id = 'job_summary') if table is not None else None
    if span is None:
        return([])

    #Some posts have no inner div, so fall back to the span itself
    div = span.find('div')
    if div is None:
        div = span

    #Cleaning: strip surrounding whitespace and '*' from the title; drop
    #'*' and '#' from the description and collapse every run of whitespace
    #(newlines included) into a single space
    title = html2text(str(b)).strip().replace('*', '')
    clean = ' '.join(html2text(str(div)).replace('*','').replace('#', '').split())

    return(title, clean)

In [12]:
#Import the Job Title Dataset
#Forward slashes are used because they resolve on Windows and POSIX alike;
#backslashes only work on Windows and form invalid escape sequences
#(\. \D \J) inside a normal string literal
job_titles = pd.read_csv("../../Datasets/Job_Titles.csv", sep = ",")

#Create job_titles list from the Job_titles column
job_list = list(job_titles.Job_titles)

In [13]:
#Create Listings URLs - a list of (query_job, url) tuples,
#one per (city, job title) pair with the cities as the outer iteration

#Cities to search in, with spaces already written as '+'
city_list = ['New+York', 'London']

#Each URL carries the job title (spaces turned into '+') immediately followed
#by the percent-encoded text "$20,000", then the city and limit=20.
#No &start= offset is added, so only one results page per pair is requested.
listing_urls_list = [
    (
        job,
        "".join([
            "http://www.indeed.com/jobs?q=",
            job.replace(' ', '+'),
            "%2420%2C000&l=",
            str(city),
            "&limit=20",
        ]),
    )
    for city in city_list
    for job in job_list
]

In [15]:
#Main loop
#Walks the listing URLs, scrapes every job linked from each results page and
#stores one row per job in df, stopping once max_items jobs were collected.

#Initialize the dataframe (rebuilt from `records` once scraping ends)
columns = ["ID", "Query", "Job Title", "Description"]
df = pd.DataFrame(columns = columns)

#Rows are gathered in a plain list and converted to a dataframe in one go,
#instead of inserting into the dataframe one row at a time
records = []

n = 0
max_items = 1500 #30000 to get all

#Visited urls set - listing URLs that already produced at least one job
visited = set()
check_visited = True

#Jupyter Progress Bar init
pbar = tqdm_notebook(total = max_items, desc = 'Loading Jobs')

try:
    #Outer loop - go over the query results (many jobs in each)
    for query_job, url in listing_urls_list:

        #All requested items downloaded, stop
        if n >= max_items:
            break

        if check_visited and url in visited:
            continue

        #Conducting a request of the stated Job Listings URL above;
        #the timeout keeps a stalled connection from hanging the loop
        page = requests.get(url, timeout = 30)

        #Specifying a desired format of "page" using the html parser
        soup = BeautifulSoup(page.text, "html.parser")

        #Inner loop - retrieve the title and the description for each job
        #found on this listing (an empty result simply skips the loop)
        for item in extract_job_urls_from_result(soup):

            #Extract job info (title and description); [] means unusable page
            job_info = extract_text_from_jobURL(item)

            #Add a slight delay to avoid getting blocked
            #time.sleep(0.2)

            #Add it as a new row
            if job_info:
                n += 1
                if check_visited:
                    visited.add(url)
                pbar.update(1) #update progress bar
                records.append([n, query_job, job_info[0], job_info[1]])

            if n >= max_items:
                break
finally:
    #Always release the progress bar and keep whatever was scraped so far,
    #whether the loop hit max_items, ran out of listings or raised
    pbar.close()
    df = pd.DataFrame(records, columns = columns)

HBox(children=(IntProgress(value=0, description='Loading Jobs', max=1500), HTML(value='')))

In [16]:
#Write the scraped rows held in df to a csv file, leaving out the dataframe index
df.to_csv("../Datasets/results.csv", index = False)