# Web Scraping - Indeed.com
General steps for Web Scraping
1. Check whether the website allows web scraping
2. Obtain the source code (HTML File) by using the website URL
3. Download the website content
4. Parse the content using the HTML tags and attributes of the elements of interest
5. Extract relevant data/features
6. Organize raw data in structured format (e.g., CSV)

### Install Firefox, Selenium, Gecko Driver, Beautiful Soup

In [None]:
#Install firefox
!apt-get update
!apt install firefox

#Install selenium
!pip install selenium

#Updating and installing firefox libraries
!apt-get update && apt-get install -y wget bzip2 libxtst6 libgtk-3-0 libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*

#Installing Geck Driver
!wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
!tar -xvzf geckodriver*
!chmod +x geckodriver
!export PATH=$PATH:/path-to-extracted-file/.

#Instal beautifulsoup
!pip install beautifulsoup4

### Import Dependencies

In [None]:
import selenium.webdriver as webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options as FirefoxOptions

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By

import random
import time

### Define Position and Location

In [None]:
## Enter a job position (it is inserted into the search URL as the `q` parameter,
## so words are joined with "+")
position = "data+scientist"
## Enter a location (City, State or Zip or remote); inserted as the `l` parameter
locations = "remote"

def get_url(position, location):
    """Build the Indeed job-search URL for a position and a location.

    Both values are URL-encoded before being placed in the query string, so
    free-form input such as "data scientist" or "New York, NY" yields a valid
    URL. A literal "+" is left untouched so that callers who already join
    words by hand (e.g. "data+scientist") get the same URL as before.

    Args:
        position: Job title / search keywords (the `q` parameter).
        location: City, state, ZIP code or "remote" (the `l` parameter).

    Returns:
        The search URL as a string.
    """
    # Local import keeps the function usable even when this cell is run
    # before (or without) the notebook's import cell.
    from urllib.parse import quote_plus

    url_template = "https://www.indeed.com/jobs?q={}&l={}"
    # str() preserves the old tolerance for non-string values such as an
    # integer ZIP code; safe="+" preserves pre-joined "+" separators.
    return url_template.format(
        quote_plus(str(position), safe="+"),
        quote_plus(str(location), safe="+"),
    )

# Build the search URL for the configured query and echo it as a sanity check.
url = get_url(position, locations)
print(url)

# Empty results table; one row per job posting is appended while scraping.
dataframe = pd.DataFrame(
    columns=[
        "Title",
        "Company",
        "Location",
        "Rating",
        "Date",
        "Salary",
        "Description",
        "Links",
    ]
)

https://www.indeed.com/jobs?q=engineer&l=united+states


### Set Path to Webdriver

In [None]:
# Location of the geckodriver binary (both names are kept because other cells
# may refer to either one).
driver_path = '/content/geckodriver'
firefox_driver_path = '/content/geckodriver'
# NOTE(review): user_agent is defined but not applied to the browser here;
# confirm whether it is still needed before wiring it into the options.
user_agent = 'Mozilla'

# Run Firefox without opening a window.
firefox_options = FirefoxOptions()
firefox_options.add_argument('--headless')

# Pass the geckodriver location explicitly. Previously the path above was never
# handed to Selenium, so start-up only worked if geckodriver happened to be
# discoverable some other way (e.g. on PATH).
firefox_service = Service(executable_path=firefox_driver_path)
driver = webdriver.Firefox(service=firefox_service, options=firefox_options)

### Scrape Job Postings

In [None]:
## Number of postings to scrape
postings = 100


def _first_text(matches, default='NaN'):
    """Return the stripped text of the first element in *matches*.

    Falls back to *default* when *matches* is empty, i.e. when the job card
    does not contain the requested field.
    """
    if not matches:
        return default
    return matches[0].get_text().strip()


def _parse_job_card(soup, link):
    """Build one results row (a dict keyed by column name) from a job card.

    Args:
        soup: BeautifulSoup tree of the card's inner HTML.
        link: href of the card's first anchor.

    Every missing text field becomes 'NaN', except the description snippet,
    which becomes an empty string.
    """
    return {
        'Title': _first_text(soup.select('.jobTitle')),
        "Company": _first_text(soup.find_all(attrs={'data-testid': 'company-name'})),
        'Location': _first_text(soup.find_all(attrs={'data-testid': 'text-location'})),
        'Rating': _first_text(soup.select('.ratingNumber')),
        'Date': _first_text(soup.select('.date')),
        "Salary": _first_text(soup.select('.salary-snippet-container')),
        "Description": _first_text(soup.select('.job-snippet'), default=''),
        "Links": link,
    }


jn = 0
rows = []

# The implicit wait is a setting on the driver (how long element lookups keep
# polling), not a pause, so it only needs to be set once.
driver.implicitly_wait(3)

try:
    # The search URL takes a `start` offset; advance it by 10 per request.
    for i in range(0, postings, 10):
        driver.get(url + "&start=" + str(i))

        jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')

        for job in jobs:
            # A card without a usable link cannot be opened later to fetch the
            # full description, so skip it instead of crashing on anchors[0].
            anchors = job.find_elements(By.TAG_NAME, "a")
            links = anchors[0].get_attribute("href") if anchors else None
            if not links:
                continue

            soup = BeautifulSoup(job.get_attribute('innerHTML'), 'html.parser')
            row = _parse_job_card(soup, links)
            rows.append(row)
            jn += 1

            print(row['Title'])
            print(row['Company'])
            print(row['Location'])
            print("Job number {0:4d} added - {1:s}".format(jn, row['Title']))
finally:
    # Append everything in one concat (row-by-row concat copies the whole table
    # each time). Doing it in `finally` keeps the rows gathered so far even if
    # a page load fails or the cell is interrupted.
    if rows:
        dataframe = pd.concat([dataframe, pd.DataFrame(rows)], ignore_index=True)

In [None]:
# Preview the first rows of the scraped search results.
dataframe.head()

### Scrape Full Job Descriptions

In [None]:
# Collect the posting URL of every scraped row, in row order.
Links_list = dataframe['Links'].to_list()

In [None]:
# Visit every posting and collect its full description text.
descriptions = []
for link in Links_list:
    driver.get(link)
    # Upper bound on how long the element lookup below keeps polling.
    driver.implicitly_wait(random.randint(3, 8))
    # find_elements returns an empty list instead of raising when the page has
    # no description block, so one bad page no longer aborts the loop and
    # `descriptions` stays aligned with the rows of `dataframe`.
    matches = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')
    descriptions.append(matches[0].text if matches else '')
    # Random pause between page loads.
    time.sleep(random.randint(5, 10))

dataframe['Descriptions'] = descriptions

### Save Results

In [None]:
# Write the results to "<YYYY-MM-DD>_<position>_<locations>.csv" without the
# index column.
date = datetime.today().strftime('%Y-%m-%d')
dataframe.to_csv(f"{date}_{position}_{locations}.csv", index=False)

In [None]:
# Preview the first rows of the final table, including the full descriptions.
dataframe.head()

Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links,Descriptions
0,Rotating Equipment Engineer,bp,"Blaine, WA 98230",,PostedPosted 5 days ago,,Up to 10% travel should be expected with this ...,https://www.indeed.com/rc/clk?jk=c11c88e8b3b91...,Location\nUnited States of America - Washingto...
1,Early Career Mechanical Engineer,WSP,"Seattle, WA 98104 (Downtown area)",,PostedPosted 7 days ago,"$66,400 - $99,600 a year",Takes part in providing local technical assist...,https://www.indeed.com/rc/clk?jk=1b44b5c6de43e...,"Who We Are\nAt WSP, we are driven by inspiring..."
2,Mechanical Engineer,bp,"Blaine, WA 98230",,PostedPosted 5 days ago,,Up to 10% travel should be expected with this ...,https://www.indeed.com/rc/clk?jk=98a8771ed0b90...,Location\nUnited States of America - Washingto...
3,Process Development Engineer,ExxonMobil,"Baytown, TX",,PostedPosted 30+ days ago,,The process development engineer may also focu...,https://www.indeed.com/rc/clk?jk=6f44c7e104f3e...,"About us\n\nAt ExxonMobil, our vision is to le..."
4,Mechanical Engineer,BASF Corporation,"Bishop, TX 78343",,PostedPosted 5 days ago,,"As a valued member, you will develop and imple...",https://www.indeed.com/rc/clk?jk=da2ff63a9541e...,"Now Hiring! Mechanical Engineer\nBishop, TX\nT..."
5,TD Quality and Reliability Engineer,INTEL,"Santa Clara, CA",,PostedPosted 18 days ago,"$118,860 - $196,720 a year","Develops, applies, and maintains quality and r...",https://www.indeed.com/rc/clk?jk=f7b6e6478fc14...,"Job Description\n\nDevelops, applies, and main..."
6,Test Engineer,Baker Hughes,"Deer Park, TX 77536",,PostedPosted 21 days ago,,Collaborating with Mechanical and Electrical e...,https://www.indeed.com/rc/clk?jk=ca12e7e42bdc5...,Test Engineer\nAre you inspired to work in the...
7,Reservoir Engineer,Endeavor Energy Resources,"Midland, TX",,PostedToday,,Utilize knowledge of relevant reservoir engine...,https://www.indeed.com/rc/clk?jk=08396f657fdfc...,As one of the largest employers in the Permian...
8,Early Career Field Engineer,WSP,"Federal Way, WA",,PostedPosted 7 days ago,"$66,400 - $99,600 a year",WSP is currently initiating a search for a Ful...,https://www.indeed.com/rc/clk?jk=ab46725155691...,"Who We Are\nAt WSP, we are driven by inspiring..."
9,Associate Engineer,Energy Transfer Family of Partnerships,"Ganado, TX",,PostedToday,,Build and maintain facility process simulation...,https://www.indeed.com/rc/clk?jk=3b7c5fa6b7ade...,Summary:\nThe Associate Engineer will provide ...
