In [1]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re
import pandas as pd
import numpy as np

In [2]:
def get_links(l, start, end):
    """Build the paginated Indeed result URLs.

    Parameters
    ----------
    l : str
        URL prefix that ends right where the result offset goes
        (e.g. ``'...&start='``).
    start : int
        First result offset to request.
    end : int
        Last result offset to request (inclusive).

    Returns
    -------
    list of str
        One URL per offset, stepping by 10 from `start` through `end`.
    """
    offsets = range(start, end + 1, 10)
    return [l + str(offset) for offset in offsets]

In [3]:
def get_html(link, timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without it a single stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (including a timeout).
    """
    response = requests.get(link, timeout=timeout)
    # The HTTP status is not checked: an error page is parsed like any other
    # page and will simply yield no postings downstream.
    return BeautifulSoup(response.content, 'html.parser')

In [4]:
def get_jobnames(s):
    """Collect the job titles found on a parsed results page.

    Every ``<div class="row">`` is searched for anchors whose
    ``data-tn-element`` attribute is ``jobTitle``; the anchor's ``title``
    attribute is taken as the job name.

    Parameters
    ----------
    s : bs4.BeautifulSoup
        Parsed results page.

    Returns
    -------
    list of str
        Job titles in document order.
    """
    return [
        anchor['title']
        for row in s.find_all(name='div', attrs={'class': 'row'})
        for anchor in row.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})
    ]

In [5]:
def get_comp(s):
    """Collect the company names found on a parsed results page.

    Parameters
    ----------
    s : bs4.BeautifulSoup
        Parsed results page.

    Returns
    -------
    list of str
        Text of every ``<span class="company">`` in document order, with
        leading whitespace removed (trailing whitespace is left untouched).
    """
    company_spans = s.find_all('span', attrs={'class': 'company'})
    return [span.getText().lstrip() for span in company_spans]

In [6]:
def get_loc(s):
    """Collect the job locations found on a parsed results page.

    Parameters
    ----------
    s : bs4.BeautifulSoup
        Parsed results page.

    Returns
    -------
    list
        Value of the ``data-rc-loc`` attribute of every ``<div>`` that has
        one, in document order.
    """
    return [
        div.attrs['data-rc-loc']
        for div in s.find_all('div')
        if 'data-rc-loc' in div.attrs
    ]

In [7]:
def make_df(links):
    """Scrape every link and gather the postings into one dataframe.

    Each URL is fetched once with get_html; job titles, company names and
    locations are then extracted from the page with get_jobnames, get_comp
    and get_loc.

    Parameters
    ----------
    links : iterable of str
        Result-page URLs, e.g. the output of get_links.

    Returns
    -------
    pandas.DataFrame
        Columns ``'position'``, ``'company'`` and ``'location'``. The index
        restarts at 0 for every scraped page. When `links` is empty the
        result is an empty frame that still carries the three columns.
    """
    columns = ['position', 'company', 'location']
    pages = []

    # Wrap the links themselves (not enumerate(...)) so tqdm can see the
    # length and show a real progress bar with a total.
    for link in tqdm(links):
        soup = get_html(link)
        # One row per field, then transpose to one row per posting.
        # The three lists are extracted independently of each other: if a
        # page yields different counts, pandas pads the shorter lists with
        # missing values and the rows of that page no longer line up.
        page = pd.DataFrame(
            [get_jobnames(soup), get_comp(soup), get_loc(soup)],
            index=columns,
        ).T
        pages.append(page)

    if not pages:
        # pd.concat refuses an empty list; return the expected schema instead.
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(pages)

### Note: the link passed to `get_links` has an area specified (the `l=` parameter). When you start going through each state, change that part of the link.

In [134]:
# Scrape the "data engineer" postings within 100 miles of Washington, DC.
# To collect another role, change the `q` query parameter in the URL
# (e.g. data+scientist instead of data+engineer).
# The original author checked that start=1000 is the last results page that
# Indeed serves, so 1000 is used as the upper bound.
# NOTE(review): get_links steps from 10 to 1000, so the page at start=0 is
# never requested -- confirm whether skipping it is intended.
# `flag` is a constant 1 on every row so that postings can be counted by
# summing it after a groupby.
df = make_df(
    get_links(
        'https://www.indeed.com/jobs?q=data+engineer&l=Washington%2C+DC&radius=100&start=',
        10,
        1000,
    )
).assign(flag=1)
df.head()

100it [01:22,  1.20it/s]


Unnamed: 0,position,company,location,flag
0,Head of Data Visualization Strategy and Engine...,Graphicacy,"Washington, DC",1
1,Princial Data Analysis Engineer,"Wind Talker Innovations, Inc.","Washington, DC",1
2,Sales Application Engineer (HVAC),"AboveAir Technologies, LLC","Frederick, MD",1
3,Microfluidics Engineer,"LumaCyte, LLC","Charlottesville, VA",1
4,AI Data Scientist/NLP Engineer,Guidehouse,"Washington, DC",1


In [135]:
# Collapse duplicate postings: group on (position, company, location) and sum
# `flag` to get how many times each posting was scraped, most frequent first.
t = (
    df[['position', 'company', 'location', 'flag']]
    .groupby(['position', 'company', 'location'])
    .sum()
    .sort_values(by='flag', ascending=False)
    .reset_index()
)

In [136]:
# (rows, columns) of the aggregated table: one row per distinct posting.
t.shape

(839, 4)

### The first time you write the dataframe to CSV, use the following cell

In [None]:
# Note: change the path to wherever you want to create the CSV. Also, you should probably create your own CSV
# file to avoid merge conflicts on GitHub.
#t.to_csv('/Users/dillonquan/Desktop/DataVizProject/indeed_2019.csv',index = False)

### Use this cell when you have already created your csv file

In [137]:
# Append the aggregated postings to the existing CSV (no header, no index).
# The path is handed to pandas directly so that `mode='a'` actually applies
# and pandas controls newline handling itself. The former `line_terminator`
# keyword is gone: it was removed in pandas 2.0, and the platform default
# line ending is used instead.
# NOTE: this is an absolute, machine-specific path -- change it before running.
t.to_csv(
    '/Users/dillonquan/Desktop/DataVizProject/indeed_2019.csv',
    mode='a',
    header=False,
    index=False,
)