In [1]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re
import pandas as pd
import numpy as np

In [2]:
def get_links(l, start, end):
    """give the head of the link the start number and the end number and return a list of links to indeed."""
    links = [l + str(i) for i in range(start, end+1, 10)]
    return links

In [3]:
def get_html(link):
    """present the link and return a beautiful soup object that contains the html"""
    url = requests.get(link)
    s = BeautifulSoup(url.content, 'html.parser')
    return s

In [4]:
def get_jobnames(s):
    """return a list a job positions given the soup object"""
    j = []
    for div in s.find_all(name='div', attrs={'class':'row'}):
        for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
            j.append(a['title'])
    return j

In [5]:
def get_comp(s):
    """return a list of company names given the soup object"""
    c = []
    for link in s.find_all('span', attrs = {'class': 'company'}):
        c.append(link.getText().lstrip())
    return c

In [6]:
def get_loc(s):
    """return a list of locations given the soup object"""
    l = []
    for link in s.find_all('div'):
        for key, value, in link.attrs.items():
            if key == 'data-rc-loc':
                l.append(value)
    return l

In [7]:
def make_df(links):
    """create a dataframe of 3 columns: job position, company name, location of the company given a list of 
        links"""
    comps = []
    pos = []
    locs = []
    df = pd.DataFrame()
    
    for idx, link in tqdm(enumerate(links)):
        soup = get_html(link)
        comps.append(get_comp(soup))
        locs.append(get_loc(soup))
        pos.append(get_jobnames(soup))
        df = pd.concat([df, pd.DataFrame([pos[idx], comps[idx], locs[idx]]).T])
    
    df.columns = ['position', 'company', 'location']
    return df

In [25]:
# I checked that 1000 is the 100th page of indeed and its also the last page so these numbers should be fine
#all you need to do is change the parameter q from data scientist to data engineer etc.
df = make_df(get_links('https://www.indeed.com/jobs?q=data+engineer&radius=100&start=', 10, 1000))
df['flag'] = 1
df.head()

100it [02:09,  1.29s/it]


Unnamed: 0,position,company,location,flag
0,SENIOR DATA OPERATIONS ENGINEER,NetWise Data,"San Diego, CA",1
1,Big Data Engineer,Curotec,"Philadelphia, PA",1
2,Data Platform Engineer,KeepTruckin,"San Francisco, CA",1
3,Hadoop Spark Data Engineer,ITI Data,"Chicago, IL",1
4,Systems Engineer,Entre Technology Services,"Bozeman, MT",1


In [27]:
t = df[['position','company','location','flag']].groupby(['position', 'company','location']).sum().sort_values(by = 'flag',ascending = False).reset_index()

### First time writing the dataframe to csv use this following cell

In [None]:
t.to_csv('/Users/dillonquan/Desktop/DataVizProject/indeed_2019.csv',index = False)

### Use this cell when you have already created your csv file

In [28]:
with open('/Users/dillonquan/Desktop/DataVizProject/indeed_2019.csv', 'a') as f:
    t.to_csv(f, header = False, index = False, mode = 'a',line_terminator = '\n')

## STOP HERE!!!

In [None]:
dfl = make_df(get_links('https://www.indeed.com/jobs?q=data+analytics&radius=100&start=', 110, 1000))

In [105]:
dfl['flag'] = 1

In [107]:
d = pd.concat([df, dfl])
d.head()

Unnamed: 0,position,company,location,flag
0,Quality Analyst,Alameda Health System,"Oakland, CA",1
1,122 HEALTHCARE ANALYTICS - Manager Analytics,Alameda Alliance,"Alameda, CA",1
2,Data & Analytics Information Architect,Procter & Gamble,"Cincinnati, OH",1
3,HR Data & Analytics Manager,Anheuser-Busch,"St. Louis, MO",1
4,Faculty Chair - Analytics & Big Data Program,Executive Education Institute,"San Francisco, CA",1


In [108]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1855 entries, 0 to 18
Data columns (total 4 columns):
position    1855 non-null object
company     1854 non-null object
location    1855 non-null object
flag        1855 non-null int64
dtypes: int64(1), object(3)
memory usage: 72.5+ KB


In [109]:
t = d.groupby(['position', 'company','location']).count().sort_values('flag', ascending = False).reset_index()

In [121]:
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824 entries, 0 to 823
Data columns (total 4 columns):
position    824 non-null object
company     824 non-null object
location    824 non-null object
flag        824 non-null int64
dtypes: int64(1), object(3)
memory usage: 25.8+ KB


In [124]:
t.to_csv('/Users/dillonquan/Desktop/DataVizProject/indeed_2019.csv',index = False)

# Trial and Errors Just ignore anything following

In [167]:
comps1 = []
soup = get_html(links[4])
for com in soup.find_all('span', attrs = {'class': 'company'}):
    comps1.append(com.getText().lstrip())
comps1

['National Security Agency',
 'Sony Pictures Entertainment Inc.',
 'Grand Rounds',
 'Digitalogy',
 'Pieces Technologies',
 'LIVE OBJECTS',
 'National Security Agency',
 'Adwait',
 'Virta Health',
 'Walmart',
 'Walmart eCommerce',
 'Hireup Resources',
 'Trimark Associates, Inc.',
 'Soaren Management',
 'Verizon',
 'Apple',
 'Seen by Indeed',
 'Valassis Digital']

In [166]:
comps2 = []
soup = get_html(links[8])
for com in soup.find_all('span', attrs = {'class': 'company'}):
    comps2.append(com.getText().lstrip())
comps2

['Soaren Management',
 'Verizon',
 'Grokstream',
 'Facebook',
 'Osmo',
 'Crossover Health',
 'HG Insights',
 'Walmart',
 'Neal Analytics',
 'Brain Corp',
 'Yang2020',
 'Tempus',
 'Disney Streaming Services',
 'Hireup Resources',
 'Trimark Associates, Inc.',
 'Apple',
 'Seen by Indeed',
 'Valassis Digital',
 'National Security Agency']

In [153]:
comps = []
for link in tqdm(links):
    soup = get_html(link)
    for com in soup.find_all('span', attrs = {'class': 'company'}):
        comps.append(com.getText().lstrip())

100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


In [151]:
jobs = []
for link in tqdm(links):
    soup = get_html(link)
    for div in soup.find_all(name='div', attrs={'class':'row'}):
        for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
            jobs.append(a['title'])

In [155]:
locs = []
for link in tqdm(links):
    for loc in soup.find_all('div'):
        for key, value, in loc.attrs.items():
            if key == 'data-rc-loc':
                locs.append(value)

100%|██████████| 10/10 [00:00<00:00, 425.75it/s]


In [84]:
comp = []
for link in soup.find_all('span', attrs = {'class': 'company'}):
    comp.append(link.getText().lstrip())

In [38]:
len(y), len(jobs)

(19, 19)

In [74]:
loc = []
for link in soup.find_all('div'):
    for key, value, in link.attrs.items():
        if key == 'data-rc-loc':
            loc.append(value)

In [116]:
#for link in soup.findAll('a', attrs={'href'}):

#pattern = re.compile("^http://www.indeed.com/pagead*")
l = []
for div in soup.find_all('div', attrs={'class':'title'}):
    for link in div.find_all('a', attrs={'href': re.compile("^/pagead.*")}):
        l.append(link.get('href'))

In [118]:
len(l)

9

In [41]:
companies = []
for div in soup.find_all(name='div', attrs={'class':'row'}):
    company = div.find_all(name='span', attrs={'class':'company'})
    if len(company) > 0:
        for b in company:
            companies.append(b.text.strip())
    else:
        sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
        for span in sec_try:
            companies.append(span.text.strip())