# Import Libraries

* https://chromedriver.storage.googleapis.com/index.html?path=2.40/

In [1]:
import pandas as pd
import numpy as np

import string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from time import sleep
import time

# Import Jobs and Locations

In [2]:
df = pd.read_csv('2017-09_COMP_input.csv')
df.head()

Unnamed: 0,Dash - Salary.com Job Title,Location
0,Accounts-Payable-Clerk,Goleta-CA
1,Accounts-Payable-Clerk-Sr,Goleta-CA
2,Accounts-Receivable-Clerk,Goleta-CA
3,Credit-and-Collections-Manager,Goleta-CA
4,Accounts-Receivable-Manager,Goleta-CA


# Webscrape Salary.com

In [3]:
## start timer

time_start = time.time()

In [4]:
## function for returning job url
## URL FORMAT+SANTA-BARBARA-CA: http://swz.salary.com/SalaryWizard/Accountant-I-Salary-Details-Santa-Barbara-CA.aspx
## URL FORMAT+DALLAS-TX: http://swz.salary.com/SalaryWizard/Accountant-I-Salary-Details-Dallas-TX.aspx

def get_url_for_job(title, location):
    return 'http://swz.salary.com/SalaryWizard/' + title + '-Salary-Details-' + location + '.aspx'

In [5]:
## webscrape salary.com data

driver = webdriver.Chrome('/Users/ryanrunchey/ds/webscrape/chromedriver')
print('Going to salary.com')
driver.get('http://www.salary.com/category/salary/')
sleep(30)
print('done with short sleep 30 seconds')

Going to salary.com
done with short sleep 30 seconds


In [6]:
labels = ['Scraped', 'Dash - Salary.com Job Title', 'Location', 'City_State', 'Market 25%', 'Median', 'Market 75%']
data_salary = []

df_data_salary = pd.DataFrame(columns=labels)

for i in df.index[:11]: # update to just df.index after test-run
    dash_title = df.iloc[i,0]
    location = df.iloc[i,1]
    url = get_url_for_job(dash_title, location)
    print("--------------------------")
    print('Going to url for title {} {}'.format(dash_title, location))
    driver.get(url)
    print('in for loop before WebDriverWait')
    try:
        print('+ in try to find element by ID')
        element = WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.ID, 'mediansalary')))
    except TimeoutException:
        print('- made it in TimeoutException')
        element = False
        print('element = False')
        city_state = 0
        salary_twentyfive = 0
        salary_median = 0
        salary_seventyfive = 0
        scraped = 0
        data_salary.append([scraped, dash_title, location, city_state, salary_twentyfive, salary_median, salary_seventyfive])
        df_data_salary = pd.concat([df_data_salary, pd.DataFrame(data_salary[-1:], columns=labels)], ignore_index=True)
        print('no data for {} at {}'.format(dash_title, location))
        continue
    finally:
        print('ok, left try')
    if element:
        city_state = driver.find_elements_by_xpath('.//span[@class = "header_location"]')[0].text
        salary_twentyfive = driver.find_element_by_id('marketv25').text
        salary_median = driver.find_element_by_id('mediansalary').text
        salary_seventyfive = driver.find_element_by_id('marketv75').text
        scraped = 1
        data_salary.append([scraped, dash_title, location, city_state, salary_twentyfive, salary_median, salary_seventyfive])
        print('median salary for {} is {} at {}'.format(dash_title, salary_median, location))
        df_data_salary = pd.concat([df_data_salary, pd.DataFrame(data_salary[-1:], columns=labels)], ignore_index=True)

    # print first 5 Salaries Scraped
    if i < 5:
        print('Salaries Scraped: # {}'.format(len(data_salary)))
        print()
        print("data_salary")
        print(data_salary)
        print()        
    
    # periodically save progress
    if i % 10 == 0:
        df_data_salary.to_csv('data_salary_up_to_index_' + str(i) + '.csv')

--------------------------
Going to url for title Accounts-Payable-Clerk Goleta-CA
in for loop before WebDriverWait
+ in try to find element by ID
ok, left try
median salary for Accounts-Payable-Clerk is $42,279 at Goleta-CA
Salaries Scraped: # 1

data_salary
[[1, 'Accounts-Payable-Clerk', 'Goleta-CA', 'Goleta, CA', '$38,125', '$42,279', '$47,312']]

--------------------------
Going to url for title Accounts-Payable-Clerk-Sr Goleta-CA
in for loop before WebDriverWait
+ in try to find element by ID
- made it in TimeoutException
element = False
no data for Accounts-Payable-Clerk-Sr at Goleta-CA
ok, left try
--------------------------
Going to url for title Accounts-Receivable-Clerk Goleta-CA
in for loop before WebDriverWait
+ in try to find element by ID
ok, left try
median salary for Accounts-Receivable-Clerk is $41,996 at Goleta-CA
Salaries Scraped: # 3

data_salary
[[1, 'Accounts-Payable-Clerk', 'Goleta-CA', 'Goleta, CA', '$38,125', '$42,279', '$47,312'], [0, 'Accounts-Payable-Clerk-S

In [7]:
df_data_salary.head()

Unnamed: 0,Scraped,Dash - Salary.com Job Title,Location,City_State,Market 25%,Median,Market 75%
0,1,Accounts-Payable-Clerk,Goleta-CA,"Goleta, CA","$38,125","$42,279","$47,312"
1,0,Accounts-Payable-Clerk-Sr,Goleta-CA,0,0,0,0
2,1,Accounts-Receivable-Clerk,Goleta-CA,"Goleta, CA","$37,710","$41,996","$46,858"
3,1,Credit-and-Collections-Manager,Goleta-CA,"Goleta, CA","$83,686","$96,500","$112,643"
4,1,Accounts-Receivable-Manager,Goleta-CA,"Goleta, CA","$77,287","$90,811","$106,737"


In [8]:
df_data_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 7 columns):
Scraped                        11 non-null object
Dash - Salary.com Job Title    11 non-null object
Location                       11 non-null object
City_State                     11 non-null object
Market 25%                     11 non-null object
Median                         11 non-null object
Market 75%                     11 non-null object
dtypes: object(7)
memory usage: 696.0+ bytes


## Clean-Up Salaries: Text to Int

In [9]:
def text_cleaner(s):
    if type(s) == str:
        translator = str.maketrans('','',string.punctuation)
        s = s.translate(translator)
        s = s.replace("$", "")
    return s

In [10]:
df_data_salary['Market 25%'] = df_data_salary['Market 25%'].map(text_cleaner).astype(int)
df_data_salary['Median'] = df_data_salary['Median'].map(text_cleaner).astype(int)
df_data_salary['Market 75%'] = df_data_salary['Market 75%'].map(text_cleaner).astype(int)

In [11]:
df_data_salary.head()

Unnamed: 0,Scraped,Dash - Salary.com Job Title,Location,City_State,Market 25%,Median,Market 75%
0,1,Accounts-Payable-Clerk,Goleta-CA,"Goleta, CA",38125,42279,47312
1,0,Accounts-Payable-Clerk-Sr,Goleta-CA,0,0,0,0
2,1,Accounts-Receivable-Clerk,Goleta-CA,"Goleta, CA",37710,41996,46858
3,1,Credit-and-Collections-Manager,Goleta-CA,"Goleta, CA",83686,96500,112643
4,1,Accounts-Receivable-Manager,Goleta-CA,"Goleta, CA",77287,90811,106737


In [12]:
df_data_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 7 columns):
Scraped                        11 non-null object
Dash - Salary.com Job Title    11 non-null object
Location                       11 non-null object
City_State                     11 non-null object
Market 25%                     11 non-null int64
Median                         11 non-null int64
Market 75%                     11 non-null int64
dtypes: int64(3), object(4)
memory usage: 696.0+ bytes


## Print Webscrape Info

In [13]:
size_input = str(df.index.size)
size_nodata = str((df_data_salary['Median'] == 0).sum())
size_data_salary = str((df_data_salary['Median'] != 0).sum())

print("salaries # \t \t {}".format(size_data_salary))
print("no salary data # \t {}".format(size_nodata))
print("total jobs # \t \t {}".format(size_input))
print()
print(size_data_salary + " of " + size_input + " job salaries downloaded")

salaries # 	 	 8
no salary data # 	 3
total jobs # 	 	 374

8 of 374 job salaries downloaded


## Save to_csv

In [14]:
df_data_salary.to_csv('data_salary.csv')

In [15]:
# print program runtime
time_end = time.time()

def timer(start,end):
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Calculations Runtime: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

timer(time_start, time_end)

Calculations Runtime: 00:04:12.31
