In [8]:
import csv
from time import sleep
import json
import requests
from bs4 import BeautifulSoup
import re
import s3fs
import pandas as pd

In [2]:
def extract_salary_info(job_title, job_city, timeout=10):
    """Scrape salary percentiles for one job title / city pair from salary.com.

    The site serves a posting under one of several URL sections, so each
    known URL pattern is tried in order until one answers with HTTP 200.
    The salary figures are read from the page's ``application/ld+json``
    script block that mentions ``Occupation``.

    Args:
        job_title: URL slug of the job title (e.g. ``'data-scientist-i'``).
        job_city: URL slug of the city, substituted as the last path segment.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(title, location, description, p10, p25, median, p75, p90)``
        taken from the embedded JSON, or ``None`` when no URL pattern
        succeeds, the request fails, or the page does not contain the
        expected salary data.
    """
    url_templates = (
        'https://www.salary.com/research/salary/posting/{}-salary/{}',
        'https://www.salary.com/research/salary/alternate/{}-salary/{}',
        'https://www.salary.com/research/salary/benchmark/{}-salary/{}',
    )

    response = None
    try:
        for template in url_templates:
            url = template.format(job_title, job_city)
            # Without a timeout a single stalled connection would hang the whole scrape.
            candidate = requests.get(url, timeout=timeout)
            if candidate.status_code == 200:
                response = candidate
                break
    except requests.exceptions.RequestException:
        # Covers connection errors as well as timeouts and other transport failures.
        return None

    if response is None:
        return None

    # Parse the html and locate the JSON-LD block describing the occupation.
    soup = BeautifulSoup(response.text, 'html.parser')
    script = soup.find(
        'script',
        {'type': 'application/ld+json'},
        string=re.compile(r'Occupation'),
    )
    # A 200 page is not guaranteed to embed the salary JSON; skip it instead of crashing.
    if script is None or not script.contents:
        return None

    try:
        json_data = json.loads(script.contents[0])
        salary = json_data['estimatedSalary'][0]
        return (
            json_data['name'],
            json_data['occupationLocation'][0]['name'],
            json_data['description'],
            salary['percentile10'],
            salary['percentile25'],
            salary['median'],
            salary['percentile75'],
            salary['percentile90'],
        )
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON (JSONDecodeError is a ValueError) or an unexpected schema.
        return None

In [3]:
def main():
    """Scrape salary data for every (city, job title) combination.

    City slugs are read from ``largest_cities.csv`` in the working
    directory; every cell of every row is treated as one city. Each city is
    combined with a fixed list of job-title slugs and looked up with
    ``extract_salary_info``.

    Returns:
        A list with one tuple per successful lookup, as returned by
        ``extract_salary_info``. Failed lookups are skipped.
    """
    job_titles = ['entry-data-analyst','data-scientist-i','machine-learning-engineer','business-intelligence-bi-developer',
              'entry-level-python-developer','ux-design-intern','data-architect-i','big-data-architect',
              'database-administrator-entry','Entry-Business-Systems-Analyst']

    # Flatten the csv into a single list of cities, dropping blank cells so
    # they do not turn into requests for an empty city slug.
    with open('largest_cities.csv', newline='') as f:
        reader = csv.reader(f)
        cities = [city.strip() for row in reader for city in row if city.strip()]

    # Extract salary data for each city / job title pair
    salary_data = []
    for city in cities:
        for job_title in job_titles:
            result = extract_salary_info(job_title, city)
            if result:
                salary_data.append(result)
            # Pause after every lookup, successful or not: a failed lookup
            # has still issued requests against the site.
            sleep(0.5)

    return salary_data

In [4]:
# Run the scraper over every city / job-title pair; the returned list of
# salary tuples is displayed as this cell's output.
main()

[('Entry Data Analyst',
  'New York, NY',
  'The Entry Data Analyst researches best practices and supports developing the solutions and recommendations for the current business operations. Performs routine business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Being an Entry Data Analyst may provide business data interpretation. May work with the internal or external client to identify analytical requirements. In addition, Entry Data Analyst may help to produce ad hoc data and reports. May assist in developing or implementing systems to capture business operation information. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. Being an Entry Data Analyst works on projects/matters of limited complexity in a support role. Work is closely managed. Working as an Entry Data Analyst typically requires 0-2 years of related experience.',
  '59417',
  '66701',
  '74701',
  '85201',
  '94761'),
 ('Data 

In [9]:
# Run the scraper and keep its results in `salary_data` for the DataFrame cell below.
# NOTE(review): this repeats every request already issued by the bare main() call above.
salary_data = main()

In [10]:
# Column labels follow the order of the tuples returned by extract_salary_info.
# They must be a flat list: a nested list ([[...]]) makes pandas build
# MultiIndex columns, so df['Title'] would no longer return a plain Series.
df = pd.DataFrame(
    salary_data,
    columns=['Title', 'Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'],
)
df

Unnamed: 0,Title,Location,Description,nTile10,nTile25,nTile50,nTile75,nTile90
0,Entry Data Analyst,"New York, NY",The Entry Data Analyst researches best practic...,59417,66701,74701,85201,94761
1,Data Scientist I,"New York, NY",Data Scientist I identifies business trends an...,71385,79225,87835,94930,101390
2,Machine Learning Engineer,"New York, NY",Machine learning (ML) is the scientific study ...,120959,133726,147749,163993,178782
3,Business Intelligence (BI) Developer,"New York, NY",Business Intelligence Specialist III creates r...,84662,96747,110021,124339,137376
4,Entry Level Python Developer,"New York, NY","Web Applications Developer I designs, develops...",86240,96633,108048,123254,137098
5,UX Design Intern,"New York, NY",UI/Usability Designer II applies user-centered...,70071,81053,93115,102323,110707
6,Data Architect I,"New York, NY",Data Architect I designs and builds relational...,56510,76862,99216,121634,142045
7,Big Data Architect,"New York, NY",A data architect is a practitioner of data arc...,142697,160526,180108,209540,236337
8,Database Administrator - Entry,"New York, NY",The Database Administrator - Entry implements ...,63651,72300,81800,94800,106636
9,Entry Business Systems Analyst,"New York, NY",The Entry Business Systems Analyst documents s...,59876,67260,75370,84300,92430


In [12]:
# --- Upload the salary table to S3 as a CSV file ---
# Destination: bucket prefix plus the CSV object name.
pathname = 'ia-final-deployment/'
filenames = f"{pathname}salary_data.csv"

# Serialise the frame to CSV text (no index column) and encode it to bytes,
# because the S3 object is opened in binary mode.
csv_text = df.to_csv(index=False)
byte_encoded_df = csv_text.encode('utf-8')

# anon=False: connect with credentials rather than anonymously.
s3 = s3fs.S3FileSystem(anon=False)
with s3.open(filenames, 'wb') as s3_file:
    s3_file.write(byte_encoded_df)

# Report where the file was written.
print(f"Successfull uploaded file to location:{filenames}")

Successfull uploaded file to location:ia-final-deployment/salary_data.csv
