# Web scraping data from payscale.com

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Base URL of the paginated PayScale report; the page number is appended to it.
payscale_url = "https://www.payscale.com/college-salary-report/majors-that-pay-you-back/bachelors/page/"

In [3]:
# Download every report page and parse each one into a BeautifulSoup object.
# range() excludes its stop value, so this requests pages 1 through 34.
REQUEST_TIMEOUT_SECONDS = 30  # without an explicit timeout, requests can wait forever on a stalled connection

payscale_soups = []
# A Session reuses the underlying connection across the sequential page requests.
with requests.Session() as session:
    for page_num in range(1, 35):
        payscale_response = session.get(
            f"{payscale_url}{page_num}", timeout=REQUEST_TIMEOUT_SECONDS
        )
        # Stop on an HTTP error status instead of parsing an error page as data.
        payscale_response.raise_for_status()
        payscale_soup = BeautifulSoup(payscale_response.text, "html.parser")
        payscale_soups.append(payscale_soup)

In [8]:
# Start with an empty frame that only defines the output columns;
# rows are added by a later cell.
payscale_df = pd.DataFrame(
    columns=[
        "Rank",
        "Major",
        "Early Career Pay",
        "Mid-Career Pay",
        "% High Meaning",
    ]
)

In [9]:
# Preview the first five rows of the frame (5 is head()'s default row count).
payscale_df.head(5)

Unnamed: 0,Rank,Major,Early Career Pay,Mid-Career Pay,% High Meaning


In [12]:
# Extract one record per table row from every downloaded page.
#
# Records are collected in a plain list and the DataFrame is built once at the
# end. This avoids growing a DataFrame one row at a time, and it makes the cell
# safe to re-run: the frame is rebuilt instead of having the same rows appended
# to it a second time.
column_names = list(payscale_df.columns)
records = []

# loop over all pages
for soup in payscale_soups:
    # every <tr class="data-table__row"> element is one table row
    for row in soup.find_all(name="tr", class_="data-table__row"):
        # the cell values of the row
        cells = row.find_all(name="span", class_="data-table__value")

        # strip currency, thousands-separator and percent symbols from each value
        new_entry = [
            cell.getText().replace("$", "").replace(",", "").replace("%", "")
            for cell in cells
        ]
        # Drop the "Bachelors" cell, which has no matching dataframe column
        # (presumably the degree-type column). Raises ValueError if it is absent.
        new_entry.remove("Bachelors")

        # Fail on a malformed row instead of silently producing a ragged table.
        if len(new_entry) != len(column_names):
            raise ValueError(
                f"expected {len(column_names)} values per row, got {len(new_entry)}: {new_entry}"
            )
        records.append(new_entry)

payscale_df = pd.DataFrame(records, columns=column_names)


In [23]:
# Replace the "-" placeholder with NaN.
# DataFrame.replace returns a new frame and leaves the original untouched, so
# the result must be assigned back; otherwise payscale_df (and the CSV written
# from it) would still contain "-".
import numpy as np

payscale_df = payscale_df.replace("-", np.nan)
payscale_df

Unnamed: 0,Rank,Major,Early Career Pay,Mid-Career Pay,% High Meaning
0,1,Petroleum Engineering,93200,187300,67
1,2,Operations Research & Industrial Engineering,84800,170400,28
2,3,Electrical Engineering & Computer Science (EECS),108500,159300,46
3,4,Interaction Design,68300,155800,55
4,5,Public Accounting,59800,147700,47
...,...,...,...,...,...
822,823,Outdoor Education,37400,46300,52
823,824,Early Childhood Education,36100,45400,78
824,825,Mental Health,36900,45000,
825,826,Medical Assisting,36000,44800,


In [24]:
# Show the number of scraped rows, then the first five rows, one item per line.
print(len(payscale_df), payscale_df.head(), sep="\n")

827
  Rank                                             Major Early Career Pay  \
0    1                             Petroleum Engineering            93200   
1    2      Operations Research & Industrial Engineering            84800   
2    3  Electrical Engineering & Computer Science (EECS)           108500   
3    4                                Interaction Design            68300   
4    5                                 Public Accounting            59800   

  Mid-Career Pay % High Meaning  
0         187300             67  
1         170400             28  
2         159300             46  
3         155800             55  
4         147700             47  


In [26]:
# Write the frame to a CSV file, leaving out the row index.
payscale_df.to_csv(
    path_or_buf="bachelors_salaries_data_2022.csv",
    index=False,
)