# In [2]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import csv

def extract_web_data(pagenum):
    """Fetch one page of the predefined Indeed search and return it parsed.

    The search criteria are fixed in the URL: job postings whose title
    contains the word "intern", located in the United States, sorted by
    post date.

    Args:
        pagenum: Value substituted into the ``start`` query parameter of the
            search URL to select which part of the results to fetch.
            NOTE(review): Indeed may interpret ``start`` as a result offset
            rather than a page index -- confirm against the live site.

    Returns:
        A BeautifulSoup object built from the response body with the
        ``html.parser`` backend.
    """
    # The f-prefix on the second literal is essential: without it the text
    # "{pagenum}" is sent verbatim and every call requests the same URL.
    search_url = (
        'https://www.indeed.com/jobs?q=title%3Aintern&l=United+States'
        f'&sort=date&limit=20&radius=25&start={pagenum}'
    )
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    site_req = requests.get(search_url, timeout=30)
    # print the HTTP status code (200 means the request succeeded)
    print(site_req.status_code)
    return BeautifulSoup(site_req.content, 'html.parser')

def _text_or_blank(node):
    """Return the stripped text of a parsed element, or '' when it is None."""
    return node.text.strip() if node is not None else ''


def _parse_job_card(card):
    """Build the title/company/location/salary dictionary for one job card."""
    loc_div = card.find('div', class_='recJobLoc')
    return {
        'title': _text_or_blank(card.find('a')),
        'company': _text_or_blank(card.find('span', class_='company')),
        # the location is stored in an attribute of the div, not in its text
        'location': loc_div.get('data-rc-loc') if loc_div is not None else '',
        # salary is optional on a posting; absent salary is recorded as ''
        'salary': _text_or_blank(card.find('span', class_='salaryText')),
    }


def parse_web_data(num_pages=200):
    """Scrape several pages of Indeed search results into a list of job records.

    Calls ``extract_web_data`` once for each page number from 1 through
    ``num_pages`` (inclusive) and extracts the essential fields from every
    job card found on each page.

    Args:
        num_pages: How many pages of search results to fetch. Values of zero
            or less fetch nothing.

    Returns:
        A list of dictionaries, one per job posting, with the keys
        ``'title'``, ``'company'``, ``'location'`` and ``'salary'``. A field
        whose element is missing from the posting is stored as ``''``.
    """
    job_list = []
    # range's upper bound is exclusive, hence the +1 to include the last page
    for num in range(1, num_pages + 1):
        soup = extract_web_data(num)
        # each job posting on the page is a 'div' tag with this class
        for card in soup.find_all('div', class_='jobsearch-SerpJobCard'):
            job_list.append(_parse_job_card(card))
    return job_list

def get_job_dataframe():
    """Scrape the job listings and return them as a pandas DataFrame.

    Delegates the scraping to ``parse_web_data`` and passes the list of
    dictionaries it returns directly to the DataFrame constructor.
    """
    return pd.DataFrame(parse_web_data())

def webdata_to_csv(df=None):
    """Write the job listings to ``intern_listings_sample_200.csv``.

    Args:
        df: Optional DataFrame of job listings to write. When omitted (or
            ``None``), ``get_job_dataframe`` is called to scrape the data
            first, which is the original behaviour.
    """
    if df is None:
        df = get_job_dataframe()
    df.to_csv('intern_listings_sample_200.csv')


# Scrape once and reuse the result: building the dataframe here and then
# letting webdata_to_csv() scrape again would repeat every HTTP request.
job_listings = get_job_dataframe()
webdata_to_csv(job_listings)