<a href="https://colab.research.google.com/github/sisterme3/Data_Science/blob/master/NB9_MovingThroughPages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Consider the need to collect data from a number of websites. Sometimes we may need to move from page=1 to 2,3,4,5, etc. Or from start=0 to 10,20,30, etc. But other times we may want to move from one category to another, or one person to another, or perhaps one geographic location to another.

### Let's take a look at some examples and how we might use lists, the range function and/or a for loop to move through multiple pages.

### Let's take a look at weather.com

https://weather.com/weather/today/l/37212:4:US


In [1]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# Scrape (get) the website and create the soup.
# The timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() reports HTTP errors (403, 404, 5xx) right here rather
# than as a confusing AttributeError in the parsing lines below.
response = requests.get('https://weather.com/weather/today/l/37212:4:US', timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')

# The following lines use beautifulsoup to find the location, temp, and time
# weather is the position of primary interest: every other lookup is done
# inside this div
weather = soup.find('div',{'class':'today_nowcard'})
if weather is None:
  # soup.find returns None when nothing matches; fail with a readable message
  raise ValueError("Could not find the 'today_nowcard' div; the page layout may have changed")

# .next_element steps from the matched tag to the first thing inside it,
# which is expected to be the text we want
location = weather.find('h1', {'class':'h4 today_nowcard-location'}).next_element
temp = weather.find('div', {'class': 'today_nowcard-temp'}).span.next_element
# the timestamp text is read from the second <span> of the timestamp paragraph
time = weather.find('p', {'class': 'today_nowcard-timestamp'}).find_all('span')[1].next_element

print('Location: ', location, ' Time: ', time, 'Temp: ', temp)

Location:  Nashville, TN  Time:  9:21 am CST Temp:  36


## How would you go about scraping this site 10 times? And pushing the data into a dataframe?

In [3]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# create a list for the weather details (one dict per scrape)
details = []

# use range(10) to repeat the process 10 times
for count in range(10):

  # Scrape (get) the website and create the soup.
  # The timeout stops one stalled request from hanging the whole loop, and
  # raise_for_status() reports HTTP errors directly instead of letting them
  # show up later as an AttributeError on None.
  response = requests.get('https://weather.com/weather/today/l/37212:4:US', timeout=30)
  response.raise_for_status()
  html = response.content
  soup = BeautifulSoup(html, 'lxml')

  # The following lines use beautifulsoup to find the location, temp, and time
  # weather is the position of primary interest
  weather = soup.find('div',{'class':'today_nowcard'})
  if weather is None:
    # soup.find returns None when nothing matches; fail with a readable message
    raise ValueError("Could not find the 'today_nowcard' div; the page layout may have changed")

  location = weather.find('h1', {'class':'h4 today_nowcard-location'}).next_element
  temp = weather.find('div', {'class': 'today_nowcard-temp'}).span.next_element
  # the timestamp text is read from the second <span> of the timestamp paragraph
  time = weather.find('p', {'class': 'today_nowcard-timestamp'}).find_all('span')[1].next_element

  details.append({'Location':location,'Temp':temp,'Time':time})

# **** How do we build the dataframe? ****
# Each dict in details becomes one row; the dict keys become the column names.
WeatherData = pd.DataFrame(details)
WeatherData

Unnamed: 0,Location,Temp,Time
0,"Nashville, TN",36,9:21 am CST
1,"Nashville, TN",36,9:21 am CST
2,"Nashville, TN",36,9:21 am CST
3,"Nashville, TN",36,9:21 am CST
4,"Nashville, TN",36,9:21 am CST
5,"Nashville, TN",36,9:21 am CST
6,"Nashville, TN",36,9:21 am CST
7,"Nashville, TN",36,9:21 am CST
8,"Nashville, TN",36,9:21 am CST
9,"Nashville, TN",36,9:21 am CST


## What may be more helpful is to scrape weather.com for several different locations (eight in the example below). To do that, we need to know the zip codes of the places of interest.

## Suppose we're interested in Nashville weather, but also Atlanta, NYC, Chicago, Los Angeles, Austin, Pittsburgh, and Milwaukee ... with zip codes
37212, 30375, 10001, 60603, 90001, 78702, 15223, and 53228 respectively

In [4]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# **** Let's first create a list of zip codes ****
# The zip codes are kept as strings so they can be concatenated into the URL.
zipcodes = ['37212','30375','10001','60603','90001','78702','15223','53228']
# Create a list of weather details for multiple locations (one dict per zip code)
details = []

for code in zipcodes:
  # **** Can we use the zipcodes as they are? ****

  # Scrape (get) the website and create the soup
  # **** How must we alter the URL in the following line? ****
  # The timeout stops one stalled request from hanging the whole loop, and
  # raise_for_status() reports HTTP errors directly instead of letting them
  # show up later as an AttributeError on None.
  response = requests.get('https://weather.com/weather/today/l/'+ code +':4:US', timeout=30)
  response.raise_for_status()
  html = response.content
  soup = BeautifulSoup(html, 'lxml')

  # The following lines use beautifulsoup to find the location, temp, and time
  # weather is the position of primary interest
  weather = soup.find('div',{'class':'today_nowcard'})
  if weather is None:
    # soup.find returns None when nothing matches; say which zip code failed
    raise ValueError("Could not find the 'today_nowcard' div for zip code " + code)

  location = weather.find('h1', {'class':'h4 today_nowcard-location'}).next_element
  temp = weather.find('div', {'class': 'today_nowcard-temp'}).span.next_element
  # the timestamp text is read from the second <span> of the timestamp paragraph
  time = weather.find('p', {'class': 'today_nowcard-timestamp'}).find_all('span')[1].next_element

  details.append({'Location':location,'Temp':temp,'Time':time})

# Each dict in details becomes one row; the dict keys become the column names.
WeatherData = pd.DataFrame(details)
WeatherData

Unnamed: 0,Location,Temp,Time
0,"Nashville, TN",36,9:31 am CST
1,"Atlanta, GA",38,10:38 am EST
2,"Nyc/West 30th, NY",34,10:40 am EST
3,"International Academy of Design and Tech, IL",16,9:29 am CST
4,"Los Angeles, CA",54,7:35 am PST
5,"Austin, TX",45,9:35 am CST
6,"Pittsburgh, PA",29,10:35 am EST
7,"Milwaukee, WI",9,9:39 am CST


### Let's take a look at collegesimply.com

https://www.collegesimply.com/colleges/tennessee/

### Our aim is to list the colleges from several (all) of the states.

In [5]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# Scrape (get) the website and create the soup.
# The timeout keeps the cell from hanging if the server never answers, and
# raise_for_status() reports HTTP errors here instead of silently parsing an
# error page.
response = requests.get('https://www.collegesimply.com/colleges/tennessee/', timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')

# Create a list of colleges (one dict per 'card-body' div on the page)
colleges = []

# The following lines use beautifulsoup to find the school name, location, number of students, and tuition
# school is the position of primary interest
# Each element is looked up once and reused; any field whose element is
# missing is recorded as 'Not found' so every row has the same four keys.
for school in soup.find_all('div',{'class':'card-body'}):
  title = school.find('h4',{'class':'card-title mb-1'})
  if title is not None:
    name = title.a.next_element.strip()
  else:
    name = 'Not found'

  blurb = school.find('p', {'class':'card-text small text-muted'})
  if blurb is not None:
    location = blurb.next_element.strip()
  else:
    location = 'Not found'

  # students and tuition are read from the 4th and 5th <li> of the same list;
  # a missing or too-short list falls back to 'Not found' instead of raising
  # an IndexError
  stats = school.find('ul', {'class':'list-unstyled small mb-0'})
  items = stats.find_all('li') if stats is not None else []
  students = items[3].strong.string.strip() if len(items) > 3 else 'Not found'
  tuition = items[4].strong.string.strip() if len(items) > 4 else 'Not found'

  colleges.append({'Name':name,'Location':location,'Students':students,'Tuition':tuition})

# Rows that are 'Not found' in every column come from 'card-body' divs that
# are not school cards; they are kept here on purpose.
CollegeData = pd.DataFrame(colleges)
CollegeData


  

Unnamed: 0,Name,Location,Students,Tuition
0,Not found,Not found,Not found,Not found
1,Not found,Not found,Not found,Not found
2,Not found,Not found,Not found,Not found
3,Not found,Not found,Not found,Not found
4,Vanderbilt University,"Nashville, TN • Private 4 Year",12824,"$49,816"
5,Rhodes College,"Memphis, TN • Private 4 Year",2036,"$47,890"
6,Union University,"Jackson, TN • Private 4 Year",3247,"$32,610"
7,Lipscomb University,"Nashville, TN • Private 4 Year",4620,"$32,144"
8,The University of Tennessee,"Knoxville, TN • Public 4 Year",28894,"$13,006"
9,Aquinas College,"Nashville, TN • Private 4 Year",342,"$21,950"


### Let's scrape the first 5 pages of colleges for the state of Tennessee

In [7]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# Create a list of colleges (one dict per 'card-body' div, across all pages)
colleges = []
# Values for the 'start' query parameter, one per results page.
# NOTE(review): the first offset is '1' while the rest are multiples of 10, so
# consecutive pages may overlap by one school -- confirm against the site.
links = ['1','10','20','30','40']

# Everything in the search URL except the value of the trailing 'start' parameter
search_url = 'https://www.collegesimply.com/colleges/search?sort=&place=&years=4&years=2&type=public&type=private&type=for-profit&gpa=&sat=&act=&admit=comp&field=&major=&radius=300&zip=&state=tennessee&size=&tuition-fees=&net-price=&start='

# **** How do we write an appropriate for loop to move through the pages? ****
for link in links :

  # Scrape (get) the website for which you have interest
  # **** How do we alter the URL to move through multiple pages in the following line? ****
  # The timeout stops one stalled request from hanging the whole loop.
  response = requests.get(search_url+link, timeout=30)
  if not response.ok:
    # Report the failed page and move on, so one bad page does not discard
    # the pages that were already collected
    print('Skipping start=' + link + ' (HTTP status ' + str(response.status_code) + ')')
    continue
  html = response.content
  soup = BeautifulSoup(html, 'lxml')

  # The following lines use beautifulsoup to find the school name, location, number of students, and tuition
  # school is the position of primary interest
  # Each element is looked up once and reused; any field whose element is
  # missing is recorded as 'Not found'.
  for school in soup.find_all('div',{'class':'card-body'}):
    title = school.find('h4',{'class':'card-title mb-1'})
    if title is not None:
      name = title.a.next_element.strip()
    else:
      name = 'Not found'

    blurb = school.find('p', {'class':'card-text small text-muted'})
    if blurb is not None:
      location = blurb.next_element.strip()
    else:
      location = 'Not found'

    # students and tuition are read from the 4th and 5th <li> of the same
    # list; a missing or too-short list falls back to 'Not found'
    stats = school.find('ul', {'class':'list-unstyled small mb-0'})
    items = stats.find_all('li') if stats is not None else []
    students = items[3].strong.string.strip() if len(items) > 3 else 'Not found'
    tuition = items[4].strong.string.strip() if len(items) > 4 else 'Not found'

    colleges.append({'Name':name,'Location':location,'Students':students,'Tuition':tuition})

# Naming the columns keeps the frame well-formed even if nothing was scraped
CollegeData = pd.DataFrame(colleges, columns=['Name','Location','Students','Tuition'])

# Rows that are 'Not found' in every column are not schools, so remove them
# explicitly. Then keep ONE copy of each remaining row: drop_duplicates with
# keep=False would delete every copy, so a school listed on two overlapping
# pages would disappear completely.
found_something = (CollegeData != 'Not found').any(axis=1)
CollegeData = CollegeData[found_something].drop_duplicates()

# Reset the index to 0,1,2, ... ,'n-1' (where 'n' is the number of rows)
CollegeData = CollegeData.reset_index(drop=True)

CollegeData

Unnamed: 0,Name,Location,Students,Tuition
0,Vanderbilt University,"Nashville, TN • Private 4 Year",12824,"$49,816"
1,Rhodes College,"Memphis, TN • Private 4 Year",2036,"$47,890"
2,Union University,"Jackson, TN • Private 4 Year",3247,"$32,610"
3,Lipscomb University,"Nashville, TN • Private 4 Year",4620,"$32,144"
4,The University of Tennessee,"Knoxville, TN • Public 4 Year",28894,"$13,006"
5,Aquinas College,"Nashville, TN • Private 4 Year",342,"$21,950"
6,Bryan College,"Dayton, TN • Private 4 Year",1363,"$26,800"
7,Belmont University,"Nashville, TN • Private 4 Year",8260,"$34,310"
8,Milligan College,"Milligan College, Tennessee • Private 4 Year",1208,"$28,730"
9,Trevecca Nazarene University,"Nashville, TN • Private 4 Year",3927,"$25,598"


### Let's choose a number of states for which we will scrape the first 5 pages of their colleges

Perhaps alabama, alaska, arizona, arkansas, california, colorado, tennessee, texas, utah, vermont, and wyoming

In [10]:
# imports
import pandas as pd
import requests 
from bs4 import BeautifulSoup

# Create a list of states (lowercase, as they are placed in the URL's 'state' parameter)
states = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'tennessee', 'texas', 'utah', 'vermont', 'wyoming']
# Values for the 'start' query parameter, one per results page.
# NOTE(review): the first offset is '1' while the rest are multiples of 10, so
# consecutive pages may overlap by one school -- confirm against the site.
links = ['1','10','20','30','40']
# Create the list of colleges (one dict per 'card-body' div, across all states and pages)
colleges = []

# **** How do we determine our for loop(s) to get 5 pages for each of our states of interest? ****
# The outer loop picks the state and the inner loop walks that state's pages.
for state in states:
  for link in links:
    # Scrape (get) the website for which you have interest
    # **** How do we alter the URL to move through multiple pages in the following line? ****
    # The timeout stops one stalled request from hanging the whole run.
    response = requests.get('https://www.collegesimply.com/colleges/search?sort=&place=&years=4&years=2&type=public&type=private&type=for-profit&gpa=&sat=&act=&admit=comp&field=&major=&radius=300&zip=&state='+state+'&size=&tuition-fees=&net-price=&start='+link, timeout=30)
    if not response.ok:
      # Report the failed page and move on, so one bad page does not discard
      # everything collected so far
      print('Skipping ' + state + ' start=' + link + ' (HTTP status ' + str(response.status_code) + ')')
      continue
    html = response.content
    soup = BeautifulSoup(html, 'lxml')

    # The following lines use beautifulsoup to find the school name, location, number of students, and tuition
    # school is the position of primary interest
    # Each element is looked up once and reused; any field whose element is
    # missing is recorded as 'Not found'.
    for school in soup.find_all('div',{'class':'card-body'}):
      title = school.find('h4',{'class':'card-title mb-1'})
      if title is not None:
        name = title.a.next_element.strip()
      else:
        name = 'Not found'

      blurb = school.find('p', {'class':'card-text small text-muted'})
      if blurb is not None:
        location = blurb.next_element.strip()
      else:
        location = 'Not found'

      # students and tuition are read from the 4th and 5th <li> of the same
      # list; a missing or too-short list falls back to 'Not found'
      stats = school.find('ul', {'class':'list-unstyled small mb-0'})
      items = stats.find_all('li') if stats is not None else []
      students = items[3].strong.string.strip() if len(items) > 3 else 'Not found'
      tuition = items[4].strong.string.strip() if len(items) > 4 else 'Not found'

      colleges.append({'Name':name,'Location':location,'Students':students,'Tuition':tuition})

# Naming the columns keeps the frame well-formed even if nothing was scraped
CollegeData = pd.DataFrame(colleges, columns=['Name','Location','Students','Tuition'])

# Rows that are 'Not found' in every column are not schools. drop_duplicates
# alone would still leave one of them behind, so remove them explicitly first.
found_something = (CollegeData != 'Not found').any(axis=1)
# keep='last' keeps the final occurrence of any school that was scraped twice
CollegeData = CollegeData[found_something].drop_duplicates(keep='last')
# Reset the index to 0,1,2, ... ,'n-1' (where 'n' is the number of rows)
CollegeData = CollegeData.reset_index(drop=True)
CollegeData

Unnamed: 0,Name,Location,Students,Tuition
0,The University of Alabama,"Tuscaloosa, AL • Public 4 Year",38390,"$10,780"
1,Birmingham Southern College,"Birmingham, AL • Private 4 Year",1268,"$17,650"
2,Auburn University,"Auburn University, AL • Public 4 Year",30440,"$11,276"
3,Samford University,"Birmingham, AL • Private 4 Year",5619,"$31,650"
4,University of Alabama at Huntsville,"Huntsville, AL • Public 4 Year",9736,"$10,714"
...,...,...,...,...
404,Sheridan College,"Sheridan, WY • Public 2 Year",4168,"$3,396"
405,Western Wyoming Community College,"Rock Springs, WY • Public 2 Year",3183,"$2,953"
406,University of Phoenix Cheyenne Campus,"Cheyenne, Wyoming • Private 4 Year",20,"$10,240"
407,College America Cheyenne,"Cheyenne, WY • Private 4 Year",45,"$16,968"
