## Use BeautifulSoup to scrape a list of PA state park/forest from Wiki

In [3]:
import requests
import urllib.request
import pandas as pd
import csv
from bs4 import BeautifulSoup


In [None]:
# get the url
url = 'https://en.wikipedia.org/wiki/List_of_Pennsylvania_state_parks'
response = requests.get(url)
print(response.status_code) # if 200, okay to scrape

In [4]:
# use Beautiful Soup to get structured data from the page
soup = BeautifulSoup(response.text,"html.parser")
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of Pennsylvania state parks - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"658d3956-3a06-4bd4-93d3-069c7aebe77f","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Pennsylvania_state_parks","wgTitle":"List of Pennsylvania state parks","wgCurRevisionId":964550489,"wgRevisionId":964550489,"wgArticleId":1125082,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: BOT: original-url status unknown","Articles with short description","Short description is different 

In [25]:
# find the table in need: first sortable table 
table = soup.findAll('table',{"class":"wikitable sortable"})[0]
# get table rows
tr = table.findAll(['tr']) # a total of 121 parks [& header]
tr[0:2]

[<tr>
 <th scope="col" style="width:*;">Park name
 </th>
 <th scope="col" style="width:*;">County or counties
 </th>
 <th scope="col" style="width:*;">Area in acres (ha)
 </th>
 <th scope="col" style="width:*;">Date <br/>founded
 </th>
 <th class="unsortable" scope="col" style="width:*;">Stream(s) and / or lake(s)
 </th>
 <th class="unsortable" scope="col" style="width:*;">Remarks
 </th>
 <th class="unsortable" scope="col" style="width:*;">Image
 </th></tr>,
 <tr>
 <th scope="row"><a href="/wiki/Allegheny_Islands_State_Park" title="Allegheny Islands State Park">Allegheny Islands State Park</a>
 </th>
 <td><a href="/wiki/Allegheny_County,_Pennsylvania" title="Allegheny County, Pennsylvania">Allegheny County</a></td>
 <td><span data-sort-value="7001430000000000000♠">43</span> acres <br/>(17 ha)</td>
 <td>1980</td>
 <td><a href="/wiki/Allegheny_River" title="Allegheny River">Allegheny River</a></td>
 <td>Three alluvial islands near <a href="/wiki/Pittsburgh" title="Pittsburgh">Pittsburgh<

In [19]:
# write info from the table to a csv file
csvFile = open("state_park_PA.csv",'wt',newline='', encoding='utf-8')
writer = csv.writer(csvFile)  
try:   
    for cell in tr:
        th = cell.find_all('th')
        th_data = [col.text.strip('\n') for col in th]
        td = cell.find_all('td')
        row = [i.text.replace('\n','') for i in td]
        writer.writerow(th_data+row)      
        
finally:   
    csvFile.close()

In [34]:
df_park = pd.read_csv('state_park_PA.csv')
df_park

Unnamed: 0,Park name,County or counties,Area in acres (ha),Date founded,Stream(s) and / or lake(s),Remarks,Image
0,Allegheny Islands State Park,Allegheny County,43 acres (17 ha),1980,Allegheny River,Three alluvial islands near Pittsburgh with no...,
1,Archbald Pothole State Park,Lackawanna County,150 acres (61 ha),1964,,"One of world's largest potholes, 38 ft (12 m) ...",
2,Bald Eagle State Park,Centre County,"5,900 acres (2,388 ha)",1971,"Bald Eagle Creek, Foster Joseph Sayers Reservoir","1,730 acre (700 ha) U.S. Army Corps of Enginee...",
3,Beltzville State Park,Carbon County,"2,973 acres (1,203 ha)",1972,"Pohopoco Creek, Beltzville Lake",U.S. Army Corps of Engineers lake is 949 acres...,
4,Bendigo State Park,Elk County,100 acres (40 ha),1959,East Branch Clarion River,"Only 20 acres (8.1 ha) developed, name a corru...",
...,...,...,...,...,...,...,...
116,Washington Crossing Historic Park,Bucks County,500 acres (202 ha),2016,Delaware River,Site of George Washington's crossing of the De...,
117,Whipple Dam State Park,Huntingdon County,256 acres (104 ha),1928,Whipple Lake,"There was a camp for Boy Scouts, Girl Scouts, ...",
118,White Clay Creek Preserve,Chester County,"1,255 acres (508 ha)",1984,White Clay Creek,"Park was donated by DuPont to preserve ""divers...",
119,Worlds End State Park,Sullivan County,780 acres (316 ha),1932,Loyalsock Creek,"A ""Must See Park"" known for trout fishing, whi...",


## Repeat for PA state forest

In [30]:
url = 'https://en.wikipedia.org/wiki/List_of_Pennsylvania_state_forests'
response = requests.get(url)
print(response.status_code)

200


In [31]:
soup = BeautifulSoup(response.text,"html.parser")
table = soup.findAll('table',{"class":"wikitable sortable"})[0]
table

<table class="wikitable sortable" style="width:95%">
<tbody><tr>
<th width="*"><b>State Forest Name</b>  
</th>
<th width="*"><b>County</b>  
</th>
<th width="*"><b>Area</b><br/>acres (ha)  
</th>
<th width="*"><b>Founded</b>  
</th>
<th width="*"><b>Remarks</b>  
</th></tr>
<tr>
<td><a href="/wiki/Bald_Eagle_State_Forest" title="Bald Eagle State Forest">Bald Eagle</a></td>
<td><a href="/wiki/Centre_County,_Pennsylvania" title="Centre County, Pennsylvania">Centre</a></td>
<td>
<p>193,424 acres (78,280 ha) 
</p>
</td></tr>
<tr>
<td><a href="/wiki/Buchanan_State_Forest" title="Buchanan State Forest">Buchanan</a></td>
<td><a href="/wiki/Fulton_County,_Pennsylvania" title="Fulton County, Pennsylvania">Fulton</a></td>
<td>75,000 acres (30,350 ha)</td>
<td></td>
<td>
</td></tr>
<tr>
<td><a href="/wiki/Clear_Creek_State_Forest" title="Clear Creek State Forest">Clear Creek</a></td>
<td><a href="/wiki/Jefferson_County,_Pennsylvania" title="Jefferson County, Pennsylvania">Jefferson</a></td>
<td>

In [32]:
tr = table.findAll(['tr']) # a total of 20 forests [& header]
csvFile = open("state_forest_PA.csv",'wt',newline='', encoding='utf-8')
writer = csv.writer(csvFile)  
try:   
    for cell in tr:
        th = cell.find_all('th')
        th_data = [col.text.strip('\n') for col in th]
        td = cell.find_all('td')
        row = [i.text.replace('\n','') for i in td]
        writer.writerow(th_data+row)      
        
finally:   
    csvFile.close()

In [33]:
df_forest = pd.read_csv('state_forest_PA.csv')
df_forest

Unnamed: 0,State Forest Name,County,Areaacres (ha),Founded,Remarks
0,Bald Eagle,Centre,"193,424 acres (78,280 ha)",,
1,Buchanan,Fulton,"75,000 acres (30,350 ha)",,
2,Clear Creek,Jefferson,"13,266 acres (5,369 ha)","1919, as Kittanning State Forest",Name changed 2007
3,Cornplanter,Forest,"1,256 acres (508 ha)",,
4,Delaware,Pike and Monroe,"83,519 acres (33,799 ha)",,
5,Elk,Elk,"200,000 acres (80,940 ha)",1900,
6,Forbes,Westmoreland,"over 50,000 acres (20,230 ha)",,
7,Gallitzin,Somerset,"15,336 acres (6,206 ha)",,
8,Loyalsock,Sullivan,"114,552 acres (46,360 ha)","July 1, 2005",Replaced Wyoming State Forest
9,Michaux,Adams,"over 85,000 acres (34,400 ha)",,
