# Scraping the contents from NBER website

In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import random

In [2]:
url = 'https://www.nber.org/papers/w00272'
# Establishing the connection to the web page:
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print('Status Code: ',response.status_code)

# Pull the HTML string out of requests and convert it to a Python string.
html = response.text

# The first 700 characters of the content.
print("\nFirst part of HTML document fetched as string:\n")
print(html[:700])

Status Code:  200

First part of HTML document fetched as string:

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"  "http://www.w3.org/TR/html4/loose.dtd">
<html prefix="og: http://ogp.me/ns#"><head><title>The Social Security Earnings Test, Labor Supply Distortions, and Foregone Payroll Tax Revenues</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta name="citation_title" content="The Social Security Earnings Test, Labor Supply Distortions, and Foregone Payroll Tax Revenues">
<meta name="keyword" content="Anthony J. Pellechio">
<meta name="citation_author" content="Pellechio, Anthony J">
<meta name="DC.Date" content="1978-08-01">
<meta name="citation_date" content="1978-08-01">
<meta name="citation_publication_d


In [3]:
soup = BeautifulSoup(html, 'lxml')

In [4]:
# Singular element:
soup.html.title

<title>The Social Security Earnings Test, Labor Supply Distortions, and Foregone Payroll Tax Revenues</title>

In [5]:
print(soup.html.title.text)

The Social Security Earnings Test, Labor Supply Distortions, and Foregone Payroll Tax Revenues


In [6]:
# items_class = []
for item in soup.find('p', class_='bibtop').findAll("b"):
#     items_class.append(item)
    print(item)
    #for x in item.find("href"):
        #print(x)

<b>NBER Working Paper No. 272</b>
<b>Issued in August 1978</b>
<b>NBER Program(s):<a href="https://www.nber.org/papersbyprog/.html"></a>, <a href="https://www.nber.org/papersbyprog/PE.html">Public Economics</a></b>


In [7]:
find_b = soup.find('p', class_='bibtop').findAll("b")

In [8]:
for c in find_b[2].find_all("a"):
    print(c.get_text())


Public Economics


In [9]:
df_final_title = pd.read_csv("../data/merged_wo_prog.csv")
df_final_title.head()

Unnamed: 0,paper,year,month,title,author
0,1,1973,June,"Education, Information, and Efficiency",['Finis Welch']
1,2,1973,June,Hospital Utilization: An Analysis of SMSA Diff...,['Barry R Chiswick']
2,3,1973,June,Error Components Regression Models and Their A...,['Swarnjit S Arora']
3,4,1973,July,Human Capital Life Cycle of Earnings Models: A...,['Lee A Lillard']
4,5,1973,July,A Life Cycle Family Model,['James P Smith']


In [10]:
def process_code(x):
    if x >=10000:
        return 'w' + str(x)
    elif 1000 <= x < 10000:
        return 'w0' + str(x)
    elif 100 <= x < 1000:
        return 'w00' + str(x)
    elif 10 <= x <100:
        return 'w000' + str(x)
    else: 
        return 'w0000' + str(x)

In [13]:
df_final_title['code'] = df_final_title['paper'].map(process_code)

In [14]:
df_final_title.head(20)

Unnamed: 0,paper,year,month,title,author,code
0,1,1973,June,"Education, Information, and Efficiency",['Finis Welch'],w00001
1,2,1973,June,Hospital Utilization: An Analysis of SMSA Diff...,['Barry R Chiswick'],w00002
2,3,1973,June,Error Components Regression Models and Their A...,['Swarnjit S Arora'],w00003
3,4,1973,July,Human Capital Life Cycle of Earnings Models: A...,['Lee A Lillard'],w00004
4,5,1973,July,A Life Cycle Family Model,['James P Smith'],w00005
5,6,1973,July,A Review of Cyclical Indicators for the United...,['Victor Zarnowitz'],w00006
6,7,1973,August,The Definition and Impact of College Quality,['Lewis C Solmon'],w00007
7,8,1973,September,Multinational Firms and the Factor Intensity o...,"['Merle Yahr Weiss', 'Robert E Lipsey']",w00008
8,9,1973,September,From Age-Earnings Profiles to the Distribution...,['Lee A Lillard'],w00009
9,10,1973,September,Monte Carlo for Robust Regression: The Swindle...,['Paul W Holland'],w00010


In [15]:
df_final_title['code'][272:]

272      w00274
273      w00275
274      w00276
275      w00277
276      w00278
          ...  
26581    w26596
26582    w26597
26583    w26598
26584    w26599
26585    w26600
Name: code, Length: 26314, dtype: object

In [137]:
test_list = list(df_final_title['code'][18000:21000])
base_url = 'https://www.nber.org/papers/'

In [138]:
type(test_list)

list

In [139]:
prog_list=[]
for x in test_list:
    url = base_url + x
    print(url)
    response = requests.get(url, headers={'User-agent': 'Preety'})
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    find_b = soup.find('p', class_='bibtop').findAll("b")
#     print(find_b)
    prog_per_doc = []
    if len(find_b) <= 2:
        prog_per_doc.append("")
    else:
        for c in find_b[2].find_all("a"):
            #print(c.get_text())        
            prog_per_doc.append(c.get_text())
    prog_list.append(prog_per_doc)
# generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,3)
#     print(sleep_duration)
    time.sleep(sleep_duration)   

https://www.nber.org/papers/w18014
https://www.nber.org/papers/w18015
https://www.nber.org/papers/w18016
https://www.nber.org/papers/w18017
https://www.nber.org/papers/w18018
https://www.nber.org/papers/w18019
https://www.nber.org/papers/w18020
https://www.nber.org/papers/w18021
https://www.nber.org/papers/w18022
https://www.nber.org/papers/w18023
https://www.nber.org/papers/w18024
https://www.nber.org/papers/w18025
https://www.nber.org/papers/w18026
https://www.nber.org/papers/w18027
https://www.nber.org/papers/w18028
https://www.nber.org/papers/w18029
https://www.nber.org/papers/w18030
https://www.nber.org/papers/w18031
https://www.nber.org/papers/w18032
https://www.nber.org/papers/w18033
https://www.nber.org/papers/w18034
https://www.nber.org/papers/w18035
https://www.nber.org/papers/w18036
https://www.nber.org/papers/w18037
https://www.nber.org/papers/w18038
https://www.nber.org/papers/w18039
https://www.nber.org/papers/w18040
https://www.nber.org/papers/w18041
https://www.nber.org

https://www.nber.org/papers/w18249
https://www.nber.org/papers/w18250
https://www.nber.org/papers/w18251
https://www.nber.org/papers/w18252
https://www.nber.org/papers/w18253
https://www.nber.org/papers/w18254
https://www.nber.org/papers/w18255
https://www.nber.org/papers/w18256
https://www.nber.org/papers/w18257
https://www.nber.org/papers/w18258
https://www.nber.org/papers/w18259
https://www.nber.org/papers/w18260
https://www.nber.org/papers/w18261
https://www.nber.org/papers/w18262
https://www.nber.org/papers/w18263
https://www.nber.org/papers/w18264
https://www.nber.org/papers/w18265
https://www.nber.org/papers/w18266
https://www.nber.org/papers/w18267
https://www.nber.org/papers/w18268
https://www.nber.org/papers/w18269
https://www.nber.org/papers/w18270
https://www.nber.org/papers/w18271
https://www.nber.org/papers/w18272
https://www.nber.org/papers/w18273
https://www.nber.org/papers/w18274
https://www.nber.org/papers/w18275
https://www.nber.org/papers/w18276
https://www.nber.org

https://www.nber.org/papers/w18484
https://www.nber.org/papers/w18485
https://www.nber.org/papers/w18486
https://www.nber.org/papers/w18487
https://www.nber.org/papers/w18488
https://www.nber.org/papers/w18489
https://www.nber.org/papers/w18490
https://www.nber.org/papers/w18491
https://www.nber.org/papers/w18492
https://www.nber.org/papers/w18493
https://www.nber.org/papers/w18494
https://www.nber.org/papers/w18495
https://www.nber.org/papers/w18496
https://www.nber.org/papers/w18497
https://www.nber.org/papers/w18498
https://www.nber.org/papers/w18499
https://www.nber.org/papers/w18500
https://www.nber.org/papers/w18501
https://www.nber.org/papers/w18502
https://www.nber.org/papers/w18503
https://www.nber.org/papers/w18504
https://www.nber.org/papers/w18505
https://www.nber.org/papers/w18506
https://www.nber.org/papers/w18507
https://www.nber.org/papers/w18508
https://www.nber.org/papers/w18509
https://www.nber.org/papers/w18510
https://www.nber.org/papers/w18511
https://www.nber.org

https://www.nber.org/papers/w18719
https://www.nber.org/papers/w18720
https://www.nber.org/papers/w18721
https://www.nber.org/papers/w18722
https://www.nber.org/papers/w18723
https://www.nber.org/papers/w18724
https://www.nber.org/papers/w18725
https://www.nber.org/papers/w18726
https://www.nber.org/papers/w18727
https://www.nber.org/papers/w18728
https://www.nber.org/papers/w18729
https://www.nber.org/papers/w18730
https://www.nber.org/papers/w18731
https://www.nber.org/papers/w18732
https://www.nber.org/papers/w18733
https://www.nber.org/papers/w18734
https://www.nber.org/papers/w18735
https://www.nber.org/papers/w18736
https://www.nber.org/papers/w18737
https://www.nber.org/papers/w18738
https://www.nber.org/papers/w18739
https://www.nber.org/papers/w18740
https://www.nber.org/papers/w18741
https://www.nber.org/papers/w18742
https://www.nber.org/papers/w18743
https://www.nber.org/papers/w18744
https://www.nber.org/papers/w18745
https://www.nber.org/papers/w18746
https://www.nber.org

https://www.nber.org/papers/w18954
https://www.nber.org/papers/w18955
https://www.nber.org/papers/w18956
https://www.nber.org/papers/w18957
https://www.nber.org/papers/w18958
https://www.nber.org/papers/w18959
https://www.nber.org/papers/w18960
https://www.nber.org/papers/w18961
https://www.nber.org/papers/w18962
https://www.nber.org/papers/w18963
https://www.nber.org/papers/w18964
https://www.nber.org/papers/w18965
https://www.nber.org/papers/w18966
https://www.nber.org/papers/w18967
https://www.nber.org/papers/w18968
https://www.nber.org/papers/w18969
https://www.nber.org/papers/w18970
https://www.nber.org/papers/w18971
https://www.nber.org/papers/w18972
https://www.nber.org/papers/w18973
https://www.nber.org/papers/w18974
https://www.nber.org/papers/w18975
https://www.nber.org/papers/w18976
https://www.nber.org/papers/w18977
https://www.nber.org/papers/w18978
https://www.nber.org/papers/w18979
https://www.nber.org/papers/w18980
https://www.nber.org/papers/w18981
https://www.nber.org

https://www.nber.org/papers/w19189
https://www.nber.org/papers/w19190
https://www.nber.org/papers/w19191
https://www.nber.org/papers/w19192
https://www.nber.org/papers/w19193
https://www.nber.org/papers/w19194
https://www.nber.org/papers/w19195
https://www.nber.org/papers/w19196
https://www.nber.org/papers/w19197
https://www.nber.org/papers/w19198
https://www.nber.org/papers/w19199
https://www.nber.org/papers/w19200
https://www.nber.org/papers/w19201
https://www.nber.org/papers/w19202
https://www.nber.org/papers/w19203
https://www.nber.org/papers/w19204
https://www.nber.org/papers/w19205
https://www.nber.org/papers/w19206
https://www.nber.org/papers/w19207
https://www.nber.org/papers/w19208
https://www.nber.org/papers/w19209
https://www.nber.org/papers/w19210
https://www.nber.org/papers/w19211
https://www.nber.org/papers/w19212
https://www.nber.org/papers/w19213
https://www.nber.org/papers/w19214
https://www.nber.org/papers/w19215
https://www.nber.org/papers/w19216
https://www.nber.org

https://www.nber.org/papers/w19424
https://www.nber.org/papers/w19425
https://www.nber.org/papers/w19426
https://www.nber.org/papers/w19427
https://www.nber.org/papers/w19428
https://www.nber.org/papers/w19429
https://www.nber.org/papers/w19430
https://www.nber.org/papers/w19431
https://www.nber.org/papers/w19432
https://www.nber.org/papers/w19433
https://www.nber.org/papers/w19434
https://www.nber.org/papers/w19435
https://www.nber.org/papers/w19436
https://www.nber.org/papers/w19437
https://www.nber.org/papers/w19438
https://www.nber.org/papers/w19439
https://www.nber.org/papers/w19440
https://www.nber.org/papers/w19441
https://www.nber.org/papers/w19442
https://www.nber.org/papers/w19443
https://www.nber.org/papers/w19444
https://www.nber.org/papers/w19445
https://www.nber.org/papers/w19446
https://www.nber.org/papers/w19447
https://www.nber.org/papers/w19448
https://www.nber.org/papers/w19449
https://www.nber.org/papers/w19450
https://www.nber.org/papers/w19451
https://www.nber.org

https://www.nber.org/papers/w19659
https://www.nber.org/papers/w19660
https://www.nber.org/papers/w19661
https://www.nber.org/papers/w19662
https://www.nber.org/papers/w19663
https://www.nber.org/papers/w19664
https://www.nber.org/papers/w19665
https://www.nber.org/papers/w19666
https://www.nber.org/papers/w19667
https://www.nber.org/papers/w19668
https://www.nber.org/papers/w19669
https://www.nber.org/papers/w19670
https://www.nber.org/papers/w19671
https://www.nber.org/papers/w19672
https://www.nber.org/papers/w19673
https://www.nber.org/papers/w19674
https://www.nber.org/papers/w19675
https://www.nber.org/papers/w19676
https://www.nber.org/papers/w19677
https://www.nber.org/papers/w19678
https://www.nber.org/papers/w19679
https://www.nber.org/papers/w19680
https://www.nber.org/papers/w19681
https://www.nber.org/papers/w19682
https://www.nber.org/papers/w19683
https://www.nber.org/papers/w19684
https://www.nber.org/papers/w19685
https://www.nber.org/papers/w19686
https://www.nber.org

https://www.nber.org/papers/w19894
https://www.nber.org/papers/w19895
https://www.nber.org/papers/w19896
https://www.nber.org/papers/w19897
https://www.nber.org/papers/w19898
https://www.nber.org/papers/w19899
https://www.nber.org/papers/w19900
https://www.nber.org/papers/w19901
https://www.nber.org/papers/w19902
https://www.nber.org/papers/w19903
https://www.nber.org/papers/w19904
https://www.nber.org/papers/w19905
https://www.nber.org/papers/w19906
https://www.nber.org/papers/w19907
https://www.nber.org/papers/w19908
https://www.nber.org/papers/w19909
https://www.nber.org/papers/w19910
https://www.nber.org/papers/w19911
https://www.nber.org/papers/w19912
https://www.nber.org/papers/w19913
https://www.nber.org/papers/w19914
https://www.nber.org/papers/w19915
https://www.nber.org/papers/w19916
https://www.nber.org/papers/w19917
https://www.nber.org/papers/w19918
https://www.nber.org/papers/w19919
https://www.nber.org/papers/w19920
https://www.nber.org/papers/w19921
https://www.nber.org

https://www.nber.org/papers/w20129
https://www.nber.org/papers/w20130
https://www.nber.org/papers/w20131
https://www.nber.org/papers/w20132
https://www.nber.org/papers/w20133
https://www.nber.org/papers/w20134
https://www.nber.org/papers/w20135
https://www.nber.org/papers/w20136
https://www.nber.org/papers/w20137
https://www.nber.org/papers/w20138
https://www.nber.org/papers/w20139
https://www.nber.org/papers/w20140
https://www.nber.org/papers/w20141
https://www.nber.org/papers/w20142
https://www.nber.org/papers/w20143
https://www.nber.org/papers/w20144
https://www.nber.org/papers/w20145
https://www.nber.org/papers/w20146
https://www.nber.org/papers/w20147
https://www.nber.org/papers/w20148
https://www.nber.org/papers/w20149
https://www.nber.org/papers/w20150
https://www.nber.org/papers/w20151
https://www.nber.org/papers/w20152
https://www.nber.org/papers/w20153
https://www.nber.org/papers/w20154
https://www.nber.org/papers/w20155
https://www.nber.org/papers/w20156
https://www.nber.org

https://www.nber.org/papers/w20364
https://www.nber.org/papers/w20365
https://www.nber.org/papers/w20366
https://www.nber.org/papers/w20367
https://www.nber.org/papers/w20368
https://www.nber.org/papers/w20369
https://www.nber.org/papers/w20370
https://www.nber.org/papers/w20371
https://www.nber.org/papers/w20372
https://www.nber.org/papers/w20373
https://www.nber.org/papers/w20374
https://www.nber.org/papers/w20375
https://www.nber.org/papers/w20376
https://www.nber.org/papers/w20377
https://www.nber.org/papers/w20378
https://www.nber.org/papers/w20379
https://www.nber.org/papers/w20380
https://www.nber.org/papers/w20381
https://www.nber.org/papers/w20382
https://www.nber.org/papers/w20383
https://www.nber.org/papers/w20384
https://www.nber.org/papers/w20385
https://www.nber.org/papers/w20386
https://www.nber.org/papers/w20387
https://www.nber.org/papers/w20388
https://www.nber.org/papers/w20389
https://www.nber.org/papers/w20390
https://www.nber.org/papers/w20391
https://www.nber.org

https://www.nber.org/papers/w20599
https://www.nber.org/papers/w20600
https://www.nber.org/papers/w20601
https://www.nber.org/papers/w20602
https://www.nber.org/papers/w20603
https://www.nber.org/papers/w20604
https://www.nber.org/papers/w20605
https://www.nber.org/papers/w20606
https://www.nber.org/papers/w20607
https://www.nber.org/papers/w20608
https://www.nber.org/papers/w20609
https://www.nber.org/papers/w20610
https://www.nber.org/papers/w20611
https://www.nber.org/papers/w20612
https://www.nber.org/papers/w20613
https://www.nber.org/papers/w20614
https://www.nber.org/papers/w20615
https://www.nber.org/papers/w20616
https://www.nber.org/papers/w20617
https://www.nber.org/papers/w20618
https://www.nber.org/papers/w20619
https://www.nber.org/papers/w20620
https://www.nber.org/papers/w20621
https://www.nber.org/papers/w20622
https://www.nber.org/papers/w20623
https://www.nber.org/papers/w20624
https://www.nber.org/papers/w20625
https://www.nber.org/papers/w20626
https://www.nber.org

https://www.nber.org/papers/w20834
https://www.nber.org/papers/w20835
https://www.nber.org/papers/w20836
https://www.nber.org/papers/w20837
https://www.nber.org/papers/w20838
https://www.nber.org/papers/w20839
https://www.nber.org/papers/w20840
https://www.nber.org/papers/w20841
https://www.nber.org/papers/w20842
https://www.nber.org/papers/w20843
https://www.nber.org/papers/w20844
https://www.nber.org/papers/w20845
https://www.nber.org/papers/w20846
https://www.nber.org/papers/w20847
https://www.nber.org/papers/w20848
https://www.nber.org/papers/w20849
https://www.nber.org/papers/w20850
https://www.nber.org/papers/w20851
https://www.nber.org/papers/w20852
https://www.nber.org/papers/w20853
https://www.nber.org/papers/w20854
https://www.nber.org/papers/w20855
https://www.nber.org/papers/w20856
https://www.nber.org/papers/w20857
https://www.nber.org/papers/w20858
https://www.nber.org/papers/w20859
https://www.nber.org/papers/w20860
https://www.nber.org/papers/w20861
https://www.nber.org

In [140]:
prog_list

[['Economics of Aging'],
 ['International Finance and Macroeconomics', 'Monetary Economics'],
 ['Children', 'Economics of Education', 'Health Economics'],
 ['Industrial Organization', 'Productivity, Innovation, and Entrepreneurship'],
 ['Industrial Organization',
  'International Trade and Investment',
  'Law and Economics',
  'Public Economics'],
 ['Productivity, Innovation, and Entrepreneurship'],
 ['Industrial Organization', 'Productivity, Innovation, and Entrepreneurship'],
 ['Development of the American Economy', 'Monetary Economics'],
 ['Economic Fluctuations and Growth'],
 ['Health Care'],
 ['Asset Pricing', 'Corporate Finance'],
 ['Health Care', 'Productivity, Innovation, and Entrepreneurship'],
 ['Health Economics'],
 ['Corporate Finance',
  'Development of the American Economy',
  'Monetary Economics'],
 ['International Finance and Macroeconomics'],
 ['Environment and Energy Economics', 'Public Economics'],
 ['International Finance and Macroeconomics'],
 ['Economic Fluctuatio

In [141]:
len(prog_list)

3000

In [142]:
df_code = pd.DataFrame(df_final_title['code'][18000:21000], columns=['code'])

In [143]:
df_code

Unnamed: 0,code
18000,w18014
18001,w18015
18002,w18016
18003,w18017
18004,w18018
...,...
20995,w21009
20996,w21010
20997,w21011
20998,w21012


In [144]:
df_code['prog'] =  prog_list

In [145]:
df_code

Unnamed: 0,code,prog
18000,w18014,[Economics of Aging]
18001,w18015,"[International Finance and Macroeconomics, Mon..."
18002,w18016,"[Children, Economics of Education, Health Econ..."
18003,w18017,"[Industrial Organization, Productivity, Innova..."
18004,w18018,"[Industrial Organization, International Trade ..."
...,...,...
20995,w21009,"[Public Economics, Political Economy]"
20996,w21010,"[Economic Fluctuations and Growth, Internation..."
20997,w21011,"[Development of the American Economy, , Produc..."
20998,w21012,"[Development Economics, Health Economics, Indu..."


In [25]:
# df_final_title['prog'] = "" #do this only the first time

In [146]:
df_final_title.loc[18000:21000, 'prog'] = df_code['prog']

In [147]:
df_final_title[18000:21000].to_csv('18to21k.csv')