In [1]:
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import re
import requests
import subprocess

In [2]:
def text_from_pdf(pdf_path, temp_path):
    """Return the text of a PDF, extracted with the external ``pdftotext`` command.

    The command is invoked as ``pdftotext <pdf_path> <temp_path>``; the text it
    writes to ``temp_path`` is read back and the temporary file is deleted.

    Args:
        pdf_path: Path of the PDF file to convert.
        temp_path: Scratch path for the extracted text. Any file already at
            this path is deleted before the conversion runs.

    Returns:
        The contents of the generated text file, decoded as UTF-8 with
        undecodable bytes dropped.

    Raises:
        OSError: If ``pdftotext`` cannot be launched, or if no output file
            exists at ``temp_path`` after the command has run (raised as
            ``FileNotFoundError`` by ``open``).
    """
    # Remove stale output first so text left over from an earlier call can
    # never be returned if this conversion produces nothing.
    if os.path.exists(temp_path):
        os.remove(temp_path)
    # The exit status is intentionally not inspected (as before): a failed
    # conversion surfaces as a missing output file when it is opened below.
    subprocess.call(["pdftotext", pdf_path, temp_path])
    try:
        # The context manager closes the handle even if read() raises.
        with open(temp_path, encoding="utf8", errors='ignore') as f:
            return f.read()
    finally:
        # Always clean up the scratch file, including on a failed read.
        if os.path.exists(temp_path):
            os.remove(temp_path)

In [3]:
# Site root; the relative "/paper/..." hrefs found on index pages are appended to it.
base_url = "http://papers.nips.cc"

# Maps each proceedings year to the URL of its index page.
# 1987 has its own URL shape; every later year embeds a volume number
# (year - 1987) followed by the year itself.
index_urls = {}
index_urls[1987] = "https://papers.nips.cc/book/neural-information-processing-systems-1987"
for year in range(1988, 2017):
    i = year - 1987
    index_urls[year] = "http://papers.nips.cc/book/advances-in-neural-information-processing-systems-%d-%d" % (i, year)


# Accumulators filled by the scraping loop.
nips_authors = set()   # (author id, author name) tuples
papers = []            # full paper rows (only used by the disabled full-text export)
paper_authors = []     # [row id, paper id, author id] rows
papers_new = []        # [title, abstract] rows


def _download_if_missing(url, path):
    """Download ``url`` to ``path`` unless a file already exists at ``path``.

    Acts as a simple on-disk cache so that re-running the cell does not
    re-fetch pages or PDFs that were already saved.
    """
    if os.path.exists(path):
        return
    # NOTE(review): the HTTP status is not checked, so an error page would be
    # cached like a real one. The PDF check further down relies on HTML
    # responses being saved, so this is left as-is — confirm before changing.
    response = requests.get(url)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as out_file:
        out_file.write(response.content)


# Walk every proceedings year: fetch (or reuse the cached copy of) the index
# page, then each paper's PDF and info page, and collect title/abstract rows.
for year in sorted(index_urls.keys()):
    index_url = index_urls[year]
    index_html_path = os.path.join("working", "html", str(year)+".html")

    _download_if_missing(index_url, index_html_path)
    with open(index_html_path, "rb") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "lxml")
    # .get() tolerates anchors that have no href attribute at all.
    paper_links = [link for link in soup.find_all('a')
                   if link.get("href", "").startswith("/paper/")]
    print("%d Papers Found" % len(paper_links))

    temp_path = os.path.join("working", "temp.txt")

    for link in paper_links:
        # str() detaches the text from the parse tree; keeping the
        # NavigableString itself would keep the whole soup alive for as long
        # as the row sits in papers_new.
        paper_title = str(link.contents[0])
        info_link = base_url + link["href"]
        pdf_link = info_link + ".pdf"
        pdf_name = link["href"][7:] + ".pdf"   # href with the leading "/paper/" removed
        pdf_path = os.path.join("working", "pdfs", str(year), pdf_name)
        # The paper id is the run of digits at the start of the file name.
        paper_id = re.findall(r"^(\d+)-", pdf_name)[0]
        print(year, " ", paper_id) #paper_title.encode('ascii', 'namereplace'))
        _download_if_missing(pdf_link, pdf_path)

        paper_info_html_path = os.path.join("working", "html", str(year), str(paper_id)+".html")
        _download_if_missing(info_link, paper_info_html_path)
        with open(paper_info_html_path, "rb") as f:
            html_content = f.read()
        paper_soup = BeautifulSoup(html_content, "lxml")

        # Explicit checks instead of a bare except, which would also have
        # swallowed KeyboardInterrupt and unrelated errors.
        abstract_tag = paper_soup.find('p', attrs={"class": "abstract"})
        if abstract_tag is not None and abstract_tag.contents:
            abstract = str(abstract_tag.contents[0])
        else:
            print("Abstract not found %s" % paper_title.encode("ascii", "replace"))
            abstract = ""

        # Each author entry is (id taken from the trailing digits of the
        # author link's href, display name).
        authors = [(re.findall(r"-(\d+)$", author.contents[0]["href"])[0],
                    str(author.contents[0].contents[0]))
                   for author in paper_soup.find_all('li', attrs={"class": "author"})]
        # NOTE(review): authors are recorded before the PDF check below, so a
        # paper skipped as "PDF MISSING" still contributes author rows —
        # confirm whether that is intended.
        for author in authors:
            nips_authors.add(author)
            paper_authors.append([len(paper_authors)+1, paper_id, author[0]])

        # Collect the text after "Conference Event Type:" from <h3> headings,
        # ignoring headings that are empty or start with a nested tag.
        event_types = []
        for h in paper_soup.find_all('h3'):
            first_child = h.contents[0] if h.contents else None
            if isinstance(first_child, str) and first_child[:22]=="Conference Event Type:":
                event_types.append(str(first_child[23:]))
        if len(event_types) != 1:
            #print(event_types)
            #print([h.contents for h in paper_soup.find_all('h3')].__str__().encode("ascii", "replace"))
            #raise Exception("Bad Event Data")
            event_type = ""
        else:
            event_type = event_types[0]

        # A saved "PDF" that starts with an HTML doctype is really a web page,
        # so the paper is skipped.
        with open(pdf_path, "rb") as f:
            if f.read(15)==b"<!DOCTYPE html>":
                print("PDF MISSING")
                continue
        #paper_text = text_from_pdf(pdf_path, temp_path)
        papers_new.append([paper_title, abstract])
        #papers.append([paper_id, year, paper_title, event_type, pdf_name, abstract, paper_text])
       
# Create the destination directory up front: to_csv does not create missing
# parent directories, so on a fresh checkout the export would otherwise fail
# only after the whole scrape has finished.
os.makedirs("output", exist_ok=True)

# Only the title/abstract export is active; the other exports are kept
# disabled alongside the full-text extraction they depend on.
#pd.DataFrame(list(nips_authors), columns=["id","name"]).sort_values(by="id").to_csv("output/authors.csv", index=False)
pd.DataFrame(papers_new, columns=["title", "abstract"]).to_csv("output/papers_new.csv", index=False)
#pd.DataFrame(papers, columns=["id", "year", "title", "event_type", "pdf_name", "abstract", "paper_text"]).sort_values(by="id").to_csv("output/papers.csv", index=False)
#pd.DataFrame(paper_authors, columns=["id", "paper_id", "author_id"]).sort_values(by="id").to_csv("output/paper_authors.csv", index=False)

90 Papers Found
1987   63
1987   80
1987   9
1987   20
1987   66
1987   3
1987   71
1987   83
1987   82
1987   24
1987   40
1987   34
1987   89
1987   70
1987   88
1987   84
1987   55
1987   48
1987   2
1987   31
1987   22
1987   73
1987   16
1987   72
1987   87
1987   8
1987   47
1987   28
1987   15
1987   60
1987   62
1987   13
1987   14
1987   75
1987   44
1987   53
1987   65
1987   78
1987   68
1987   7
1987   26
1987   45
1987   61
1987   39
1987   25
1987   46
1987   59
1987   38
1987   86
1987   19
1987   5
1987   51
1987   49
1987   11
1987   29
1987   77
1987   92
1987   36
1987   18
1987   27
1987   90
1987   64
1987   67
1987   4
1987   17
1987   42
1987   33
1987   91
1987   50
1987   21
1987   10
1987   52
1987   56
1987   37
1987   23
1987   58
1987   69
1987   76
1987   74
1987   1
1987   6
1987   12
1987   30
1987   35
1987   41
1987   32
1987   81
1987   79
1987   85
1987   54
94 Papers Found
1988   106
1988   139
1988   140
1988   175
1988   134
1988   162
1988   149


1993   821
1993   762
1993   839
1993   862
1993   849
1993   786
1993   726
1993   737
1993   775
1993   806
1993   850
1993   854
1993   772
1993   730
1993   804
1993   757
1993   780
1993   717
1993   766
1993   756
1993   868
1993   835
1993   768
1993   783
1993   793
1993   776
1993   777
1993   800
1993   790
1993   782
1993   743
1993   736
1993   848
1993   872
1993   843
1993   788
1993   770
1993   765
1993   865
1993   773
1993   764
1993   742
1993   750
1993   830
1993   769
1993   856
1993   863
1993   761
1993   876
1993   794
1993   779
1993   741
1993   871
1993   758
1993   820
1993   805
1993   826
1993   718
1993   810
1993   778
1993   727
1993   755
1993   877
1993   731
1993   864
1993   729
1993   857
1993   725
1993   834
1993   819
1993   832
1993   842
1993   738
1993   867
1993   771
1993   866
1993   752
1993   809
1993   795
1993   792
1993   858
1993   861
1993   751
1993   851
1993   763
1993   827
1993   787
1993   759
1993   774
1993   852
1993   740

1998   1527
1998   1585
1998   1628
1998   1630
1998   1482
1998   1542
1998   1620
1998   1518
1998   1494
1998   1526
1998   1554
1998   1536
1998   1611
1998   1574
1998   1635
1998   1633
1998   1601
1998   1493
1998   1533
1998   1572
1998   1510
1998   1556
1998   1617
1998   1515
1998   1578
1998   1485
1998   1587
1998   1497
1998   1610
1998   1625
1998   1632
1998   1570
1998   1523
1998   1564
1998   1529
1998   1559
1998   1530
1998   1553
1998   1517
1998   1622
1998   1532
1998   1606
1998   1605
1998   1563
1998   1569
1998   1501
1998   1604
1998   1631
1998   1582
1998   1507
1998   1549
1998   1511
1998   1588
1998   1539
1998   1598
1998   1616
1998   1593
1998   1594
1998   1571
1998   1500
1998   1552
1998   1619
1998   1503
1998   1612
1998   1597
1998   1520
1998   1537
1998   1627
1998   1592
1998   1580
1998   1624
1998   1544
1998   1491
1998   1490
1998   1512
1998   1577
1998   1615
1998   1602
1998   1525
1998   1575
1998   1561
1998   1521
1998   1543
1998

2002   2193
2002   2150
2002   2250
2002   2164
2002   2166
2002   2222
2002   2313
2002   2202
2002   2148
2002   2156
2002   2232
2002   2272
2002   2144
2002   2257
2002   2318
2002   2159
2002   2240
2002   2139
2002   2308
2002   2327
2002   2298
2002   2155
2002   2316
2002   2275
2002   2176
2002   2290
2002   2183
2002   2160
2002   2141
2002   2343
2002   2244
2002   2292
2002   2227
2002   2221
2002   2191
2002   2163
2002   2274
2002   2172
2002   2226
2002   2295
2002   2263
2002   2281
2002   2206
2002   2215
2002   2203
2002   2276
2002   2180
2002   2194
2002   2223
2002   2258
2002   2297
2002   2260
2002   2321
2002   2234
2002   2187
2002   2239
2002   2235
2002   2306
2002   2269
2002   2279
2002   2230
2002   2165
2002   2264
2002   2333
2002   2336
2002   2266
2002   2173
2002   2293
2002   2210
2002   2199
2002   2287
2002   2280
2002   2341
2002   2305
2002   2189
2002   2233
2002   2344
2002   2331
2002   2170
2002   2256
2002   2320
2002   2311
2002   2326
2002

2005   2878
2005   2868
2005   2852
2005   2933
2005   2900
2005   2917
2005   2923
2005   2927
2005   2767
2005   2945
2005   2953
2005   2814
2005   2821
2005   2894
2005   2879
2005   2892
2005   2921
2005   2799
2005   2813
2005   2825
2005   2764
2005   2761
2005   2851
2005   2889
2005   2835
2005   2751
2005   2884
2005   2866
2005   2857
2005   2849
2005   2890
2005   2946
2005   2912
2005   2772
2005   2944
2005   2826
2005   2816
2005   2773
2005   2782
2005   2794
2005   2770
2005   2841
2005   2860
2005   2877
2005   2803
2005   2956
2005   2861
2005   2756
2005   2926
2005   2925
2005   2754
2005   2783
2005   2820
2005   2793
2005   2935
2005   2795
2005   2893
2005   2786
2005   2840
2005   2885
2005   2784
2005   2771
2005   2954
2005   2776
2005   2823
2005   2948
2005   2809
2005   2805
2005   2918
2005   2774
2005   2855
2005   2759
2005   2949
2005   2853
2005   2818
2005   2758
2005   2834
2005   2762
204 Papers Found
2006   3151
2006   3058
2006   2984
2006   3148

2008   3558
2008   3388
2008   3484
2008   3438
2008   3611
2008   3534
2008   3607
2008   3411
2008   3562
2008   3551
2008   3530
2008   3471
2008   3537
2008   3400
2008   3576
2008   3561
2008   3466
2008   3569
2008   3435
2008   3482
2008   3567
2008   3626
2008   3503
2008   3443
2008   3564
2008   3507
2008   3423
2008   3405
2008   3604
2008   3393
2008   3414
2008   3508
2008   3518
2008   3598
2008   3542
2008   3565
2008   3452
2008   3597
2008   3581
2008   3383
2008   3479
2008   3513
2008   3609
2008   3595
2008   3458
2008   3596
2008   3588
2008   3570
2008   3520
2008   3543
2008   3428
2008   3460
2008   3488
2008   3467
2008   3519
2008   3494
2008   3541
2008   3395
2008   3459
2008   3416
2008   3586
2008   3526
2008   3440
2008   3490
2008   3412
2008   3617
2008   3462
2008   3399
2008   3450
2008   3625
262 Papers Found
2009   3689
2009   3812
2009   3857
2009   3690
2009   3861
2009   3639
2009   3753
2009   3742
2009   3637
2009   3691
2009   3631
2009   3868

2011   4458
2011   4418
2011   4187
2011   4212
2011   4373
2011   4234
2011   4254
2011   4188
2011   4434
2011   4250
2011   4281
2011   4424
2011   4249
2011   4421
2011   4350
2011   4253
2011   4366
2011   4390
2011   4292
2011   4318
2011   4217
2011   4455
2011   4391
2011   4356
2011   4257
2011   4256
2011   4320
2011   4304
2011   4379
2011   4204
2011   4428
2011   4211
2011   4388
2011   4375
2011   4371
2011   4301
2011   4460
2011   4247
2011   4412
2011   4245
2011   4400
2011   4218
2011   4186
2011   4429
2011   4401
2011   4377
2011   4342
2011   4255
2011   4453
2011   4284
2011   4282
2011   4384
2011   4232
2011   4467
2011   4325
2011   4475
2011   4345
2011   4197
2011   4340
2011   4299
2011   4198
2011   4438
2011   4300
2011   4465
2011   4271
2011   4334
2011   4486
2011   4470
2011   4309
2011   4220
2011   4399
2011   4213
2011   4315
2011   4285
2011   4260
2011   4462
2011   4248
2011   4359
2011   4203
2011   4347
2011   4270
2011   4328
2011   4225
2011

2013   5141
2013   5092
2013   5162
2013   5114
2013   5108
2013   4870
2013   5121
2013   4964
2013   4941
2013   5154
2013   5047
2013   5183
2013   4936
2013   5057
2013   4953
2013   5006
2013   5212
2013   5081
2013   5135
2013   4900
2013   5011
2013   5216
2013   4862
2013   5192
2013   5076
2013   5123
2013   4954
2013   5005
2013   4919
2013   5150
2013   5214
2013   4970
2013   4924
2013   5023
2013   5002
2013   5074
2013   5028
2013   5027
2013   4948
2013   5142
2013   5072
2013   4885
2013   4940
2013   5003
2013   5046
2013   4922
2013   4986
2013   5152
2013   4872
2013   4959
2013   5160
2013   5104
2013   4892
2013   5161
2013   5116
2013   5087
2013   5163
2013   4942
2013   4893
2013   4969
2013   5091
2013   5130
2013   5151
2013   5010
2013   5133
2013   4931
2013   5073
2013   5172
2013   5171
2013   4894
2013   4961
2013   5149
2013   4996
2013   5164
2013   4962
2013   4899
2013   5210
2013   5101
2013   4890
2013   4988
2013   5126
2013   4985
2013   5022
2013

2014   5439
2014   5253
2014   5298
2014   5290
2014   5571
2014   5244
2014   5429
2014   5339
2014   5246
2014   5227
2014   5605
2014   5418
2014   5276
2014   5296
2014   5522
2014   5271
2014   5352
2014   5526
2014   5560
2014   5452
2014   5603
2014   5372
2014   5388
2014   5297
2014   5543
2014   5254
2014   5303
2014   5375
2014   5481
403 Papers Found
2015   5677
2015   5941
2015   6019
2015   6035
2015   5978
2015   5714
2015   5937
2015   5802
2015   5776
2015   5814
2015   5638
2015   5971
2015   5830
2015   6002
2015   5780
2015   5766
2015   5790
2015   5973
2015   5864
2015   5681
2015   5753
2015   5857
2015   5848
2015   6032
2015   5719
2015   5768
2015   5778
2015   5912
2015   5972
2015   5633
2015   5747
2015   5876
2015   5781
2015   5710
2015   6023
2015   5996
2015   5742
2015   5807
2015   5998
2015   5930
2015   5791
2015   5952
2015   5728
2015   5905
2015   5911
2015   6011
2015   5829
2015   5644
2015   6008
2015   5787
2015   5986
2015   5855
2015   5879

2016   6280
2016   6398
2016   6427
2016   6372
2016   6345
2016   6300
2016   6429
2016   6294
2016   6089
2016   6325
2016   6386
2016   6250
2016   6550
2016   6528
2016   6167
2016   6338
2016   6564
2016   6513
2016   6482
2016   6487
2016   6316
2016   6509
2016   6198
2016   6237
2016   6290
2016   6299
2016   6437
2016   6140
2016   6119
2016   6209
2016   6332
2016   6485
2016   6561
2016   6219
2016   6602
2016   6231
2016   6376
2016   6604
2016   6075
2016   6426
2016   6375
2016   6059
2016   6339
2016   6319
2016   6535
2016   6587
2016   6384
2016   6510
2016   6367
2016   6047
2016   6168
2016   6296
2016   6554
2016   6187
2016   6136
2016   6409
2016   6085
2016   6492
2016   6517
2016   6596
2016   6453
2016   6490
2016   6071
2016   6172
2016   6343
2016   6155
2016   6530
2016   6431
2016   6404
2016   6215
2016   6086
2016   6178
2016   6126
2016   6466
2016   6537
2016   6062
2016   6377
2016   6255
2016   6359
2016   6379
2016   6580
2016   6137
2016   6048
2016