## Much of this code is informed by this multi-threaded web-grabbing example:
# https://github.com/NikolaiT/GoogleScraper/blob/master/Examples/image_search.py
# The parallel architecture here is crude; dask.bag mapping would probably be
# more readable and efficient (a sketch follows the imports below).
##
import glob
import os
import re
import urllib.request
from io import BytesIO, StringIO

import numpy as np
import pandas as pd
import pycld2 as cld2
import requests
from bs4 import BeautifulSoup
from natsort import natsorted, ns
from selenium import webdriver
# from pyvirtualdisplay import Display  # optional headless X display

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter

from delver import Crawler

C = Crawler()
CWD = os.getcwd()
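# A minimal sketch of the dask.bag alternative mentioned in the header
# comment (assumes dask is installed; `collect_pubs` is defined further
# down in this module). Not what this module currently does, just an
# illustration:
#
#   import dask.bag as db
#   urls = ['https://scholar.google.com/citations?user=...', ...]
#   links = db.from_sequence(urls).map(collect_pubs).compute()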
def convert_pdf_to_txt(content):
    '''
    Convert a PDF to plain text. Accepts either a requests.Response
    or raw bytes.
    '''
    if hasattr(content, 'content'):
        pdf = BytesIO(content.content)
    else:
        pdf = BytesIO(content)
    parser = PDFParser(pdf)
    document = PDFDocument(parser, password='')
    # Build the converter per call; a shared module-level buffer would
    # accumulate text across calls and duplicate earlier pages.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process all pages in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
def html_to_txt(content):
    '''
    Strip markup from an HTML document and return the visible text.
    '''
    soup = BeautifulSoup(content, 'html.parser')
    # Rip out script and style blocks so only readable text remains.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    # Break into lines, strip leading/trailing space on each,
    # break multi-headlines into a line each, then drop blank lines.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return str(text)
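# A minimal usage sketch for the two converters above (the URLs are
# hypothetical placeholders):
#
#   pdf_resp = requests.get('https://example.org/paper.pdf')
#   print(convert_pdf_to_txt(pdf_resp))  # a Response is accepted directly
#   page = requests.get('https://example.org/index.html')
#   print(html_to_txt(page.text))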
def print_best_text(fileName):
    '''
    Read a previously downloaded file and return its contents as text.
    '''
    with open(fileName) as f:
        text = f.read()
    return text
def denver_to_text(url):
    '''
    Download a document with the delver Crawler, then convert it to
    plain text with the appropriate converter.
    '''
    fileName = C.download(local_path=CWD, url=url, name='temp_file')
    if '.html' in fileName:
        with open(fileName) as f:
            text = html_to_txt(f.read())
    else:
        # PDFs must be read as bytes.
        with open(fileName, 'rb') as f:
            text = convert_pdf_to_txt(f.read())
    return text
def collect_hosted_files(url):
    '''
    Used for scholar. Collect the href of every anchor on the page,
    falling back to selenium when a plain request fails.
    '''
    try:
        crude_html = requests.get(url).text
    except Exception:
        from scrape import get_driver
        driver = get_driver()
        driver.get(url)
        crude_html = driver.page_source
    soup = BeautifulSoup(crude_html, 'lxml')
    links = []
    for link in soup.findAll('a'):
        check_out = link.get('href')
        if check_out is not None:
            links.append(check_out)
    return links
def collect_pubs(url):
    '''
    Used for scholar. Render the page with selenium and collect
    absolute http(s) links.
    '''
    from scrape import get_driver
    driver = get_driver()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("https?://")}):
        links.append(link.get('href'))
    return links
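# A minimal end-to-end sketch (the profile URL is a hypothetical
# placeholder; the scrape module providing get_driver is assumed
# to be importable):
if __name__ == '__main__':
    pubs = collect_pubs('https://scholar.google.com/citations?user=EXAMPLE')
    for pub_url in pubs:
        try:
            print(denver_to_text(pub_url)[:200])
        except Exception:
            pass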