# GET PDFS FROM SEARCH QUERY (GOOGLE): CONVERT PDFS TO TEXT

In [80]:
import re
import os
import time
import pandas as pd
from os import listdir, getcwd
from os.path import isfile, join
import certifi
import urllib3
import pickle
import requests
from bs4 import BeautifulSoup
import tika
from tika import parser
from nltk import sent_tokenize, word_tokenize
from IPython.display import clear_output
import nltk.data
from ipywidgets import IntProgress
from IPython.display import display
# Load the English Punkt sentence-tokenizer resource through NLTK's data loader.
# NOTE(review): `tokenizer` is not referenced by any other code visible in this
# notebook — confirm it is still needed before keeping the load.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


### Steps:
- Get links by search term: search Google for links to PDFs matching the search term
- Get PDFs by URL: using those links, download the PDFs into a local directory
- Get text from PDFs: parse the PDFs in the local directory into a pandas Series of text

## GET PDF LINKS BY SEARCH TERM

In [26]:


def get_links(search_term, num_results):
    '''Return PDF URLs found on a Google results page for a search term.

    Args:
        search_term: free-text query; it is URL-encoded before being placed
            in the request URL.
        num_results: value sent as Google's `num` query parameter.

    Returns:
        A list of URL strings, each cut off right after its first '.pdf'.
        Anchors that point at Google's own '/search' pages, or that do not
        use the '/url?q=<target>' redirect form, are skipped instead of
        raising.
    '''
    # Function-scope import: only this function needs it.
    from urllib.parse import quote_plus

    page = requests.get(
        "https://www.google.com/search?q={}+filetype%3A+pdf&num={}".format(
            quote_plus(str(search_term)), num_results))

    soup = BeautifulSoup(page.content, "lxml")
    # Raw string so the backslash reaches the regex engine unchanged.
    anchors = soup.find_all('a', attrs={'href': re.compile(r"\.pdf")})

    pdf_urls = []
    for anchor in anchors:
        # Work on the attribute value, not the rendered tag, so HTML escaping
        # such as '&amp;' cannot end up inside the extracted URL.
        href = anchor['href']
        if '/search' in href or '/url?q=' not in href:
            continue
        target = href.split('/url?q=', 1)[1]
        # Drop everything after the first '.pdf' (tracking parameters etc.).
        pdf_urls.append(target.split('.pdf')[0] + '.pdf')

    return pdf_urls



## GET PDFS BY URL

In [109]:
def get_pdf_from_web(url):
    '''Download the file at `url` into the `pdf_dir` folder under the cwd.

    The local file name is the last '/'-separated segment of the URL. The
    response body is streamed to disk in chunks, so the whole file is never
    held in memory.

    Args:
        url: address of the PDF to fetch.

    Raises:
        Whatever urllib3 or the file system raises; the connection is
        released back to the pool in every case.
    '''
    chunk_size = 64 * 1024  # bytes per read; 100-byte reads were needlessly slow
    file_name = url.split('/')[-1]

    target_dir = join(getcwd(), 'pdf_dir')
    # Create the download folder on first use instead of failing on open().
    os.makedirs(target_dir, exist_ok=True)
    path = join(target_dir, file_name)

    # Verify TLS certificates against certifi's CA bundle.
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    r = http.request('GET', url, preload_content=False)
    try:
        with open(path, 'wb') as out:
            while True:
                data = r.read(chunk_size)
                if not data:
                    break
                out.write(data)
    finally:
        # Return the connection to the pool even if reading or writing failed.
        r.release_conn()

In [112]:
## feeder

def web_dir_pdf_feeder(pdf_urls):
    '''Download every URL in `pdf_urls` with get_pdf_from_web, showing progress.

    A failed download does not stop the batch: the failure is printed together
    with the URL that caused it, and the loop moves on to the next URL.

    Args:
        pdf_urls: list of URL strings to download.
    '''
    max_count = len(pdf_urls)
    f = IntProgress(min=0, max=max_count)  # instantiate the bar
    print('downloading {} pdfs...'.format(max_count))
    display(f)  # display the bar

    for pdf in pdf_urls:
        try:
            get_pdf_from_web(pdf)
        except Exception as e:
            # Best-effort batch: report which URL failed and why, then continue.
            print('failed to download {}: {!r}'.format(pdf, e))
        finally:
            # Advance once the attempt is finished, whether it succeeded or not.
            f.value += 1

## EXTRACT PDF TO TEXT

In [5]:
def get_files_from_dir(mypath=None):
    '''Return the sorted names of the PDF files directly inside a directory.

    Args:
        mypath: directory to scan; defaults to the current working directory.

    Returns:
        Sorted list of file names (not full paths) whose name ends in '.pdf',
        compared case-insensitively. Sub-directories are ignored, as are
        names that merely contain '.pdf' somewhere in the middle.
    '''
    if mypath is None:
        mypath = getcwd()
    # Sorting makes the result independent of the order listdir() returns.
    return sorted(
        name for name in listdir(mypath)
        if name.lower().endswith('.pdf') and isfile(join(mypath, name))
    )

In [79]:
def _parse_pdf(full_path, retry_delay=3):
    '''Return the raw text Tika extracts from one file, or '' if there is none.'''
    connection_errors = (ConnectionError, requests.exceptions.ConnectionError)
    try:
        parsed = parser.from_file(full_path)
    except connection_errors:
        # Presumably the Tika server was not reachable yet: wait, retry once.
        time.sleep(retry_delay)
        try:
            parsed = parser.from_file(full_path)
        except connection_errors as e:
            print('could not parse {}: {!r}'.format(full_path, e))
            return ''
    # Status 422 means Tika could not process the document.
    if not parsed or parsed.get('status') == 422:
        return ''
    # 'content' can be None when no text was extracted.
    return parsed.get('content') or ''


def _split_sentences(text, min_words=10):
    '''Split text into period-terminated sentences of more than `min_words` tokens.'''
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    sentences = []
    current = []
    for token in tokens:
        if token != '.':
            current.append(token)
        elif len(current) > min_words:
            joined = " ".join(current + ['.'])
            # Re-attach punctuation that the join separated from its word.
            sentences.append(
                joined.replace(' .', '.').replace(' ,', ',').replace(' ?', '?'))
            current = []
        # A period after a short run is dropped and the run keeps growing, so
        # short fragments are merged into the following sentence.
    # Tokens after the last accepted period are discarded.
    return sentences


def pdf2text(path):
    '''Extract the text of every PDF in `path` using Tika.

    Args:
        path: directory containing the PDF files.

    Returns:
        A tuple `(pdf_texts, files)`:
        pdf_texts -- pandas Series (integer index, name 0) with one string
            per file, made of that file's sentences joined by spaces. A file
            that could not be parsed contributes an empty string, so the
            Series always lines up with `files`.
        files -- the file names as returned by get_files_from_dir(path).
    '''
    files = get_files_from_dir(path)
    max_count = len(files)
    print("processing {} pdfs...".format(max_count))

    f = IntProgress(min=0, max=max_count)  # instantiate the bar
    display(f)  # display the bar

    texts = []
    for file_name in files:
        raw_text = _parse_pdf(path + "/" + file_name)
        # Sentences are built per file, so nothing carries over between files.
        texts.append(" ".join(_split_sentences(raw_text)))
        f.value += 1

    # dtype/name are explicit so an empty directory still yields a valid Series.
    pdf_texts = pd.Series(texts, dtype=object, name=0)

    return pdf_texts, files

## RUN STEPS

In [74]:
# Query settings: the search phrase and the number of results to request.
search_term, num_results = 'climate change', 100
# Download folder, written as a suffix that is appended to the working directory.
pdf_directory = '/pdf_dir'
path = '{}{}'.format(getcwd(), pdf_directory)

In [75]:
# Step 1: collect PDF links for the configured search term and result count;
# the %time magic reports how long the call took.
%time pdf_urls = get_links(search_term, num_results)

CPU times: user 2.52 s, sys: 2.63 ms, total: 2.52 s
Wall time: 3.09 s


In [113]:
# Step 2: download every collected URL; the feeder prints any download error
# and carries on with the remaining URLs.
%time web_dir_pdf_feeder(pdf_urls) # download pdfs to file

downloading 53 pdfs...


IntProgress(value=0, max=53)

'ftp'
CPU times: user 14 s, sys: 897 ms, total: 14.9 s
Wall time: 2min 42s


In [84]:
# Step 3: parse the PDFs found in `path`; returns the extracted texts together
# with the matching list of file names.
%time pdf_texts, files = pdf2text(path)

processing 47 pdfs...


IntProgress(value=0, max=47)

2018-11-12 17:51:35,290 [MainThread  ] [WARNI]  Tika server returned status: 422
2018-11-12 17:51:43,806 [MainThread  ] [WARNI]  Tika server returned status: 422
2018-11-12 17:51:45,981 [MainThread  ] [WARNI]  Tika server returned status: 422


CPU times: user 1.98 s, sys: 368 ms, total: 2.35 s
Wall time: 29.7 s


## SAVE DATA

In [11]:
# Number of whitespace-separated words in each document's extracted text.
words = [len(text.split()) for text in pdf_texts]
# One row per PDF: file name, extracted text and its word count.
df = pd.DataFrame({
    'name': files,
    'content': pdf_texts,
    'word_count': words,
})
# Persist the table; the search term becomes part of the file name.
df.to_pickle('../Data/pdfs_{}.pkl'.format(search_term))
# To reload later: pd.read_pickle('../Data/pdfs_{}.pkl'.format(search_term))