# Extract Information from the Web Snapshots

Web snapshots of Google Scholar searches were saved to a Zotero Group Library. Each snapshot is a single page of the search results.

For every search result in each saved web snapshot, extract the following:

- Title
- First author
- Number of citations
- Zotero key
- Publication year

In [91]:
import os
import re
import csv
from os.path import join, basename, splitext
from glob import glob
from zipfile import ZipFile

from tqdm import tqdm_notebook
from bs4 import BeautifulSoup
from pyzotero import zotero

from lib.secrets import WEB_SNAPSHOTS, USER_KEY

In [14]:
# Working directory: the Zotero downloads, the extracted HTML pages and the
# final CSV file are all written here.
output_dir = 'data'

## Connect to Zotero

In [2]:
# Client for the Zotero group library that holds the saved web snapshots.
zot = zotero.Zotero(
    library_id=WEB_SNAPSHOTS,
    library_type='group',
    api_key=USER_KEY,
)

## Loop through Zotero Library and Save the Web Pages to a Work Directory (Zip File)

In [88]:
# Download every attachment in every collection of the library into
# output_dir and remember the attachment records in all_attachments.
all_attachments = []

# Make sure the download target exists before the first zot.dump() call.
os.makedirs(output_dir, exist_ok=True)

for collection in tqdm_notebook(zot.collections(), desc='collections'):
    print('collection', collection['data']['name'])

    # Keep only the attachment items of this collection.
    attachments = [a for a
                   in zot.everything(zot.collection_items(collection['key']))
                   if a['data']['itemType'] == 'attachment']
    for attachment in attachments:
        print('attachment', attachment['key'])
        all_attachments.append(attachment)
        # Save as "<key>.zip": the extraction step globs output_dir for
        # '*.zip' and takes the Zotero key from the file name.
        zot.dump(attachment['key'],
                 '{}.zip'.format(attachment['key']),
                 output_dir)


collection SpOcc_Snapshots
attachment Z5ZU7N5F
attachment VSMU3Z2R
attachment SMFS3J7J
attachment SIUM545G
attachment P3ETDX3I
attachment MAHSVNFA
attachment M75KRCAI
attachment M2CPVBSQ
attachment KV83JQZ2
attachment KQ447228
attachment KPU2APMG
attachment G2UFHBEK
attachment D7W285IT
attachment AGVNHG55
attachment 9C3GJCZK
attachment 3CHHTUSC
attachment 27TGU5AR
attachment XSCFI2WA
attachment VUFPAHSW
attachment R5SRAIGF
attachment QPP4BKJX
attachment PAKZ7VIV
attachment MF9CKJG4
attachment I3W2KAR7
attachment GHW5WSS5
attachment GD82VPCV
attachment EV98SK8K
attachment ANKBTNPZ
attachment 9QQ29HV3
attachment 9MBH562A
attachment 7R3AUQ62
attachment 6ETJ5BMA
attachment 327TESFX
attachment 25EN6669
collection NHCDB_Snapshots
attachment ZVZAJQ2N
attachment WSVEABJT
attachment V6CKIJTJ
attachment V49KC3HH
attachment TRVPBIMS
attachment SWUKJGGF
attachment SVFMZP7P
attachment RJWDQ5IF
attachment R4NEB9GT
attachment NF7WR9XC
attachment H7S9KE5W
attachment GD3RAFTK
attachment DWKH7CVV
attach

## Extract the HTML Web Pages from the Zip Files

In [17]:
# Pull the saved results page out of every downloaded zip archive and store
# it as "<zip base name>.html" in output_dir.
pattern = join(output_dir, '*.zip')
target = 'scholar.html'
# Every archive extracts its page to this same path; it is moved to a
# per-archive name right after extraction.
src = join(output_dir, target)

zip_files = glob(pattern)
for zip_file in tqdm_notebook(zip_files, desc='zip files'):

    with ZipFile(zip_file) as zippy:
        if target not in zippy.namelist():
            # Name the archive so the missing page can be tracked down.
            print('{} not found in {}'.format(target, zip_file))
            continue

        zippy.extract(target, output_dir)

    base_name = basename(zip_file)
    file_name = splitext(base_name)[0]
    dst = join(output_dir, '{}.html'.format(file_name))

    # os.replace overwrites an existing destination on every platform, so
    # re-running this cell does not fail on previously extracted pages.
    os.replace(src, dst)

data/QKIPKFM5.zip
data/5TDE3KEU.zip
data/8QNRCN4J.zip
data/7R3AUQ62.zip
data/RBX6MDVH.zip
data/TMD7W8DQ.zip
data/A2QDS3PA.zip
data/RBNACV5F.zip
data/B326CNXI.zip
data/KQ447228.zip
data/JCB9D8IK.zip
data/C3R3BHU3.zip
data/3CHHTUSC.zip
data/MPGKUPDM.zip
data/2USU6J6P.zip
data/FFJUS9P6.zip
data/ETGRIWGK.zip
data/2EWIVKJ3.zip
data/2FK25KKG.zip
data/WKPH7E9W.zip
data/9DKKS5H4.zip
data/AGVNHG55.zip
data/6FZ63BWS.zip
data/4QU233GB.zip
data/Q2DD2HUW.zip
data/7VW35BNI.zip
data/3BXED4PX.zip
data/QPP4BKJX.zip
data/XR3N8BP2.zip
data/WA9K7JQ3.zip
data/SHJEPKSX.zip
data/HV6JB3D4.zip
data/FSJ2QBK5.zip
data/FX6B6P4C.zip
data/ANKBTNPZ.zip
data/WSVEABJT.zip
data/V4H92FFH.zip
data/KV83JQZ2.zip
data/BHSUX8XK.zip
data/R4NEB9GT.zip
data/Z5ZU7N5F.zip
data/B5Z2VQ99.zip
data/MQ8EHZZQ.zip
data/G2UFHBEK.zip
data/WUUMRQ73.zip
data/JDI9EUUU.zip
data/FGUDJGWJ.zip
data/M6AFEGV9.zip
data/TRVPBIMS.zip
data/K9WMR6RF.zip
data/P6VJXI2N.zip
data/9BNZSIXI.zip
data/9QQ29HV3.zip
data/6ETJ5BMA.zip
data/EZ6D4BJH.zip
data/ZVZAJ

## Extract Data from HTML Pages

In [84]:
# Parse every saved results page and collect one record per search result:
# Zotero key, title, first author, publication year and citation count.
pattern = join(output_dir, '*.html')
html_files = glob(pattern)

# Compiled once, outside the loops; raw strings keep '\d' from being read as
# a string escape.
year_re = re.compile(r'\d{4}')
cited_by_re = re.compile(r'Cited by (\d+)')
# A dash with whitespace on at least one side ends the author list; a bare
# '-' inside a hyphenated surname must not split the name.
author_sep_re = re.compile(r'\s+-|-\s+')

all_docs = []

for html_file in tqdm_notebook(html_files, desc='html files'):

    print(html_file)

    # Read raw bytes so BeautifulSoup detects the page encoding itself
    # instead of depending on the platform's default text encoding.
    with open(html_file, 'rb') as in_file:
        page = in_file.read()

    soup = BeautifulSoup(page, 'html.parser')

    # The file name without its extension is recorded as the Zotero key.
    base_name = basename(html_file)
    key = splitext(base_name)[0]

    for result in soup.select('div.gs_r'):

        # Title; entries without a linked title are skipped
        title_obj = result.select_one('.gs_rt a')
        if not title_obj:
            continue

        title = title_obj.get_text()

        # This contains several fields which are extracted below
        author_obj = result.select_one('.gs_a')
        author_string = author_obj.get_text() if author_obj else ''

        # First author
        authors = author_sep_re.split(author_string, maxsplit=1)[0]
        first_author = authors.split(',')[0].strip()

        # Publication year: first run of four digits in the author line
        match = year_re.search(author_string)
        publication_year = match.group(0) if match else ''

        # Citations: always an int, 0 when there is no "Cited by" text
        citation_string = result.find(string=cited_by_re)
        citations = 0
        if citation_string:
            match = cited_by_re.search(citation_string)
            citations = int(match.group(1)) if match else 0

        all_docs.append({
            'key': key,
            'title': title,
            'first_author': first_author,
            'publication_year': publication_year,
            'citations': citations})

# The CSV-writing cell reads `docs`; bind it to the records from all pages
# rather than only those of the last page parsed.
docs = all_docs

data/H63QF7WN.html
data/TMD7W8DQ.html
data/GZXZ4IGA.html
data/K9WMR6RF.html
data/P3ETDX3I.html
data/SWUKJGGF.html
data/UE9ZZ294.html
data/FFJUS9P6.html
data/E45V75PJ.html
data/SHJEPKSX.html
data/VUFPAHSW.html
data/GE4DICZ5.html
data/9MBH562A.html
data/HTRBPRT6.html
data/SB2SFWS6.html
data/HA8ZEAE7.html
data/F2J5PNVK.html
data/SVFMZP7P.html
data/75P9JQ6V.html
data/2EWIVKJ3.html
data/P837NSJH.html
data/AWZRBAPG.html
data/7RK4I3PD.html
data/C3R3BHU3.html
data/Z5ZU7N5F.html
data/M2CPVBSQ.html
data/M6AFEGV9.html
data/FSJ2QBK5.html
data/FGUDJGWJ.html
data/QD6P743Q.html
data/KPU2APMG.html
data/KV83JQZ2.html
data/H6D9DEGM.html
data/3P7H95NR.html
data/MAHSVNFA.html
data/WKPH7E9W.html
data/TR4GHWDW.html
data/7R3AUQ62.html
data/B5Z2VQ99.html
data/SIUM545G.html
data/KQ447228.html
data/DSAAVURJ.html
data/HJSPDAQW.html
data/BSVF5X3X.html
data/IBK695T7.html
data/VP7NZ8FF.html
data/T53N2MQH.html
data/AGVNHG55.html
data/27TGU5AR.html
data/HTVGE7BG.html
data/ANKBTNPZ.html
data/JCB9D8IK.html
data/797TZXF

## Write Results to CSV file

In [85]:
# Write the extracted records in `docs` to a CSV file in output_dir.
csv_path = join(output_dir, 'citations_v1.csv')

# Explicit column list (same order as the record dicts), so the header can
# still be written when `docs` is empty.
field_names = ['key', 'title', 'first_author', 'publication_year',
               'citations']

# newline='' lets the csv module control line endings (avoids blank rows on
# Windows); UTF-8 keeps non-ASCII titles and author names writable on any
# platform.
with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, field_names)
    writer.writeheader()
    writer.writerows(docs)

In [89]:
# List each downloaded attachment's Zotero key next to the URL stored on it.
for item in all_attachments:
    item_key = item['key']
    item_url = item['data']['url']
    print(item_key, item_url)

Z5ZU7N5F https://scholar.google.com/scholar?start=180&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
VSMU3Z2R https://scholar.google.com/scholar?start=20&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
SMFS3J7J https://scholar.google.com/scholar?start=160&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
SIUM545G https://scholar.google.com/scholar?start=540&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
P3ETDX3I https://scholar.google.com/scholar?start=100&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
MAHSVNFA https://scholar.google.com/scholar?start=60&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
M75KRCAI https://scholar.google.com/scholar?start=140&q=%22species+occurrence%22+database&hl=en&as_sdt=1,14&as_ylo=2010&as_yhi=2017&as_vis=1
M2CPVBSQ https:

In [90]:
# Report how many attachment records were collected.
attachment_count = len(all_attachments)
print(attachment_count)

183
