# Getting to Grips with JSTOR data

There is a lot of data in the zip files provided by JSTOR. How to get at it properly?

In [None]:
import os
import re
import random
import sys
import pickle as p
from math import inf

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
# Root folder of the unzipped JSTOR data, expected inside the user's home directory
data_dir = f"{os.path.expanduser('~')}/jstor-unzipped"

In [None]:
# Peek at the first ten entries of the OCR folder
os.listdir(f'{data_dir}/ocr')[:10]

In [None]:
# Pick one article at random and load both its OCR text and its metadata
ocr_name = random.choice(os.listdir(data_dir + '/ocr'))
test_article = ocr_name.replace('.txt', '')

# The OCR text and the metadata share the same base name
ocr_path = os.path.join(data_dir, 'ocr', test_article + '.txt')
with open(ocr_path) as file:
    test_ocr = file.read()

meta_path = os.path.join(data_dir, 'metadata', test_article + '.xml')
with open(meta_path) as file:
    test_xml = BeautifulSoup(file.read())

In [None]:
# Show which article was picked (its file name with '.txt' removed)
print(test_article)

Looking in the metadata, the most reliable id appears to be the doi. The doi is included in each file name, and appears to also be recorded in the metadata xml.

The structure of the metadata also depends on the document type, which is also recorded in the file name.

In [None]:
# id_rgx: starting at a digit that directly follows a hyphen, capture everything
# up to (but not including) a '.txt' or '.xml' extension.
id_rgx = re.compile(
    r'(?<=-)\d+.+(?=\.txt|\.xml)'
)

# type_rgx: the leading run of non-digit characters at the start of the name,
# ending right before a hyphen that is immediately followed by a digit.
type_rgx = re.compile(
    r'^\D+(?=-\d)'
)

In [None]:
# Sanity check: the id pulled from the OCR path should equal the one pulled from the metadata path
ocr_path = os.path.join(data_dir, 'ocr', test_article + '.txt')
meta_path = os.path.join(data_dir, 'metadata', test_article + '.xml')

test_id_art = id_rgx.search(ocr_path).group(0)
test_id_meta = id_rgx.search(meta_path).group(0)

print('Success!' if test_id_art == test_id_meta else 'Too bad.')
print(f'test_id_art = {test_id_art}')
print(f'test_id_meta = {test_id_meta}')

In [None]:
# Tally the document types encoded in the metadata file names
all_types = {}
errors = []  # file names the type regex could not match
for name in os.listdir(data_dir + '/metadata'):
    type_match = type_rgx.search(name)
    if type_match is None:
        errors.append(name)
        continue
    doc_type = type_match.group(0)
    all_types[doc_type] = all_types.get(doc_type, 0) + 1

print(f'There are {len(all_types)} types of document in the corpus:')
for dt, n in all_types.items():
    print(f'   - {dt}: {n}')

Before we extract the metadata, let's see how many files there are:

In [None]:
# Compare how many files each folder holds
n_ocr_files = len(os.listdir(data_dir + "/ocr"))
print(f'Number of OCR files: {n_ocr_files}')
n_meta_files = len(os.listdir(data_dir + "/metadata"))
print(f'Number of meta files: {n_meta_files}')

Weirdly, it seems there are 10,000 metadata files with no corresponding text file. The full text appears to be available for only about 400 of the books.

Let's do the easy part first: extract all the article_ids from the text files.

In [None]:
def extract_doi_from_txt(directory):
    """Map the doi embedded in each file name of ``directory`` to that file name.

    The doi is the part of the name that starts at the first digit directly
    following a hyphen and ends before the ``.txt``/``.xml`` extension.
    Names that do not fit this pattern (for example stray system files) are
    skipped rather than aborting the whole scan.

    Arguments:
    directory (str): directory holding the JSTOR txt files

    Returns:
    dict: doi (str) -> file name (str)
    """
    id_rgx = re.compile(r'(?<=-)\d+.+(?=\.txt|\.xml)')

    ocr_ids = {}
    for name in os.listdir(directory):
        id_match = id_rgx.search(name)
        if id_match is None:
            # Not a JSTOR document file; nothing to extract
            continue
        ocr_ids[id_match.group(0)] = name
    return ocr_ids

Now let's extract the metadata from the xml files.

For the book-chapters, we will want to capture data about which book they're from, and whether it is a solo monograph or a collection. I am inclined to count a 'collection' as many texts, and a solo monograph as a single text.

Actually, now that I think about it, do I really need to count the corpus? What's the use of that information...?

It seems that although each book-chapter is treated as a separate object in the data, the metadata file is the metadata for the entire book.

In [None]:
# Pick one book's metadata file at random to explore how its xml is laid out
book_files = [name for name in os.listdir(data_dir + '/metadata') if name.startswith('book')]
test_book = random.choice(book_files)

with open(f'{data_dir}/metadata/{test_book}') as xml_file:
    test_soup = BeautifulSoup(xml_file.read())

In [None]:
# ocr_ids = extract_doi_from_txt(data_dir + '/ocr')
# corpus_meta, error = extract_jstor_meta(data_dir + '/metadata')

In [None]:
# Save and load the metadata.
# To refresh the pickle, uncomment the dump below and run it once.

# with open('corpus-meta_20200329.p', 'wb') as file:
#     p.dump(corpus_meta, file)

# NOTE: unpickling can execute arbitrary code, so only load a pickle you created yourself.
with open('corpus-meta_20200329.p', 'rb') as pickle_file:
    corpus_meta = p.load(pickle_file)

In [None]:
# Count the documents of each type, and how many lack a title or a year
all_types = {}
no_titles = 0
no_years = 0
for doc_dict in corpus_meta.values():
    kind = doc_dict['type']
    all_types[kind] = all_types.get(kind, 0) + 1
    if 'title' not in doc_dict:
        no_titles += 1
    if 'year' not in doc_dict:
        no_years += 1

print(f'There are {len(all_types)} kinds of document in the corpus:')
for doc_type, n in all_types.items():
    print(f'   - {doc_type}: {n}')
print(f'Of the {len(corpus_meta)} documents in the corpus, {no_titles} lack titles, and {no_years} lack dates.')

In [None]:
# Count documents per year, ignoring documents that have no year recorded
year_data = {}
for doc_dict in corpus_meta.values():
    if 'year' not in doc_dict:
        continue
    year_int = int(doc_dict['year'])
    year_data[year_int] = year_data.get(year_int, 0) + 1

# One row per year, ordered chronologically
year_df = pd.DataFrame(list(year_data.items()), columns=['year', 'n'])
year_df.sort_values('year', inplace=True)

In [None]:
# Plot the yearly document counts for years after 1945, up to and including 2015
in_range = (year_df.year > 1945) & (year_df.year <= 2015)
year_df[in_range].plot(x='year', y='n')
plt.show()

## Build input pipeline for Gensim

Gensim allows you to stream data when, as in this case, you might have too much to fit in memory.

In [None]:
class JSTORCorpus:
    """Iterator for streaming articles from JSTOR DfR corpus into Gensim.

    ``corpus_meta`` maps the path of each text file to a dict of metadata,
    which may hold the keys 'type', 'title', 'year' and 'part-of'.
    Iterating over the corpus yields the text of one file at a time, so the
    texts never have to be held in memory all at once. The filter methods
    replace ``corpus_meta`` and therefore also narrow what iteration yields.
    """

    # Pulls the doi out of a file name: from the first digit directly after a
    # hyphen up to the .txt/.xml extension.
    _ID_RGX = re.compile(r'(?<=-)\d+.+(?=\.txt|\.xml)')

    def __init__(self, meta_dir, data_dir, corpus_meta=None):
        """Set up the corpus, parsing the metadata unless it is supplied.

        Arguments:
        meta_dir (str): directory where metadata files are held
        data_dir (str): directory where data files are held
        corpus_meta (dict): previously extracted metadata; when None, the
            metadata is extracted from meta_dir straight away
        """
        self.meta_dir = meta_dir
        self.data_dir = data_dir
        self.corpus_meta = corpus_meta

        if self.corpus_meta is None:
            self.extract_jstor_meta(self.meta_dir, self.data_dir)

    def __iter__(self):
        """Yield the full text of each document in corpus_meta, one at a time."""
        for key in self.corpus_meta:
            # Keys of corpus_meta are the paths of the text files
            with open(key) as file:
                yield file.read()

    @staticmethod
    def _store_text(doc_dict, key, tag):
        """Store the text of ``tag`` under ``key`` in ``doc_dict``, unless ``tag`` is None."""
        if tag is not None:
            doc_dict[key] = tag.get_text()

    @classmethod
    def _chapter_title_tag(cls, meta_xml, name):
        """Return the title tag of the book part belonging to file ``name``, or None."""
        id_match = cls._ID_RGX.search(name)
        if id_match is None:
            return None
        # Sometimes each book-part is labelled simply with the internal id, and
        # sometimes with the doi, so match on the part of the doi that follows
        # its last underscore.
        book_id = re.sub('.+_', '', id_match.group(0))
        book_rgx = re.compile(re.escape(book_id))
        part_id = meta_xml.find('book-part-id', string=book_rgx)
        if part_id is None:
            return None
        return part_id.parent.find('title')

    def extract_jstor_meta(self, meta_dir, data_dir):
        """Loops over directory of JSTOR metadata files, extracts key info from xml

        Only .xml files that have a matching .txt file in data_dir are parsed;
        all other entries are counted as skipped. The result replaces
        self.corpus_meta, keyed by the path of each text file.

        Arguments:
        meta_dir (str): directory where metadata files are held
        data_dir (str): directory where data files are held
        """

        self.corpus_meta = {}

        parsed = 0
        skipped = 0

        print(f'Parsing xml files in {meta_dir}. Associated .txt in {data_dir}')

        # The metadata file contains many documents without a text file. We don't want that!
        actual_docs = set(os.listdir(data_dir))

        for name in tqdm(os.listdir(meta_dir)):

            # Infer name of data file and check
            stem, ext = os.path.splitext(name)
            txt_file = stem + '.txt'  # replace .xml with .txt
            if ext.lower() != '.xml' or txt_file not in actual_docs:
                skipped += 1
                continue

            # Locate data file
            data_file = os.path.join(data_dir, txt_file)

            # Read in metadata file
            with open(os.path.join(meta_dir, name)) as file:
                meta_xml = BeautifulSoup(file.read())

            # Get key metadata
            doc_dict = {}

            # For articles:
            if name.startswith('journal-article'):
                # NOTE(review): this lookup relies on the parser wrapping the
                # xml in <html><body> -- confirm which parser is in use
                doc_dict['type'] = meta_xml.html.body.article['article-type']
                self._store_text(doc_dict, 'title', meta_xml.find(['article-title', 'source']))
                self._store_text(doc_dict, 'year', meta_xml.find('year'))

            # For book chapters:
            elif name.startswith('book-chapter'):
                doc_dict['type'] = 'book-chapter'
                # First book-id element is id of whole book
                self._store_text(doc_dict, 'part-of', meta_xml.find('book-id'))
                self._store_text(doc_dict, 'year', meta_xml.find('year'))
                # A chapter whose title cannot be located is stored without one
                self._store_text(doc_dict, 'title', self._chapter_title_tag(meta_xml, name))

            # Store in corpus_meta dict
            self.corpus_meta[data_file] = doc_dict

            # Increment counter
            parsed += 1

        # Success message
        print(f'{parsed} documents parsed successfully. {skipped} documents skipped.')

    def filter_corpus_by_year(self, min_year=1750, max_year=inf):
        """Filters the corpus according to minimum and maximum years

        Keeps the documents whose year lies between min_year and max_year,
        both inclusive. Documents without a year, or with a year that cannot
        be read as an integer, are dropped. The result replaces
        self.corpus_meta.

        Arguments:
        min_year (int)
        max_year (int)"""

        filtered_corpus = {}

        for key, val_dict in self.corpus_meta.items():
            # Skip files that cannot be parsed
            try:
                year = int(val_dict['year'])
            except (KeyError, ValueError):
                continue
            # Apply conditions
            if min_year <= year <= max_year:
                filtered_corpus[key] = val_dict

        self.corpus_meta = filtered_corpus

    def filter_corpus_by_type(self, allowed_types):
        """Filters the corpus by doctype.

        Documents with no recorded type are dropped. The result replaces
        self.corpus_meta.

        Arguments:
        allowed_types (list): a list of strings with the allowed doc_types"""

        self.corpus_meta = {
            key: val_dict
            for key, val_dict in self.corpus_meta.items()
            if 'type' in val_dict and val_dict['type'] in allowed_types
        }
        
            