In [64]:
import os
from glob import glob
import pandas as pd
import json
import re
from pathlib import Path

General data structure plan here is to create a dict for each paper, each key of which contains some sort of information about the paper. So, we have the standard metadata (DOI, date, title etc), and also possibly counts of certain regex matches, with keys for the regexes (or nicer names).

I'm not sure how to approach it when we need to store information about each individual match...

In [33]:
def get_article_metadata(folder_path):
    json_filename = str(folder_path / 'results.json')
    
    with open(json_filename) as f:
        j = json.load(f)
        
    selected_keys = ['doi', 'title', 'date', 'journal']
        
    filtered_dict = { k: j[k]['value'][0] for k in selected_keys }
    
    return filtered_dict

In [72]:
def count_regex_matches(pattern, fname, flags=0):
    with open(fname) as f:
        text = f.read()
        #print(text)
        
        count = len(re.findall(pattern, text, flags=flags))
        
    return count

In [100]:
def count_multiple_regexes(regexes, fname, flags=0):
    res = {}
    
    for item in regexes:
        try:
            regex, name = item
        except (TypeError, ValueError):
            regex = item
            name = item
        res[name] = count_regex_matches(regex, fname, flags=0)
        
    return res

In [109]:
def process_article(folder):
    # If not a valid CM folder then return
    if not (folder / 'results.json').exists():
        return None
    
    # Get the metadata first
    results = get_article_metadata(folder)
    
    # Now we can actually do the processing!
    
    # As an example we will do a simple regex and count the results
    # This works for a single regex
    #results['Landsat'] = count_regex_matches('landsat', str(folder / 'fulltext.xml'))
    
    # To do multiple regexes nice and easily we can do the following
    
    # Set up a list of regexes (and optional nice names for them)
    regexes = [('FLAASH'),
               ('ATCOR'),
               ('SMAC'),
               ('6S', '6S'),
               ('empirical line'),
               ('\bELM\b')]
    
    regex_stats = count_multiple_regexes(regexes, str(folder / 'fulltext.xml'), flags=re.IGNORECASE)
    
    results.update(regex_stats)
    
    
    
    return results

In [119]:
p = Path()
folders = p.glob('mdpi-rs/**/')

results = [process_article(folder) for folder in folders]

# Filter out the None's...there must be a better way to do this!
results = filter(None, results)
results = pd.DataFrame(list(results))
results.date = pd.to_datetime(results.date)

In [127]:
results['month'] = pd.DatetimeIndex(results.date).month

In [133]:
results.groupby('month').apply(lambda x: (x > 0).sum())

Unnamed: 0_level_0,ELM,6S,ATCOR,FLAASH,SMAC,date,doi,empirical line,journal,title,month
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,0,1,2,1,0,39,39,0,39,39,39
3,0,2,1,7,1,60,60,1,60,60,60


In [131]:
(results.FLAASH > 0).sum()

8