# Bibliography Categorization: 'BibCat'
## Notebook: Generating datasets for the bibcat class.



---

This notebook presents the scripts that generate the base datasets used for training and testing the bibcat codebase.

In [1]:
#Configure the notebook to display the value of every top-level expression in a cell,
#rather than only the last one (the "all" setting of IPython's ast_node_interactivity option)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#Import external packages
import os
import time
import json
import numpy as np

In [3]:
#Global switches that control how the databases in this notebook are generated
do_save = False #If True, will save generated databases and potentially overwrite any previous saved data
do_verbose = True #If True, will print information during database generation
do_generate_combined_json = False #If True, will generate text dataset combining raw text and classification
#
#Set filepaths for input and output; both are resolved relative to the user's home directory
_dir_home = os.path.expanduser("~")
filepath_input = os.path.join(_dir_home, "Documents/STScI_Fellowship/Functional/Library/BibTracking/Datasets")
filepath_output = os.path.join(_dir_home, "Documents/STScI_Fellowship/Functional/Library/BibTracking/scratchwork")

---

The block of code below generates a dataset that combines the raw text of each paper with its actual classifications.

In [4]:
#Define the schema and file locations specific to the combined dataset
if do_generate_combined_json:
    #Database keys
    #...keys kept from each entry of the paper-text database
    keys_papertext = ["abstract", "author", "bibcode", "body",
                      "keyword", "keyword_norm", "pubdate", "title"]
    #...papertrack column names; a name's position in this list is used as its column index
    keys_classes = ["bibcode", "name_search", "is_ignored",
                    "papertype", "mission", "year_entry"]
    #
    #Input files, read from filepath_input
    filename_papertrack = os.path.join(filepath_input, "papertrack_export_2021-08-18.csv")
    filename_papertext = os.path.join(filepath_input, "ST_Request2021_use.json")
    #
    #Output files, written to filepath_output
    filesave_json = os.path.join(filepath_output, "dataset_combined_all.json")
    filesave_notinpapertext = os.path.join(filepath_output, "bibcodes_notin_papertext.txt")
    filesave_notinpapertrack = os.path.join(filepath_output, "bibcodes_notin_papertrack.txt")
#

In [5]:
#Run this dataset procedure: join each paper's text with its papertrack
#classification(s) on bibcode, report mismatches between the two databases,
#and (if do_save is True) write the combined dataset and mismatch lists to disk
if do_generate_combined_json:
    #Load paper texts (iterated below as a collection of per-paper dictionaries)
    with open(filename_papertext) as openfile:
        dataset_papers_orig = json.load(openfile)
    #Load paper classes as strings, skipping the header row.
    #np.atleast_2d keeps the column slicing below valid even if the file holds
    #a single data row (in which case genfromtxt returns a 1D array)
    dataset_classes_orig = np.atleast_2d(
                                np.genfromtxt(filename_papertrack, delimiter=",",
                                                skip_header=1, dtype=str))
    #Extract bibcodes from each set
    bibcodes_papers = [item["bibcode"] for item in dataset_papers_orig]
    dates_papers = [item["pubdate"] for item in dataset_papers_orig]
    min_year_papers = min(dates_papers)
    max_year_papers = max(dates_papers)
    #Column positions are taken from the order of names in keys_classes
    adssearch_classes = dataset_classes_orig[:,keys_classes.index("name_search")]
    bibcodes_classes = dataset_classes_orig[:,keys_classes.index("bibcode")]
    missions_classes = dataset_classes_orig[:,keys_classes.index("mission")]
    papertypes_classes = dataset_classes_orig[:,keys_classes.index("papertype")]
    is_falsepos_classes = dataset_classes_orig[:,
                                        keys_classes.index("is_ignored")]

    #Build lookups once, so that membership tests and row matching below are
    #dictionary/set lookups instead of full scans repeated for every paper
    set_bibcodes_papers = set(bibcodes_papers)
    inds_by_bibcode = {} #Maps papertrack bibcode -> list of its row indices
    for jj, val in enumerate(bibcodes_classes):
        inds_by_bibcode.setdefault(str(val), []).append(jj)
    #

    #Print some notes
    if do_verbose:
        print("Min. date of papers within text database: {0}."
                .format(min_year_papers))
        print("Max. date of papers within text database: {0}."
                .format(max_year_papers))
        print("")
    #

    #Trim paper-text dictionaries down to only the keys to include
    #(dict.items() exists in both Python 2 and 3, so no iteritems() fallback
    # is needed)
    storage = [{key:value for key,value in thisdict.items()
                            if (key in keys_papertext)}
                            for thisdict in dataset_papers_orig]
    #

    #Verify that all papers within papertrack are within the papertext database
    bibcodes_notin_papertext = [val for val in np.unique(bibcodes_classes)
                                if (val not in set_bibcodes_papers)]
    if (len(bibcodes_notin_papertext) > 0):
        errstr = ("Note! Papers in papertrack not in text database!"
                    +"\n{0}\n{1} of {2} in all.\n"
                    .format(bibcodes_notin_papertext, len(bibcodes_notin_papertext),
                            len(bibcodes_papers)))
        #Deliberately reported rather than raised, so that generation continues
        print(errstr)
    #

    #Iterate through paper dictionaries
    bibcodes_notin_papertrack = []
    for ii, curr_dict in enumerate(storage):
        #Extract information for current paper within text database
        curr_bibcode = curr_dict["bibcode"]

        #Extract row indices for current paper within papertrack
        curr_inds = inds_by_bibcode.get(curr_bibcode, [])
        if len(curr_inds) == 0:
            print("Bibcode ({0}, {1}) not in papertrack database. Continuing..."
                    .format(ii, curr_bibcode))
            bibcodes_notin_papertrack.append(curr_bibcode)
            continue
        #

        #Copy over data from papertrack into text database
        curr_dict["class_missions"] = {}
        for curr_ind in curr_inds:
            #Fetch (or create) the inner dictionary for the current mission.
            #Each mission gets its OWN dictionary, so rows for different
            #missions no longer overwrite each other; rows that share a
            #mission accumulate their per-search flags in the same dictionary
            inner_dict = curr_dict["class_missions"].setdefault(
                                            missions_classes[curr_ind], {})
            #Store papertrack information into the inner dictionary
            inner_dict["bibcode"] = bibcodes_classes[curr_ind]
            inner_dict["papertype"] = papertypes_classes[curr_ind]
            #
            #Convert the textual false-positive flag into a boolean
            tmp_falsepos = is_falsepos_classes[curr_ind]
            if tmp_falsepos in ["True", "TRUE"]:
                tmp_bool = True
            elif tmp_falsepos in ["False", "FALSE"]:
                tmp_bool = False
            else:
                #Third format argument is the offending papertrack row; without
                #it, .format() itself fails before the ValueError can be raised
                raise ValueError("Whoa! Diff. false-pos flag {0} at {1}!\n{2}"
                    .format(tmp_falsepos, curr_ind,
                            dataset_classes_orig[curr_ind]))
            #
            inner_dict["is_ignored_{0}".format(adssearch_classes[curr_ind])
                        ] = tmp_bool #Store for specific ADS search
    #
    num_notin_papertrack = len(bibcodes_notin_papertrack)

    #Print some notes
    if do_verbose:
        print("Done generating dictionaries of combined papertrack+text data.")
        print("NOTE: {0} papers in text data that were not in papertrack."
                .format(num_notin_papertrack))
    #

    #Save the files, if so desired
    if do_save:
        #Save the combined dataset
        with open(filesave_json, 'w') as openfile:
            json.dump(storage, openfile, indent=4)
        #Also save the papertrack classifications not found in papertext
        np.savetxt(filesave_notinpapertext,
                    np.asarray(bibcodes_notin_papertext).astype(str),
                    delimiter="\n", fmt='%s')
        #Also save the paper-texts not found in papertrack
        np.savetxt(filesave_notinpapertrack,
                    np.asarray(bibcodes_notin_papertrack).astype(str),
                    delimiter="\n", fmt='%s')
        #
        #Print some notes
        if do_verbose:
            print("Combined .json file saved to:\n{0}".format(filesave_json))
            print("Bibcodes not in papertext saved to:\n{0}".format(filesave_notinpapertext))
            print("Bibcodes not in papertrack saved to:\n{0}".format(filesave_notinpapertrack))
        #
    #
#

Min. date of papers within text database: 2018-00-00.
Max. date of papers within text database: 2021-12-00.

Note! Papers in papertrack not in text database!
['1981AdSpR...1d.211L', '1981AdSpR...1h.201E', '1981AdSpR...1j..71N', '1981AdSpR...1k.169R', '1981AdSpR...1n..55H', '1982AdSpR...2d..49O', '1982AdSpR...2d.143C', '1982AdSpR...2d.157B', '1982AdSpR...2e.147K', '1982AdSpR...2e.157T', '1982AdSpR...2f..25R', '1982AdSpR...2h..51O', '1982AdSpR...2h.169K', '1982AdSpR...2h.201H', '1982AdSpR...2i.189P', '1982AdSpR...2i.293B', '1982AdSpR...2j.149L', '1982AdSpR...2j.283N', '1982AdSpR...2l.219S', '1982AdSpR...2l.259M', '1983AdSpR...3a..35R', '1983AdSpR...3f..29H', '1983AdSpR...3g..35D', '1983AdSpR...3g..45E', '1983AdSpR...3g..53M', '1983AdSpR...3h..55O', '1983AdSpR...3h..79H', '1983AdSpR...3h..85F', '1983AdSpR...3i..39B', '1983AdSpR...3i..47M', '1984AdSpR...3j.485A', '1984AdSpR...3j.491S', '1984AdSpR...3j.501B', '1984AdSpR...4d..41T', '1984AdSpR...4i.297M', '1984AdSpR...4j..47G', '1984AdSpR...

---

In [6]:
#Set end marker for this tutorial; it is printed only once execution reaches this final cell
print("This tutorial completed successfully.")

This tutorial completed successfully.
