# Reading List of IR Data from Documents (CSV File list)

This section contains a modified example based on the [reading documents page](http://chemdataextractor.org/docs/reading) of the ChemDataExtractor (CDE) documentation.

A simple csv file containing the details of a series of documents is included (articles_list_ir.csv).

Several of the functions used in the cde_read_files.py example are reused here; the main change is the way the list of files to process is acquired (two functions for reading and writing CSV files are included).

In [1]:
# The line of code (LOC) below imports the document object from the CDE library 
from chemdataextractor import Document

# import library for managing files
from pathlib import Path
import sys

# A function for getting a list of files from the directory
# This will be modified to get the list from a csv file
def get_files_list(source_dir):
    """Return the PDF files found directly inside a directory.

    Args:
        source_dir: a pathlib.Path (or any object offering a compatible
            ``glob`` method) for the directory to scan.

    Returns:
        A list with the entries matching ``*.pdf``, in sorted order.
    """
    # sorted() already builds the list; the old per-file counter was never read
    return sorted(source_dir.glob('*.pdf'))

# A function for counting the unique chemical entity mentions
# returns a dictionary mapping each mention text to its occurrence count
def get_uniques(cde_doc):
    tally = {}
    for mention in cde_doc.cems:
        label = mention.text
        # start from zero the first time a label is seen
        tally[label] = tally.get(label, 0) + 1
    return tally

# A function for getting the entity with the most occurrences
# returns two values: the entity name (line breaks replaced by spaces)
# and its count; ("", 0) when no entry has a count above zero
def get_max(uniques):
    best_label = ""
    best_count = 0
    for label, count in uniques.items():
        # only a strictly larger count wins, so the first of several ties is kept
        if count <= best_count:
            continue
        best_count = count
        best_label = label.replace('\n', ' ')
    return best_label, best_count

# A function which reads a list of files from a directory
# and performs a basic analysis of the documents looking
# for the most mentioned entity
def cde_read_pdfs(pdf_path = "./pdfs"):
    """Print a short entity summary for every PDF in a directory.

    The list of files found is printed first. Then, for each file, one line
    is printed with the file name, the number of distinct entity mentions
    and the most frequent mention together with its count.

    Args:
        pdf_path: directory that holds the PDF files.
    """
    pdf_dir = Path(pdf_path)
    files_list = get_files_list(pdf_dir)
    print(files_list)
    for a_file in files_list:
        file_name = a_file.name
        # the context manager closes the handle as soon as the document is
        # built; the original left one open file behind per PDF
        with open(a_file, 'rb') as pdf_f:
            doc = Document.from_file(pdf_f)
        uniques = get_uniques(doc)
        max_lbl, max_val = get_max(uniques)
        print(file_name, "Unique entities:", len(uniques), "Most common entity:", max_lbl, max_val)
        
# import library for managing csv files
import csv

# get the data from the csv_file, assuming the id column holds integers
def get_csv_data(input_file, id_field):
    """Load a CSV file into a dictionary keyed by an integer id column.

    Args:
        input_file: path of the CSV file; its first line is the header.
        id_field: name of the column whose value, converted with ``int``,
            becomes the key of each row.

    Returns:
        A tuple ``(csv_data, fieldnames)``: ``csv_data`` maps each id to its
        row (column name -> string value) and ``fieldnames`` is the list of
        column names from the header line.

    Raises:
        KeyError: if ``id_field`` is not one of the columns.
        ValueError: if an id value cannot be converted to ``int``.
    """
    with open(input_file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        csv_data = {int(row[id_field]): row for row in reader}
        # take the names from the header itself so that a file with a header
        # but no data rows still reports its columns; fieldnames is None for
        # a completely empty file
        fieldnames = list(reader.fieldnames or [])
    return csv_data, fieldnames

# writes the rows held in a dictionary to a new csv file with the given name
# the header is the union of the keys of all rows, in first-seen order
def write_csv_data(values, filename):
    # a dict doubles as an ordered set: keys keep first-seen order, no repeats
    header = {}
    for record in values.values():
        for column in record.keys():
            header.setdefault(column, None)
    # write everything out to a new csv file
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(header))
        writer.writeheader()
        for record in values.values():
            writer.writerow(record)

The use of the function for reading articles from the csv file is shown below. The get_csv_data function returns two values: a dictionary with the contents of the file, keyed by the integer id of each row, and a simple list of the column headers.

In [2]:
# Load the article index (rows keyed by their integer "id") and the column names
articles_list, column_names = get_csv_data("./articles_list_ir.csv", "id")
# NOTE(review): the trailing "an" in the message below looks like a stray typo;
# it is left untouched so the code still matches the recorded output
print("The first article in the list:an\n\t", articles_list[1])
print("The names of the columns in the file:\n\t", column_names)

The first article in the list:an
	 OrderedDict([('id', '1'), ('filename', 'IRSpectroscopy/1-s2.0-S0920586114004441-main.pdf'), ('title', 'On the nature of the active Au species: CO oxidation on cyanideleached Au/TiO2catalysts'), ('doi', '10.1016/j.cattod.2014.06.021'), ('url', '')])
The names of the columns in the file:
	 ['id', 'filename', 'title', 'doi', 'url']


Modified version reading from the csv file

In [3]:
# A function which reads a list of files from a csv file
# and performs a basic analysis of the documents looking
# for the most mentioned entity
# modified version of the one which reads from directory
def cde_read_pdfs_csv(csv_name = "./articles_list_ir.csv"):
    """Print a short entity summary for every article listed in a CSV file.

    For each row, the PDF named in the ``filename`` column is parsed and one
    line is printed with the article title, the number of distinct entity
    mentions and the most frequent mention together with its count.

    Args:
        csv_name: path of a CSV file with ``id``, ``filename`` and ``title``
            columns.
    """
    # only the rows are needed here; the list of column names is discarded
    articles_list, _ = get_csv_data(csv_name, "id")
    for article in articles_list.values():
        file_name = article['filename']
        file_title = article['title']
        # the context manager closes the handle as soon as the document is
        # built; the original left one open file behind per article
        with open(file_name, 'rb') as pdf_f:
            doc = Document.from_file(pdf_f)
        uniques = get_uniques(doc)
        max_lbl, max_val = get_max(uniques)
        print(file_title, "Unique entities:", len(uniques), "Most common entity:", max_lbl, max_val)

In [4]:
# Run the csv-driven summary; it prints one line per article listed in the file
cde_read_pdfs_csv("./articles_list_ir.csv")

On the nature of the active Au species: CO oxidation on cyanideleached Au/TiO2catalysts Unique entities: 150 Most common entity: Au 172
The effect of reaction conditions on the stability of Au/CeZrO4 catalysts in the low-temperature water–gas shift reaction Unique entities: 74 Most common entity: Au 126
Insights into the Activation Effect of H2 Pretreatment on Ag/Al2O3 Catalyst for the Selective Oxidation of Ammonia Unique entities: 125 Most common entity: Ag 186


Now we can modify the function to return serialised data, which may contain IR data or other types of data

In [5]:
# A function which reads a list of files from a csv file
# and extracts structured data using the
# CDE serialisation functions
def cde_read_data(csv_name = "./articles_list_ir.csv"):
    """Collect the serialised CDE records of every article listed in a CSV file.

    Args:
        csv_name: path of a CSV file with ``id``, ``filename`` and ``title``
            columns.

    Returns:
        A dictionary mapping each article title to the result of
        ``doc.records.serialize()`` for that article. Rows sharing a title
        overwrite one another, so only the last one is kept.
    """
    # only the rows are needed here; the list of column names is discarded
    articles_list, _ = get_csv_data(csv_name, "id")
    data_list = {}
    for article in articles_list.values():
        file_name = article['filename']
        file_title = article['title']
        # the context manager closes the handle as soon as the document is
        # built; the original left one open file behind per article
        with open(file_name, 'rb') as pdf_f:
            doc = Document.from_file(pdf_f)
        data_list[file_title] = doc.records.serialize()
    return data_list

In [7]:
# Print each article title followed by every record stored for that article
for art_title, records in data_sets.items():
    print(art_title)
    for element in records:
        print(element)

On the nature of the active Au species: CO oxidation on cyanideleached Au/TiO2catalysts
{'names': ['c o m / l o c a t e / c a t t o d']}
{'names': ['[ 1–3 ]']}
{'names': ['[ 4–6 ]']}
{'names': ['[ 9–11 ]']}
{'names': ['[ 4–6,9,16–23 ]']}
{'names': ['Max-Planck-Gesellschaft']}
{'names': ['Fe2O3']}
{'names': ['Au(OH)3']}
{'names': ['methanol']}
{'names': ['{ 1 1 0 }']}
{'names': ['{ 1 0 0 }']}
{'names': ['Au1+']}
{'names': ['CO oxi-']}
{'names': ['oxide']}
{'names': ['HAuCl4·3H2O']}
{'names': ['NaOH']}
{'names': ['NaCN']}
{'names': ['NiCr']}
{'names': ['Ni']}
{'names': ['H2 5.0']}
{'names': ['H2']}
{'names': ['Na']}
{'names': ['h CO oxid']}
{'names': ['CN- leachi']}
{'names': ['cat 1']}
{'names': ['O(1)s']}
{'names': ['Au(4f) region']}
{'names': ['cat 1-O4']}
{'names': ['cat 1a-O4']}
{'names': ['1 1a 1a 1a 1b 1b 2 2 Support']}
{'names': ['N o t']}
{'names': ['n n o i t i d n o c e r']}
{'names': ['r e t f a y t i v i t c a']}
{'names': ['/ n m 0 0 0 1 r e t f a']}
{'names': ['4 − 0 1 / n

None of the data elements appears to be IR data or other kinds of structured data.