# DB Fixes

Back up your database first!

After inserting the entries into the database, apply some minor fixes to clean up the data:
- check which images have no matching metadata
- go through all the OAI XML files to find this data
- add it into the database

NB: there could be bugs in this notebook, and it might require manual setup depending on the dataset and metadata.

In [None]:
import os
import sqlite3
import sys
import xml.etree.ElementTree as ET

Connect to database

In [None]:
# Path to the SQLite database file that holds the `images` and `metadata`
# tables used throughout this notebook.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"

# Open the connection and create a cursor; `db` and `c` are reused by the
# cells below and closed in the last cell.
db = sqlite3.connect(db_path)
c = db.cursor()

Check that we can access the table pragma info

In [None]:
# Fetch the column layout of the `metadata` table in a single chained call.
info = c.execute('PRAGMA TABLE_INFO({})'.format("metadata")).fetchall()

# Header naming the fields of each PRAGMA row, followed by one row per column.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

In [None]:
# Fetch the column layout of the `images` table in a single chained call.
info = c.execute('PRAGMA TABLE_INFO({})'.format("images")).fetchall()

# Header naming the fields of each PRAGMA row, followed by one row per column.
print("\nColumn Info:\nID, Name, Type, NotNull, DefaultVal, PrimaryKey")
for col in info:
    print(col)

Find where there is no matching metadata for a given image by comparing identifiers. Print and also keep this list in the `rows` array for next step.

In [None]:
# Collect every image identifier that has no row in `metadata`, together with
# the number of images sharing that identifier. The LEFT JOIN keeps all image
# rows; the IS NULL filter keeps only those without a metadata match.
rows = c.execute('''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    ''').fetchall()

# Each row is (image_count, identifier); `rows` is reused by later cells.
for row in rows:
    print(row)

In [None]:
# Total number of images without metadata: add up the per-identifier counts.
print(sum(image_count for image_count, _identifier in rows))

There should only be a few entries that don't have metadata. In our case there were only 4.

Next step is to download these entries manually using web requests. See https://arxiv.org/help/oa

Example web url: http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv
(replace with your identifier and metadata prefix)

After downloading these, we can check if we've got all matching files:

In [None]:
# Work inside the directory that holds the downloaded OAI XML files.
os.chdir("/home/rte/arXiv/oai/")

# Print every identifier from `rows` whose XML file is not present here.
for filename in [identifier for _count, identifier in rows]:
    if os.path.isfile("oai:arXiv.org:" + filename + ".arXiv.xml"):
        continue
    print(filename)

Some manual cleaning of specific database entries. Create a list with the identifiers that we need, parse through the XML to get the required details, then insert these into the database.

In [None]:
# Identifiers whose metadata still has to be parsed and inserted, written the
# way they appear in the downloaded file names ("%2F" is a percent-encoded "/").
# "acc-phys9607002" is left out here; it is inserted semi-manually in a later cell.
filenames = [
    "math%2F0402448",
    "math%2F0609138",
    "nucl-th%2F0501065",
]

In [None]:
# Parse the downloaded OAI XML file for each identifier in `filenames` and
# insert one row per file into the `metadata` table.
# Relies on `filenames`, `c` and `db` from earlier cells, and on `sys` being
# imported with the other modules at the top of the notebook.

os.chdir("/home/rte/arXiv/oai/")

# XML namespaces in ElementTree's "{uri}tag" notation.
# OAI is kept for reference; nothing in this cell looks it up.
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

# Parameterised insert; the values are bound by sqlite3, never string-formatted.
INSERT_METADATA_SQL = (
    "INSERT INTO metadata "
    "(identifier, created, cat, authors, title, abstract, licence) "
    "VALUES (?, ?, ?, ?, ?, ?, ?)"
)


def _format_authors(authors_element):
    """Join all authors into one "keyname, forenames; keyname, forenames" string.

    Args:
        authors_element: the arXiv ``authors`` element, or None when the
            record has no such element.

    Returns:
        The joined author string; "" when there is no authors element or it
        has no children. Missing or empty forenames become "".

    Raises:
        AttributeError: if an author entry has no ``keyname`` child.
    """
    if authors_element is None:
        return ""

    names = []
    for author in authors_element:
        # An empty <keyname/> has text None; treat it as an empty string.
        keyname = author.find(ARXIV + "keyname").text or ""
        # findtext() yields "" for both a missing and an empty <forenames>.
        forenames = author.findtext(ARXIV + "forenames", default="")
        names.append(keyname + ", " + forenames)

    # Join the raw strings directly. Slicing repr() of a list would double
    # backslashes and escape quotes, mangling names such as 'M\"uller'.
    return "; ".join(names)


def _parse_record(root):
    """Extract the fields for one `metadata` row from a parsed XML root.

    find() with a bare tag only searches direct children, so the arXiv
    fields are expected directly under ``root``.

    Returns:
        Tuple (identifier, created, categories, authors, title, abstract,
        licence). The licence is "" when the element is absent or empty.

    Raises:
        AttributeError: if id, created, categories, title or abstract is
            not found under ``root``.
    """
    return (
        root.find(ARXIV + "id").text,
        root.find(ARXIV + "created").text,
        root.find(ARXIV + "categories").text,
        _format_authors(root.find(ARXIV + "authors")),
        root.find(ARXIV + "title").text,
        root.find(ARXIV + "abstract").text,
        root.findtext(ARXIV + "license", default=""),
    )


for filename in filenames:
    filepath = "oai:arXiv.org:" + filename + ".arXiv.xml"

    if not os.path.isfile(filepath):
        print(filename + " does not exist at this lcoation")
        continue

    data = ET.parse(filepath)
    print("opening file: " + filepath)
    root = data.getroot()

    try:
        # Keep these as notebook-level names: a later cell reads them.
        # `lic` rather than `license`, which would shadow the builtin helper.
        identifier, date, categories, authors, title, abstract, lic = \
            _parse_record(root)

        print("-" * 20)
        print(identifier)
        print(date)
        print(categories)
        print(authors)
        print(title)
        print(abstract)
        print(lic)
        print("-" * 20)

        c.execute(
            INSERT_METADATA_SQL,
            (identifier, date, categories, authors, title, abstract, lic),
        )

    except KeyboardInterrupt:
        # Keep the rows inserted so far, then stop. Any other exception
        # propagates unchanged and leaves the pending inserts uncommitted.
        db.commit()
        sys.exit()

# finally commit the changes
db.commit()
        

In [None]:
# Replace `filenames` with the one identifier that is still missing.
# The insert cell that follows does not read this variable.
filenames = ["acc-phys9607002"]

In [None]:
# Semi-manual insert for the last entry, which could not be added
# automatically with the parsing loop above.
#
# Only `identifier` and `date` are set in this cell. `categories`, `authors`,
# `title`, `abstract` and `lic` are not assigned here; they keep whatever
# values the parsing loop left behind for the last file it processed.
# NOTE(review): this looks like it stores another paper's details under this
# identifier -- confirm, and set those five values explicitly before running.

identifier = "acc-phys/9607002"
date = "1996-07-11"


# Parameterised insert; the change is only persisted by the commit cell below.
c.execute("INSERT INTO metadata (identifier, created, cat, authors, title, abstract, licence) \
VALUES (?, ?, ?, ?, ?, ?, ?)", \
(identifier, date, categories, authors, title, abstract, lic))

In [None]:
# Persist the semi-manual insert from the previous cell.
db.commit()

In [None]:
# Release the cursor first, then the connection. Nothing is committed here,
# so run the commit cell above before closing.
c.close()
db.close()