Script to create a new database with only a single table for performing full text search.

In [None]:
import sqlite3
import os
import time
import json
import numpy as np
import time

In [None]:
# Directory that holds the SQLite files. Set the ARXIV_DB_DIR environment variable
# to run the notebook on another machine; the default is the original location.
db_dir = os.environ.get("ARXIV_DB_DIR", "/home/rte/data/db")

# db1_path: the new single-table database this notebook builds for full text search.
# db2_path: the original database the rows are copied from.
db1_path = os.path.join(db_dir, "arxiv_db_images_600k_single.sqlite3")
db2_path = os.path.join(db_dir, "arxiv_db_images_600k.sqlite3")

In [None]:
# Create the single-table database file and its `single` table, unless the file
# already exists (in which case nothing is touched).
if os.path.isfile(db1_path):
    print("database file already exists")
else:
    # Connect outside the try block: if the connection itself cannot be opened there
    # is nothing to roll back or close, and the original error should surface as-is
    # instead of being masked by a NameError on `db1`.
    db1 = sqlite3.connect(db1_path)  # creates the file when it does not exist
    try:
        c1 = db1.cursor()

        # Schema of the flattened table. Compared with the wider schema tried earlier,
        # filesize, path, licence, tex, fignum, filenames and image_ids are left out.
        c1.execute('''
            CREATE TABLE single (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
            x INT, y INT, imageformat TEXT, creator TEXT, vggpred TEXT, \
            created TEXT, cat TEXT, authors TEXT, title TEXT, abstract TEXT, \
            caption TEXT, label TEXT)
        ''')

        db1.commit()
    except Exception:
        # Roll back any change if something goes wrong, then re-raise unchanged.
        # NOTE(review): the file created by connect() is left on disk, so the next run
        # takes the "already exists" branch without a table — delete it by hand to retry.
        db1.rollback()
        raise
    else:
        # Only report success when the table was really created and committed.
        print("database file created")
    finally:
        # Close the db connection; later cells open their own.
        db1.close()

In [None]:
# Open the single-table database at db1_path and create the cursor `c1`
# through which the following cells execute their statements.
db1 = sqlite3.connect(db1_path)
c1 = db1.cursor()

In [None]:
# Attach the original database file (db2_path) to this connection under the schema
# name `y`, so that later statements can refer to its tables as y.<table>.
# The path is passed as a bound parameter rather than formatted into the SQL text.
attach_sql = "ATTACH DATABASE ? AS y"
c1.execute(attach_sql, (db2_path, ))

In [None]:
# Sanity check: fetch the row of `single` with the lowest ROWID and print it.
# Nothing is printed while the table is still empty.
rows = c1.execute('''
    SELECT *
    FROM single
    ORDER BY ROWID ASC 
    LIMIT 1
''').fetchall()
for row in rows:
    print(row)

In [None]:
# Flatten the attached database `y` into `single`: one row per y.images row, extended
# with the metadata columns joined on identifier and the caption columns joined on
# images.caption = captions.id. LEFT JOINs keep images that have no metadata/caption,
# and OR REPLACE overwrites a row of `single` whose id is already present.
# The insert is not committed here; db1.commit() is called in a later cell.
insert_select_sql = '''
    INSERT or REPLACE INTO single
    SELECT images.id, images.identifier, images.filename,
    images.x, images.y, images.imageformat, images.creator, 
    images.vggpred,
    metadata.created, metadata.cat, metadata.authors, metadata.title, metadata.abstract, 
    captions.caption, captions.label
    FROM y.images
    LEFT JOIN y.metadata ON y.images.identifier == y.metadata.identifier
    LEFT JOIN y.captions ON y.images.caption == y.captions.id
    '''

c1.execute(insert_select_sql)

In [None]:
# Create the FTS5 virtual table `vsingle` used for full text search.
# It declares 11 of the columns of `single`; x, y, imageformat and created are
# not part of the index.
# NOTE: the backslashes continue the string literal across lines, so the statement
# is sent to SQLite as a single line.
vtable_sql = "CREATE VIRTUAL TABLE vsingle USING \
                fts5(id, identifier, filename, creator, vggpred, \
                cat, authors, title, abstract, caption, label)"

c1.execute(vtable_sql, )

In [None]:
# Fill the FTS table: copy the 11 indexed columns of every `single` row into vsingle.
# The column order matches the order in which vsingle was declared.
# NOTE(review): running this cell twice inserts every row twice.
vtable_sql = '''
                INSERT INTO vsingle
                SELECT id, identifier, filename, creator, vggpred,
                cat, authors, title, abstract, caption, label
                FROM single
                '''

c1.execute(vtable_sql)

In [None]:
# Smoke test: print one row of the FTS table to confirm it has been populated.
vsearch_sql = "SELECT * FROM vsingle LIMIT 1"

rows = c1.execute(vsearch_sql).fetchall()
for row in rows:
    print(row)

In [None]:
# Example full text query: bind the search term as a parameter, then print the number
# of matching rows followed by the identifier of each match.
fts = "smith"
vsearch_sql = "SELECT identifier FROM vsingle WHERE vsingle MATCH ?"
rows = c1.execute(vsearch_sql, (fts, )).fetchall()
print(len(rows))
for row in rows:
    print(row)

In [None]:
# Persist the work done on this connection so far (the rows copied into `single`
# and the rows inserted into `vsingle`).
db1.commit()

In [None]:
# Close the connection. `c1` belongs to it, so any later cell that wants to use
# `c1` needs a new connection and cursor first.
db1.close()

Fix identifiers whose category prefix had lost its `/` (e.g. `hep-th9901001` → `hep-th/9901001`)

This isn't a good way to do it, but it was quick for a small number of entries.

In [None]:
# The connection used earlier in the notebook has already been closed (db1.close()
# above), so `c1` can no longer execute statements. Open a fresh connection and
# cursor so this section also works on a top-to-bottom run.
db1 = sqlite3.connect(db1_path)
c1 = db1.cursor()

# Load (id, identifier) for every row of `single`; the identifiers are rewritten
# in the cells that follow.
sql_get_id = '''
    SELECT id, identifier
    FROM single
    '''
c1.execute(sql_get_id)
rows = c1.fetchall()
print(len(rows))

In [None]:
import re

# Matches an identifier that starts with a run of non-digit characters immediately
# followed by 7 digits, i.e. one where the "/" between prefix and number is missing.
# The prefix is not allowed to contain "/": with the looser `\D*` an identifier that
# already has its slash would match as well and end up with "//", and re-running the
# cell would add yet another slash each time.
missing_slash = re.compile(r'(^[^\d/]*)(\d{7})')

# List of (id, corrected identifier); identifiers that do not match are kept as-is.
updated_data = [
    (row_id, missing_slash.sub(r'\1/\2', identifier))
    for row_id, identifier in rows
]

In [None]:
# Print the first 100 (id, identifier) pairs to eyeball the rewrite before it is
# written back to the database.
for r in updated_data[:100]:
    print(r)

In [None]:
# Write the rewritten identifiers back to `single`, one UPDATE per row, keyed by id.
# NOTE(review): only `single` is updated here; the copies of identifier stored in the
# `vsingle` FTS table keep their old value — confirm whether it needs to be rebuilt.
sql_add_slash = '''
    UPDATE single
    SET identifier = ?
    WHERE id = ?
    '''
# updated_data holds (id, identifier) pairs, while the statement binds the
# identifier first and the id second, so each pair is swapped on the way in.
c1.executemany(
    sql_add_slash,
    ((identifier, row_id) for row_id, identifier in updated_data),
)

In [None]:
# Commit the identifier updates, then release the connection.
db1.commit()
db1.close()

Attempt 2: create the virtual table directly in the original database, without the intermediate `single` table

UNTESTED

In [None]:
# Open the original database (db2_path) directly and create the cursor `c2`
# used by the remaining cells.
db2 = sqlite3.connect(db2_path)
c2 = db2.cursor()

In [None]:
# Sanity check: fetch the row of `images` with the lowest ROWID and print it.
rows = c2.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''').fetchall()
for row in rows:
    print(row)

In [None]:
# Create the FTS5 virtual table inside the original database.
# fts5(...) takes a list of column names, not a SELECT statement, so the table is
# declared here with the 12 searchable columns and is filled by a separate
# INSERT ... SELECT. The earlier form also referred to a schema `y`, which is never
# attached on this connection.
vtable_sql = '''
        CREATE VIRTUAL TABLE vsingle USING
        fts5(id, identifier, filename, path, creator, vggpred,
        cat, authors, title, abstract, caption, label)
        '''
c2.execute(vtable_sql)

In [None]:
        # Scratch note kept as a bare string (it only evaluates to the text itself):
        # the table-qualified source columns for the 12 fields of vsingle, in order.
        '''
        images.id, images.identifier, images.filename, images.path, 
        images.creator, images.vggpred,
        metadata.cat, metadata.authors, metadata.title, metadata.abstract, 
        captions.caption, captions.label
        '''

In [None]:
# Fill the FTS table straight from the source tables of this database.
# The `single` table is created in the other database file, so presumably it does
# not exist here; the rows are built with the same joins that were used to populate
# `single` (metadata on identifier, captions on images.caption = captions.id).
# Every column is table-qualified because `id` and `caption` exist in more than one
# of the joined tables.
vtable_sql = '''
                INSERT INTO vsingle
                SELECT images.id, images.identifier, images.filename, images.path,
                images.creator, images.vggpred,
                metadata.cat, metadata.authors, metadata.title, metadata.abstract,
                captions.caption, captions.label
                FROM images
                LEFT JOIN metadata ON images.identifier == metadata.identifier
                LEFT JOIN captions ON images.caption == captions.id
                '''

c2.execute(vtable_sql)

# No other cell commits on db2, so without this the inserted rows would be
# discarded when the connection goes away.
db2.commit()

In [None]:
# Example full text query against the table built in this database: print the number
# of rows matching "ligeti", then the identifier of each match.
vsearch_sql = "SELECT identifier FROM vsingle WHERE vsingle MATCH ?"

rows = c2.execute(vsearch_sql, ("ligeti", )).fetchall()
print(len(rows))
for row in rows:
    print(row)