Script to create a new database with only a single table for performing full text search.

In [20]:
import sqlite3
import os
import time
import json
import numpy as np
import time

In [21]:
db1_path = "/home/rte/data/db/arxiv_db_images_600k_single.sqlite3"
db2_path = "/home/rte/data/db/arxiv_db_images_600k.sqlite3"

In [22]:
if not os.path.isfile(db1_path):
    try:
        # create a database in RAM
        # db = sqlite3.connect(':memory:')
        # creates or opens a file database
        db1 = sqlite3.connect(db1_path)

        # get cursor object and create metadata table
        c1 = db1.cursor()
        
        
        # create single (images) table
#         c1.execute('''
#             CREATE TABLE single (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
#             filesize INT, path TEXT, x INT, y INT, imageformat TEXT, creator TEXT, vggpred TEXT, \
#             created TEXT, cat TEXT, authors TEXT, title TEXT, abstract TEXT, licence TEXT, \
#             tex TEXT, fignum TEXT, caption TEXT, label TEXT, filenames TEXT, image_ids TEXT)
#         ''')

        c1.execute('''
            CREATE TABLE single (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
            x INT, y INT, imageformat TEXT, creator TEXT, vggpred TEXT, \
            created TEXT, cat TEXT, authors TEXT, title TEXT, abstract TEXT, \
            caption TEXT, label TEXT)
        ''')

        db1.commit()

    except Exception as e:
        # Roll back any change if something goes wrong
        db1.rollback()
        raise e
    finally:
        # Close the db connection
        db1.close()
        print("database file created")
else:
    print("database file already exists")

database file created


In [23]:
# Here we import the sqlite3 database and create a cursor
db1 = sqlite3.connect(db1_path)
c1 = db1.cursor()

In [24]:
# attach original database
attach_sql = "ATTACH DATABASE ? AS y"
c1.execute(attach_sql, (db2_path, ))

<sqlite3.Cursor at 0x7fe2148e3e30>

In [25]:
# test by getting one row
c1.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''')
rows = c1.fetchall()
for row in rows:
    print(row)

(80, 'cond-mat0606359', 'fig9.eps', 16939, './0606/cond-mat0606359', 423, 634, 'PS', 'wktse@leblon with xmgr', 'slide_rule 0.1208,menu 0.1112,oscilloscope 0.1074,web_site 0.0858,rule 0.0849', 76)


In [26]:
insert_select_sql = ('''
    INSERT or REPLACE INTO single
    SELECT images.id, images.identifier, images.filename,
    images.x, images.y, images.imageformat, images.creator, 
    images.vggpred,
    metadata.created, metadata.cat, metadata.authors, metadata.title, metadata.abstract, 
    captions.caption, captions.label
    FROM y.images
    LEFT JOIN y.metadata ON y.images.identifier == y.metadata.identifier
    LEFT JOIN y.captions ON y.images.caption == y.captions.id
    ''')

c1.execute(insert_select_sql, )

<sqlite3.Cursor at 0x7fe2148e3e30>

In [27]:
# create virtual table
vtable_sql = "CREATE VIRTUAL TABLE vsingle USING \
                fts5(id, identifier, filename, creator, vggpred, \
                cat, authors, title, abstract, caption, label)"

c1.execute(vtable_sql, )

<sqlite3.Cursor at 0x7fe2148e3e30>

In [28]:
# insert rows into virtual table
vtable_sql = '''
                INSERT INTO vsingle
                SELECT id, identifier, filename, creator, vggpred,
                cat, authors, title, abstract, caption, label
                FROM single
                '''

c1.execute(vtable_sql, )

<sqlite3.Cursor at 0x7fe2148e3e30>

In [29]:
vsearch_sql = "SELECT * FROM vsingle LIMIT 1"

c1.execute(vsearch_sql, )
rows = c1.fetchall()
for row in rows:
    print(row)

(80, 'cond-mat0606359', 'fig9.eps', 'wktse@leblon with xmgr', 'slide_rule 0.1208,menu 0.1112,oscilloscope 0.1074,web_site 0.0858,rule 0.0849', 'cond-mat.soft', 'Tse, Wang-Kong; Leung, P. T.', 'Theory of Light Emission in Sonoluminescence as Thermal Radiation', "  Based on the model proposed by Hilgenfeldt {\\it at al.} [Nature {\\bf 398},\n401 (1999)], we present here a comprehensive theory of thermal radiation in\nsingle-bubble sonoluminescence (SBSL). We first invoke the generalized\nKirchhoff's law to obtain the thermal emissivity from the absorption\ncross-section of a multilayered sphere (MLS). A sonoluminescing bubble, whose\ninternal structure is determined from hydrodynamic simulations, is then\nmodelled as a MLS and in turn the thermal radiation is evaluated. Numerical\nresults obtained from simulations for argon bubbles show that our theory\nsuccessfully captures the major features observed in SBSL experiments.\n", '\\small FWHM obtained from UBM+WOM versus wavelength\n$\\lam

In [30]:
vsearch_sql = "SELECT identifier FROM vsingle WHERE vsingle MATCH ?"
fts = "smith"
c1.execute(vsearch_sql, (fts, ))
rows = c1.fetchall()
print(len(rows))
for row in rows:
    print(row)

7095
('cond-mat0606309',)
('astro-ph0606633',)
('astro-ph0606175',)
('astro-ph0606175',)
('astro-ph0606175',)
('math0606061',)
('astro-ph0606702',)
('astro-ph0606110',)
('astro-ph0606110',)
('astro-ph0606321',)
('astro-ph0606321',)
('astro-ph0606211',)
('astro-ph0606211',)
('astro-ph0606092',)
('astro-ph0606300',)
('astro-ph0606144',)
('hep-ph0606081',)
('cs0606008',)
('physics0109016',)
('astro-ph0109087',)
('astro-ph0109488',)
('astro-ph0109482',)
('astro-ph0109189',)
('astro-ph0512108',)
('cond-mat0512104',)
('cs0512069',)
('cs0512069',)
('cs0512069',)
('cs0512069',)
('astro-ph0512623',)
('astro-ph0512183',)
('nucl-ex0512006',)
('1111.0302',)
('1111.6186',)
('1111.3119',)
('1111.2210',)
('1111.0103',)
('1111.1100',)
('1111.0691',)
('1111.0691',)
('1111.4439',)
('1111.5731',)
('1111.5731',)
('1111.5731',)
('1111.5731',)
('1111.0521',)
('1111.2133',)
('1111.6398',)
('1111.0560',)
('1111.4427',)
('1111.5028',)
('1111.0004',)
('1111.5031',)
('1111.3886',)
('1111.4985',)
('1111.4985',)
(

('astro-ph0404393',)
('astro-ph0404393',)
('astro-ph0404393',)
('1511.04849',)
('1511.01220',)
('1511.01718',)
('1511.03078',)
('1511.08686',)
('1511.06167',)
('1511.06167',)
('1511.06951',)
('1511.08039',)
('1511.08039',)
('1511.01682',)
('1511.01032',)
('1511.05574',)
('1511.00934',)
('1511.06930',)
('1511.08812',)
('1511.08812',)
('1511.08812',)
('1511.08812',)
('1511.00941',)
('1511.02872',)
('1511.05855',)
('1511.04590',)
('1511.05181',)
('1511.05181',)
('1511.02233',)
('1511.02233',)
('0812.1120',)
('0812.1120',)
('0812.0978',)
('0812.3103',)
('0812.3733',)
('0812.0023',)
('0812.3455',)
('0812.4543',)
('0812.2943',)
('0812.1566',)
('0812.1566',)
('0812.1253',)
('0812.0870',)
('0812.1150',)
('0812.1150',)
('0812.1150',)
('astro-ph0310313',)
('astro-ph0310843',)
('cond-mat0310669',)
('cond-mat0310669',)
('0905.2933',)
('0905.1328',)
('0905.1328',)
('0905.1328',)
('0905.1058',)
('0905.3175',)
('0905.3030',)
('0905.3654',)
('0905.4952',)
('0905.2044',)
('0905.3746',)
('0905.3746',)
(

In [31]:
db1.commit()

In [32]:
db1.close()

Attempt 2 - just creating virtual table on same database

UNTESTED

In [None]:
db2 = sqlite3.connect(db2_path)
c2 = db2.cursor()

In [None]:
# test by getting one row
c2.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''')
rows = c2.fetchall()
for row in rows:
    print(row)

In [None]:
# create virtual table
vtable_sql = '''
        CREATE VIRTUAL TABLE vsingle USING
        fts5(SELECT images.id, images.identifier, images.filename, images.filesize, images.path, 
        images.x, images.y, images.imageformat, images.creator, 
        images.vggpred,
        metadata.created, metadata.cat, metadata.authors, metadata.title, metadata.abstract, 
        metadata.licence, captions.tex, captions.fignum, captions.caption, captions.label, 
        captions.filenames, captions.image_ids
        FROM y.images
        LEFT JOIN y.metadata ON y.images.identifier == y.metadata.identifier
        LEFT JOIN y.captions ON y.images.caption == y.captions.id)
        '''
c2.execute(vtable_sql, )

In [None]:
        '''
        images.id, images.identifier, images.filename, images.path, 
        images.creator, images.vggpred,
        metadata.cat, metadata.authors, metadata.title, metadata.abstract, 
        captions.caption, captions.label
        '''

In [None]:
# insert rows into virtual table
vtable_sql = '''
                INSERT INTO vsingle
                SELECT id, identifier, filename, path, creator, vggpred,
                cat, authors, title, abstract, caption, label
                FROM single
                '''

c2.execute(vtable_sql, )

In [None]:
vsearch_sql = "SELECT identifier FROM vsingle WHERE vsingle MATCH ?"

c2.execute(vsearch_sql, ("ligeti", ))
rows = c2.fetchall()
print(len(rows))
for row in rows:
    print(row)