Quick code to copy only a subset of the data from the full SQLite database into a smaller one, to make building the web app quicker and simpler.

In [None]:
import sqlite3
import os
import time

In [None]:
# Path of the SQLite file that the cells below create and fill with the
# reduced copy of the data.
db2_path = "/home/rte/data/db/arxiv_db_images_600k.sqlite3"

In [None]:
# Read the shuffled list of image paths and keep only the first NUM_INDEXES
# entries; the later cells copy database rows for exactly these images.
NUM_INDEXES = 600000
image_list = "/home/rte/data/paths/all_images_shuf.txt"

with open(image_list, "r") as handle:
    all_lines = handle.readlines()
print("length of image text file:", len(all_lines))

# One path per line: drop the trailing newline and any surrounding whitespace.
filepaths = [entry.strip() for entry in all_lines[:NUM_INDEXES]]
print("length of filepaths:", len(filepaths))

In [None]:
# Create the destination database with its three empty tables (metadata,
# images, captions). If the file already exists it is left untouched.
if not os.path.isfile(db2_path):
    # Connect *outside* the try block: if the connection itself fails there is
    # no handle to roll back or close, and the original error should surface
    # unchanged rather than being masked by a NameError in the cleanup code.
    db2 = sqlite3.connect(db2_path)
    try:
        c2 = db2.cursor()

        # create metadata table
        c2.execute('''
            CREATE TABLE metadata(id INTEGER PRIMARY KEY, identifier TEXT, created TEXT, \
            cat TEXT, authors TEXT, title TEXT, abstract TEXT, licence TEXT)
        ''')

        # create images table
        c2.execute('''
            CREATE TABLE images (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
            filesize INT, path TEXT, x INT, y INT, imageformat TEXT, creator TEXT)
        ''')

        # create captions table
        c2.execute('''
            CREATE TABLE "captions" ("id" INTEGER, "identifier" TEXT, "tex" TEXT, \
            "fignum" TEXT, "caption" TEXT, "label" TEXT, "filenames" TEXT, "image_ids" TEXT, PRIMARY KEY("id"))
        ''')

        db2.commit()
    except Exception:
        # Roll back any pending change, then re-raise with the original
        # traceback intact.
        db2.rollback()
        raise
    else:
        # Only report success when every statement above actually succeeded.
        print("database file created")
    finally:
        # Always release the connection; later cells reopen it.
        db2.close()
else:
    print("database file already exists")

In [None]:
# Source database: opened through a read-only URI (mode=ro) so nothing in
# this notebook can modify it.
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"
source_uri = "file:{}?mode=ro".format(db_path)
db = sqlite3.connect(source_uri, uri=True)
c = db.cursor()

# Destination database: the reduced copy, opened with default (read-write)
# settings.
db2 = sqlite3.connect(db2_path)
c2 = db2.cursor()

In [None]:
# Sanity check: print the first row (lowest ROWID) of the source images table
# to see what a record looks like before copying anything.
first_row_sql = '''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
'''
rows = c.execute(first_row_sql).fetchall()
for record in rows:
    print(record)

### copy rows for images table

In [None]:
# Copy the images rows for every selected file from the source database (c)
# into the destination database (c2 / db2), then commit once at the end.
start = time.time()

# Both statements are loop-invariant, so build them once up front.
select_sql = '''
    SELECT *
    FROM images
    WHERE id IS ?
    '''
insert_sql = '''
    INSERT or REPLACE INTO images
    (id, identifier, filename, filesize, path, x, y, imageformat, creator)
    VALUES (?,?,?,?,?,?,?,?,?)
    '''

missing_ids = []
for filepath in filepaths:
    # Everything before ".jpg" in the listed path is used as images.id.
    image_id = filepath.split(".jpg")[0]
    row = c.execute(select_sql, (image_id,)).fetchone()
    if row is None:
        # No such id in the source table. Previously this fell through to an
        # INSERT with no bound parameters, which raised and aborted the whole
        # copy; now the id is recorded and the loop carries on.
        missing_ids.append(image_id)
        continue
    c2.execute(insert_sql, row)
db2.commit()

if missing_ids:
    print("no source row found for {} image ids".format(len(missing_ids)))
print("process took {} seconds".format(time.time() - start))

### copy rows for metadata table

In [None]:
# Expose the source database to the destination connection under the schema
# name DB1, so rows can be copied across with a single SQL statement.
attach_sql = "ATTACH DATABASE ? AS DB1"
attach_params = (db_path,)
c2.execute(attach_sql, attach_params)

In [None]:
# Copy every row of the attached source metadata table into the destination
# metadata table. Rows that are not needed are pruned in a later cell.
duplicate_sql = '''
    INSERT INTO metadata SELECT * FROM DB1.metadata
    '''
c2.execute(duplicate_sql)
# Commit BEFORE detaching: the INSERT implicitly opens a transaction, and
# SQLite rejects DETACH on a database that is still involved in an open
# transaction. Committing first releases DB1 so the detach can succeed.
db2.commit()
c2.execute("DETACH DATABASE DB1")

In [None]:
# Count the metadata rows that have NO matching image (anti-join).
# The NULL test has to be on the right-hand (images) side of the LEFT JOIN:
# metadata.id is the primary key of the left-hand table, so the previous
# filter `metadata.id IS NULL` could never match and always reported 0.
test_sql = '''
    SELECT DISTINCT metadata.id
    FROM metadata
    LEFT JOIN images ON metadata.identifier = images.identifier
    WHERE images.identifier IS NULL;
    '''
c2.execute(test_sql)
rows = c2.fetchall()
print(len(rows))

In [None]:
# Count the distinct metadata rows whose identifier also appears in the
# images table.
matched_sql = '''
    SELECT DISTINCT metadata.id 
    FROM metadata 
    WHERE metadata.identifier IN (SELECT images.identifier from images)
    '''
rows = c2.execute(matched_sql).fetchall()
print(len(rows))

In [None]:
# Fetch the identifier of every row in the destination images table and
# report how many there are. `rows` is kept for the next cell to print.
identifiers_sql = '''
    SELECT images.identifier from images
    '''
rows = c2.execute(identifiers_sql).fetchall()
print(len(rows))

In [None]:
# Print each result tuple held in `rows`, one per line.
for result_row in rows:
    print(result_row)

In [None]:
# Remove every metadata row whose identifier does not occur in the images
# table, and report how long the delete took.
start = time.time()

prune_metadata_sql = '''
    DELETE 
    FROM metadata 
    WHERE metadata.identifier NOT IN (SELECT images.identifier from images)
    '''
c2.execute(prune_metadata_sql)
db2.commit()

elapsed = time.time() - start
print("process took {} seconds".format(elapsed))

In [None]:
# List image identifiers that have no metadata row, together with the number
# of images sharing each such identifier.
orphan_images_sql = '''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    '''
rows = c2.execute(orphan_images_sql).fetchall()
for orphan in rows:
    print(orphan)

### copy data from captions table

#### duplicate

In [None]:
# Attach the source database again under the schema name DB1, this time to
# copy the captions table.
attach_sql = "ATTACH DATABASE ? AS DB1"
attach_params = (db_path,)
c2.execute(attach_sql, attach_params)

In [None]:
# Copy every row of the attached source captions table into the destination
# captions table. Rows that are not needed are pruned in a later cell.
duplicate_sql = '''
    INSERT INTO captions SELECT * FROM DB1.captions
    '''
c2.execute(duplicate_sql)
# Commit BEFORE detaching: the INSERT implicitly opens a transaction, and
# SQLite rejects DETACH on a database that is still involved in an open
# transaction. Committing first releases DB1 so the detach can succeed.
db2.commit()
c2.execute("DETACH DATABASE DB1")

Quick version: keep only the captions whose identifier also appears in metadata.identifier (this still leaves many rows that we will never need).

In [None]:
# Remove every captions row whose identifier does not occur in the metadata
# table, and report how long the delete took.
start = time.time()

prune_captions_sql = '''
    DELETE 
    FROM captions 
    WHERE captions.identifier NOT IN (SELECT metadata.identifier from metadata)
    '''
c2.execute(prune_captions_sql)
db2.commit()

elapsed = time.time() - start
print("process took {} seconds".format(elapsed))