Quick code to copy only some of the data over from SQLite database to make things quicker and simpler for building web app

In [1]:
import sqlite3
import os
import time

In [2]:
db2_path = "/home/rte/data/db/arxiv_db_images_600k.sqlite3"

In [3]:
image_list = "/home/rte/data/paths/all_images_shuf.txt"
filepaths = []

NUM_INDEXES = 600000

with open(image_list, "r") as f:
    lines = f.readlines()
    print("length of image text file:",len(lines))
for l in lines[:NUM_INDEXES]:
    # substrings = l.rsplit(",", 1)
    filepaths.append(l.strip())
    # image_ids.append(substrings[1].strip())
print("length of filepaths:", len(filepaths))

length of image text file: 12694664
length of filepaths: 600000


In [4]:
if not os.path.isfile(db2_path):
    try:
        # create a database in RAM
        # db = sqlite3.connect(':memory:')
        # creates or opens a file database
        db2 = sqlite3.connect(db2_path)

        # get cursor object and create metadata table
        c2 = db2.cursor()
        c2.execute('''
            CREATE TABLE metadata(id INTEGER PRIMARY KEY, identifier TEXT, created TEXT, \
            cat TEXT, authors TEXT, title TEXT, abstract TEXT, licence TEXT)
        ''')

        # create images table
        c2.execute('''
            CREATE TABLE images (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
            filesize INT, path TEXT, x INT, y INT, imageformat TEXT, creator TEXT)
        ''')

        # create captions table
        c2.execute('''
            CREATE TABLE "captions" ("id" INTEGER, "identifier" TEXT, "tex" TEXT, \
            "fignum" TEXT, "caption" TEXT, "label" TEXT, "filenames" TEXT, "image_ids" TEXT, PRIMARY KEY("id"))
        ''')

        db2.commit()

    except Exception as e:
        # Roll back any change if something goes wrong
        db2.rollback()
        raise e
    finally:
        # Close the db connection
        db2.close()
        print("database file created")
else:
    print("database file already exists")

database file already exists


In [5]:
# Here we import the sqlite3 database and create a cursor
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"
db = sqlite3.connect("file:" + db_path + "?mode=ro", uri=True)
c = db.cursor()

db2 = sqlite3.connect(db2_path)
c2 = db2.cursor()

In [6]:
c.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''')
rows = c.fetchall()
for row in rows:
    print(row)

(1, 'cond-mat0606521', 'lay_f2.eps', 29311, './0606/cond-mat0606521', 250, 180, 'PS', '-')


### copy rows for images table

In [None]:
start = time.time()

for f in filepaths[:]:
    sql = ('''
        SELECT *
        FROM images
        WHERE id IS ?
        ''')
    image_id = f.split(".jpg")[0]
#     print(image_id)
    c.execute(sql, (image_id, ))
    rows = c.fetchall()
#     for row in rows:
#         print(row)

    sql = ('''
        INSERT or REPLACE INTO images
        (id, identifier, filename, filesize, path, x, y, imageformat, creator)
        VALUES (?,?,?,?,?,?,?,?,?)
        ''')
    
    c2.execute(sql, *rows)
db2.commit()

print("process took {} seconds".format(time.time() - start))

### copy rows for metadata table

In [None]:
attach_sql = "ATTACH DATABASE ? AS DB1"
c2.execute(attach_sql, (db_path,))

In [None]:
duplicate_sql = '''
    INSERT INTO metadata SELECT * FROM DB1.metadata
    '''
c2.execute(duplicate_sql)
c2.execute("DETACH DATABASE DB1")
db2.commit()

In [None]:
test_sql = '''
    SELECT DISTINCT metadata.id 
    FROM metadata 
    LEFT JOIN images ON metadata.identifier = images.identifier
    WHERE metadata.id IS NULL;
    '''
c2.execute(test_sql)
rows = c2.fetchall()
print(len(rows))

In [None]:
test_sql = '''
    SELECT DISTINCT metadata.id 
    FROM metadata 
    WHERE metadata.identifier IN (SELECT images.identifier from images)
    '''
c2.execute(test_sql)
rows = c2.fetchall()
print(len(rows))

In [None]:
test_sql = '''
    SELECT images.identifier from images
    '''
c2.execute(test_sql)
rows = c2.fetchall()
print(len(rows))

In [None]:
for row in rows:
    print(row)

In [None]:
start = time.time()

delete_sql = '''
    DELETE 
    FROM metadata 
    WHERE metadata.identifier NOT IN (SELECT images.identifier from images)
    '''
c2.execute(delete_sql)
db2.commit()

print("process took {} seconds".format(time.time() - start))

In [None]:
# Find where there is no matching metadata for a given image

c2.execute('''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    ''')
rows = c2.fetchall()
for row in rows:
    print(row)

### copy data from captions table

#### duplicate

In [48]:
attach_sql = "ATTACH DATABASE ? AS DB1"
c2.execute(attach_sql, (db_path,))

<sqlite3.Cursor at 0x7f25bc5ac810>

In [49]:
# drop table
c2.execute("DROP TABLE CAPTIONS")
c2.execute('''
    CREATE TABLE "captions" ("id" INTEGER, "identifier" TEXT, "tex" TEXT, \
    "fignum" TEXT, "caption" TEXT, "label" TEXT, "filenames" TEXT, "image_ids" TEXT, PRIMARY KEY("id"))
''')

<sqlite3.Cursor at 0x7f25bc5ac810>

In [50]:
duplicate_sql = '''
    INSERT INTO captions SELECT * FROM DB1.captions
    '''
c2.execute(duplicate_sql)
c2.execute("DETACH DATABASE DB1")
db2.commit()

OperationalError: database DB1 is locked

In [51]:
# in case DB1 is locked
db2.commit()

Quick version, just check if captions.identifier is in metadata.identifier (this leaves many rows that we will never need)

In [None]:
# start = time.time()

# delete_sql = '''
#     DELETE 
#     FROM captions 
#     WHERE captions.identifier NOT IN (SELECT metadata.identifier from metadata)
#     '''
# c2.execute(delete_sql)
# db2.commit()

# print("process took {} seconds".format(time.time() - start))

Read all of `captions.image_ids` and check where these match up with `images.id`

In [95]:
# get all captions from database
find_sql = '''
    SELECT id, image_ids
    FROM captions
    '''
c2.execute(find_sql)
captions = c2.fetchall()
print(f'total number of caption entries: {len(captions)}')

total number of caption entries: 7392343


In [98]:
# diagnostics
print(captions[:100])
print(len(captions))
for caption_id, caption_image_ids in captions[:3]:
    print(caption_id, caption_image_ids)

[(1, '4'), (2, '6'), (3, '7'), (4, '3'), (5, '8'), (6, '5'), (7, '10'), (8, '9'), (9, '11'), (10, '12'), (11, '14'), (12, '13'), (13, '17'), (14, '18'), (15, '15'), (16, '16'), (17, '22'), (18, '21'), (19, '20'), (20, '19'), (21, '23'), (22, '24'), (23, '25'), (24, '26'), (25, '27'), (26, '28'), (27, '30'), (28, '32'), (29, '31'), (30, '29'), (31, '42'), (32, '38'), (33, '36'), (34, '41'), (35, '34'), (36, '33'), (37, '35'), (38, '40'), (39, '39'), (40, '37'), (41, '43'), (42, '45'), (43, '46'), (44, '47'), (45, '44'), (46, '48'), (47, None), (48, None), (49, None), (50, '49'), (51, '53'), (52, '56'), (53, '60'), (54, '65'), (55, '61'), (56, '63'), (57, '52'), (58, '66'), (59, '59'), (60, '54'), (61, '68'), (62, '55\\|58'), (63, '51\\|62\\|67'), (64, '57'), (65, '50\\|64'), (66, None), (67, '69'), (68, '87'), (69, '88'), (70, '70'), (71, '82'), (72, '71'), (73, '75'), (74, '86'), (75, '77'), (76, '80'), (77, '76'), (78, '84'), (79, '74'), (80, '85'), (81, '83'), (82, '81'), (83, '89'),

In [53]:
# get all images from database
image_sql = '''
    SELECT id
    FROM images
    '''
c2.execute(image_sql)
images = c2.fetchall()
print(f'total number of image ids: {len(images)}')

total number of image ids: 600000


In [99]:
for image in images[:10]:
    print(image)

(6907525,)
(6896440,)
(6887309,)
(6887316,)
(6885999,)
(6901264,)
(6895830,)
(6895833,)
(6901093,)
(6892386,)


In [40]:
all_images = []
for image in images[:]:
#     print(image[0])
    all_images.append(image[0])

In [87]:
print(len(all_images))
print(all_images[:20])
print(all_images[0])

600000
[6907525, 6896440, 6887309, 6887316, 6885999, 6901264, 6895830, 6895833, 6901093, 6892386, 6887055, 6891895, 6888405, 6894299, 6894300, 6904756, 6906633, 6897656, 6900020, 6900024]
6907525


Go through all of the captions, check if there is one or more `image_ids` listed. If so, for each of those check if that `image_id` is in the 600k list of images. Then save the matching `caption_id` in a list. Later, delete any entries in the `captions` table that isn't in this list.

`matches = [caption_id, c_id]`

In [None]:
matches = []
check_string = "\|"
for caption_id, caption_image_ids in captions[:]:
#     print(caption_id, caption_image_ids)
    if caption_image_ids != None:
#         print(f'{caption[1]}: {caption[0].split(check_string)}')
#         match = False
        # get each individual c_id from the string of image_ids
        for c_id in caption_image_ids.split(check_string):
#             print(f'{caption[1]}: {image_id}')
#             print(f'{type(caption[1])}: {type(image_id)}')
            if int(c_id) in all_images:
#                 match = True
#                 print("found match")
                # create an entry for each match - caption id, associated image id, 
                match_entry = [caption_id, c_id]
                matches.append(match_entry)
print(f'Found a total of {len(matches)} matches')

In [None]:
import json
with open('caption_matches.json', 'w') as outfile:
    json.dump(matches, outfile)

In [105]:
print(matches[:10])

[[76, '80'], [133, '142'], [135, '151'], [217, '256'], [222, '275'], [222, '276'], [226, '283'], [253, '304'], [263, '314'], [264, '316']]


For each `c_id` entry in `all_captions`, check if also in `matches`, if not delete from db.  

For each `image_id` in `all_images`, check if there is already a `caption_id`, if not, add the `c_id`.

In [104]:
print(len(matches))

27


In [77]:
for image in all_images[:5]:
    print(f'image {image} is type {type(image)}')

image 6907525 is type <class 'int'>
image 6896440 is type <class 'int'>
image 6887309 is type <class 'int'>
image 6887316 is type <class 'int'>
image 6885999 is type <class 'int'>


In [83]:
matches[:10]

['80', '142', '151', '256', '275', '276', '283', '304', '314', '316']

In [None]:
for i, row in enumerate(rows):
    for image in images:
        for 

In [None]:
delete_sql = '''
    
'''
for match in matches:
    