# Slice DB

Code to reduce a database to the rows needed for a given number of randomly selected images. It reads a list of image IDs and copies the matching data from the original database into a new, smaller one.

In [None]:
import sqlite3
import os
import time
import json
import numpy as np
import time

In [None]:
# Path of the reduced database that the cells below create and fill.
db2_path = "/home/rte/data/db/arxiv_db_images_600k.sqlite3"

In [None]:
# Read the image list file and keep the first NUM_INDEXES entries; these
# entries decide which image rows get copied further down.
image_list = "/home/rte/data/paths/all_images_shuf.txt"
NUM_INDEXES = 600000
filepaths = []

with open(image_list, "r") as list_file:
    lines = list_file.readlines()
    print("length of image text file:", len(lines))

# One entry per line, stripped of surrounding whitespace and the newline.
filepaths.extend(entry.strip() for entry in lines[:NUM_INDEXES])
print("length of filepaths:", len(filepaths))

In [None]:
# Create the reduced database with empty metadata, images and captions
# tables. Nothing happens when the file already exists, so re-running this
# cell never touches an existing database.
if not os.path.isfile(db2_path):
    # Connect outside the try block: if connecting fails there is no
    # connection to roll back or close, and the original error must not be
    # masked by a NameError on an unbound `db2`.
    db2 = sqlite3.connect(db2_path)
    try:
        # get cursor object and create metadata table
        c2 = db2.cursor()
        c2.execute('''
            CREATE TABLE metadata(id INTEGER PRIMARY KEY, identifier TEXT, created TEXT, \
            cat TEXT, authors TEXT, title TEXT, abstract TEXT, licence TEXT)
        ''')

        # create images table
        c2.execute('''
            CREATE TABLE images (id INTEGER PRIMARY KEY, identifier TEXT, filename TEXT, \
            filesize INT, path TEXT, x INT, y INT, imageformat TEXT, creator TEXT)
        ''')

        # create captions table
        c2.execute('''
            CREATE TABLE "captions" ("id" INTEGER, "identifier" TEXT, "tex" TEXT, \
            "fignum" TEXT, "caption" TEXT, "label" TEXT, "filenames" TEXT, "image_ids" TEXT, PRIMARY KEY("id"))
        ''')

        db2.commit()
    except Exception:
        # Roll back any change if something goes wrong, then re-raise with
        # the original traceback intact.
        db2.rollback()
        raise
    else:
        # Only report success when every statement above succeeded.
        print("database file created")
    finally:
        # Close the db connection; later cells open their own.
        db2.close()
else:
    print("database file already exists")

In [None]:
# Here we import the sqlite3 database and create a cursor
db_path = "/home/rte/data/db/arxiv_db_images.sqlite3"
db = sqlite3.connect("file:" + db_path + "?mode=ro", uri=True)
c = db.cursor()

db2 = sqlite3.connect(db2_path)
c2 = db2.cursor()

In [None]:
# test by getting one row
# Sanity check: read the first row (lowest ROWID) of the source images table
# and print it.
rows = c.execute('''
    SELECT * 
    FROM images 
    ORDER BY ROWID ASC 
    LIMIT 1
''').fetchall()
for record in rows:
    print(record)

### copy rows for images table

This looks up the row of every listed image in the first database and copies it to the second.

In [None]:
# Copy the images row of every listed file from the source database into the
# reduced database. The image id is the part of the entry before ".jpg".
start = time.time()

# Both statements are loop-invariant, so they are defined once up front.
select_sql = '''
    SELECT *
    FROM images
    WHERE id = ?
    '''
insert_sql = '''
    INSERT or REPLACE INTO images
    (id, identifier, filename, filesize, path, x, y, imageformat, creator)
    VALUES (?,?,?,?,?,?,?,?,?)
    '''

missing = 0
for f in filepaths:
    image_id = f.split(".jpg")[0]
    rows = c.execute(select_sql, (image_id,)).fetchall()
    if not rows:
        # No row for this image in the source database. Unpacking an empty
        # result into execute() used to call it without parameters and fail.
        missing += 1
        continue
    # executemany takes the fetched row tuples as they are.
    c2.executemany(insert_sql, rows)
db2.commit()

if missing:
    print(f"{missing} listed images were not found in the source database")
print("process took {} seconds".format(time.time() - start))

### copy rows for metadata table

In [None]:
# Attach the source database to the reduced database's connection under the
# schema name DB1, so that its tables can be read with plain SQL through c2.
attach_sql = "ATTACH DATABASE ? AS DB1"
c2.execute(attach_sql, (db_path,))

In [None]:
# Copy the complete metadata table from the attached source database (DB1)
# into the reduced database.
duplicate_sql = '''
    INSERT INTO metadata SELECT * FROM DB1.metadata
    '''
c2.execute(duplicate_sql)
# Commit before detaching: the INSERT runs inside a transaction that also
# involves DB1, and SQLite refuses to detach a database that is part of an
# open transaction ("database DB1 is locked").
db2.commit()
c2.execute("DETACH DATABASE DB1")

In [None]:
# Count the metadata rows that have no matching row in the images table.
# The filter has to test a column of the right-hand (images) side of the
# LEFT JOIN: metadata.id is the primary key of the left-hand table and is
# never NULL, so filtering on it always produced an empty result.
test_sql = '''
    SELECT DISTINCT metadata.id 
    FROM metadata 
    LEFT JOIN images ON metadata.identifier = images.identifier
    WHERE images.id IS NULL;
    '''
c2.execute(test_sql)
rows = c2.fetchall()
print(len(rows))

In [None]:
# Count the metadata rows whose identifier also occurs in the images table.
test_sql = '''
    SELECT DISTINCT metadata.id 
    FROM metadata 
    WHERE metadata.identifier IN (SELECT images.identifier from images)
    '''
rows = c2.execute(test_sql).fetchall()
print(len(rows))

In [None]:
# Fetch the identifier of every row in the images table and report how many
# rows came back.
test_sql = '''
    SELECT images.identifier from images
    '''
rows = c2.execute(test_sql).fetchall()
print(len(rows))

In [None]:
# Print every row of the most recent query result, one per line.
for row in rows:
    print(row)

In [None]:
# Remove every metadata row whose identifier does not occur in the images
# table, and report how long that took.
# NOTE(review): NOT IN matches nothing when the subquery yields a NULL, so
# this deletes no rows if images.identifier can be NULL - confirm the data.
delete_sql = '''
    DELETE 
    FROM metadata 
    WHERE metadata.identifier NOT IN (SELECT images.identifier from images)
    '''

start = time.time()
c2.execute(delete_sql)
db2.commit()
elapsed = time.time() - start

print("process took {} seconds".format(elapsed))

In [None]:
# Find where there is no matching metadata for a given image

# List the identifiers used in the images table that have no metadata row,
# together with the number of image rows per identifier.
orphan_rows = c2.execute('''
    SELECT count(images.identifier), images.identifier
    FROM images
    LEFT JOIN metadata ON metadata.identifier = images.identifier
    WHERE metadata.identifier IS NULL
    GROUP BY images.identifier
    ''').fetchall()
rows = orphan_rows
for entry in rows:
    print(entry)

### copy data from captions table

#### duplicate

In [None]:
# Attach the source database again as DB1, this time for copying the
# captions table.
attach_sql = "ATTACH DATABASE ? AS DB1"
c2.execute(attach_sql, (db_path,))

In [None]:
# drop table
c2.execute("DROP TABLE CAPTIONS")
c2.execute('''
    CREATE TABLE "captions" ("id" INTEGER, "identifier" TEXT, "tex" TEXT, \
    "fignum" TEXT, "caption" TEXT, "label" TEXT, "filenames" TEXT, "image_ids" TEXT, PRIMARY KEY("id"))
''')

In [None]:
# Copy the complete captions table from the attached source database (DB1)
# into the reduced database.
duplicate_sql = '''
    INSERT INTO captions SELECT * FROM DB1.captions
    '''
c2.execute(duplicate_sql)
# Commit before detaching: while the INSERT's transaction is still open,
# SQLite rejects the DETACH with "database DB1 is locked".
db2.commit()
c2.execute("DETACH DATABASE DB1")

In [None]:
# Commit any pending transaction on the reduced database. Kept as its own
# cell so it can be run by hand after a "database DB1 is locked" error.
db2.commit()

Quick version, just check if captions.identifier is in metadata.identifier (this leaves many rows that we will never need)

In [None]:
# start = time.time()

# delete_sql = '''
#     DELETE 
#     FROM captions 
#     WHERE captions.identifier NOT IN (SELECT metadata.identifier from metadata)
#     '''
# c2.execute(delete_sql)
# db2.commit()

# print("process took {} seconds".format(time.time() - start))

Read all of `captions.image_ids` and check where these match up with `images.id`

In [None]:
# get all captions from database
find_sql = '''
    SELECT id, image_ids
    FROM captions
    '''
c2.execute(find_sql)
captions = c2.fetchall()
print(f'total number of caption entries: {len(captions)}')

In [None]:
# diagnostics
print(captions[:100])
print(len(captions))
for caption_id, caption_image_ids in captions[:3]:
    print(caption_id, caption_image_ids)

In [None]:
# get all images from database
image_sql = '''
    SELECT id
    FROM images
    '''
c2.execute(image_sql)
images = c2.fetchall()
print(f'total number of image ids: {len(images)}')

In [None]:
# Show the first ten fetched rows; each one is a 1-tuple holding an image id.
for image in images[:10]:
    print(image)

In [None]:
# Flatten the fetched 1-tuples into a plain list of image ids.
all_images = [id_row[0] for id_row in images]

In [None]:
# Diagnostics: number of image ids, the first twenty, the very first one and
# the last ten.
print(len(all_images))
print(all_images[:20])
print(all_images[0])
print(all_images[-10:])

Go through all of the captions and check whether one or more `image_ids` are listed. If so, check for each of them whether that `image_id` is in the 600k list of images, and save the matching `caption_id` in a list. Later, delete any entries in the `captions` table that aren't in this list.

NB: This will take a long time.

`matches = [caption_id, c_id]`

In [None]:
# Collect one [caption_id, image_id] entry for every image id listed in a
# caption's image_ids field that is also present in `all_images`.
matches = []
# Separator between the ids inside image_ids: a backslash followed by a pipe.
# The raw string has the same value as the former "\|" literal but avoids the
# invalid-escape-sequence warning.
check_string = r"\|"
# Set lookups are O(1); testing against the list scanned it for every id.
known_image_ids = set(all_images)
for caption_id, caption_image_ids in captions:
    if caption_image_ids is None:
        # caption without any associated images
        continue
    # get each individual c_id from the string of image_ids
    for c_id in caption_image_ids.split(check_string):
        if not c_id.strip():
            # empty fragment (e.g. empty field): there is no id to convert
            continue
        if int(c_id) in known_image_ids:
            # create an entry for each match - caption id, associated image id
            # (the image id is stored as the string found in the field)
            matches.append([caption_id, c_id])
print(f'Found a total of {len(matches)} matches')

In [None]:
# with open('caption_matches.json', 'w') as outfile:
#     json.dump(matches, outfile)

In [None]:
# Load the caption/image matches from caption_matches.json. This replaces
# whatever `matches` currently holds with the content of the file.
with open('caption_matches.json', 'r') as infile:
    matches = json.load(infile)

For each caption id in `captions`, check whether it also appears in `matches`; if not, delete the caption entry from the db.

In [None]:
# Delete every caption whose id does not appear in `matches`.
a_matches = np.array(matches, dtype=int)
# First column of each match entry: the caption id.
matches_c_ids = a_matches[:, 0]
print(matches_c_ids)
print(len(matches_c_ids))
print(matches_c_ids.shape)

delete_sql = '''
    DELETE FROM captions
    WHERE id = ?
'''

begin_timer = time.time()
# A set of plain Python ints gives O(1) lookups and does not depend on
# `matches` or `captions` being sorted. The former merge-style scan relied on
# that ordering and never deleted captions whose id was larger than the
# largest matched caption id.
keep_ids = set(matches_c_ids.tolist())
stale_ids = [(c_id,) for c_id, _ in captions if c_id not in keep_ids]
c2.executemany(delete_sql, stale_ids)
db2.commit()

# Number of captions that are referenced by at least one match (kept rows).
num_matches = len(captions) - len(stale_ids)
print(num_matches)
print(f'total time taken: {time.time() - begin_timer}')

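For each image referenced in `matches`, store the id of its caption in a new `caption` column of the `images` table. If several captions reference the same image, the one that comes first in `matches` is kept.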

In [None]:
# create a new column in images with caption_id
sql_alter = '''
            ALTER TABLE images
            ADD caption INTEGER
            '''
c2.execute(sql_alter, )
db2.commit()

In [None]:
# Write the caption id of every match into the caption column of its image.
sql_set_caption = '''
        UPDATE images
        SET caption = ?
        WHERE id = ?
'''

start = time.time()
# Matches are applied back to front: a later UPDATE overwrites an earlier
# one, so for an image with several matches the earliest entry wins.
updates = (
    (caption_id, image_id) for caption_id, image_id in reversed(matches)
)
c2.executemany(sql_set_caption, updates)
print(f'Done. Time {time.time() - start}')
db2.commit()

Check how many entries have a caption

In [None]:
# Report how many image rows have a caption id and how many do not.
rows = c2.execute("SELECT id, caption FROM images").fetchall()
print(len(rows))

# Rows whose caption column is still NULL have no caption assigned.
count = sum(1 for _, caption in rows if caption is None)
print(f'total number of missing entries {count}')
print(f'number of entries with caption {len(rows) - count}')

#### Clean up

In [None]:
# Rebuild the database file with VACUUM so that the space freed by the
# deleted rows is released.
c2.execute("VACUUM")
db2.commit()