In [None]:
import os
import re
from urllib.parse import quote

import pandas as pd
import psycopg2
from matplotlib import pyplot as plt
from PIL import Image
from tqdm import tqdm

In [None]:
#raise the pandas row display limit so longer listings can be read in full while cleaning up
pd.set_option('display.max_rows', 100)

In [None]:
#postgres connection
#each setting is read from an environment variable when one is set and otherwise falls back
#to the local default, so real credentials never have to be typed into the notebook
db_user = os.environ.get('MET_DB_USER', 'postgres')
db_password = os.environ.get('MET_DB_PASSWORD', '')
db_host = os.environ.get('MET_DB_HOST', 'localhost')
db_port = int(os.environ.get('MET_DB_PORT', 5432))
database = os.environ.get('MET_DB_NAME', 'met_data')

#percent-encode the user, password and database name so characters such as '@', ':' or '/'
#cannot break the URI apart; with the defaults above the string is unchanged
conn_str = (
    f'postgresql://{quote(db_user, safe="")}:{quote(db_password, safe="")}'
    f'@{db_host}:{db_port}/{quote(database, safe="")}'
)
conn = psycopg2.connect(conn_str)
conn.autocommit = True

In [None]:
#join object_information to raw_images on object_id and pull the object id/name together with
#each image's id, path and pixel dimensions, keeping only objects whose name contains Vase/vase
query = """SELECT 
            object_information.object_id,
            object_information.object_name,
            raw_images.id,
            raw_images.path,
            raw_images.x_pix,
            raw_images.y_pix
            FROM object_information
            JOIN raw_images
            ON object_information.object_id = raw_images.object_id
            WHERE object_information.object_name SIMILAR TO'%(V|v)ase%'
"""
#load the result set into a dataframe
raw_vases = pd.read_sql(sql=query, con=conn)

In [None]:
#count rows per object name to spot any undesired objects that made it into the df
raw_vases['object_name'].value_counts()

In [None]:
#only real issue is a number of items with fragment in the name that may not work well, so want to remove those
#collect the distinct object names that match the fragment pattern
#NOTE(review): as written, the leading \W* and the trailing t* are both optional, so the
#pattern matches any name containing "Fragmen" or "fragmen"
fragment_re = re.compile(r'\W*(F|f)ragment*')
frags = [name for name in raw_vases['object_name'].unique() if fragment_re.search(name)]


In [None]:
#keep only the rows whose object name is not in the fragments list
#.copy() makes the result its own dataframe rather than a slice of the query result, so columns
#added later are written to this frame itself
#reset_index(drop=True) renumbers the remaining rows 0..n-1, because filtering keeps the old
#labels and leaves gaps that break any lookup by row number
raw_vases = (
    raw_vases[~raw_vases['object_name'].isin(frags)]
    .copy()
    .reset_index(drop=True)
)

In [None]:
#add a height/width ratio column (y_pix divided by x_pix) to help decide what ratio to save images as
raw_vases['ratio'] = raw_vases['y_pix'].div(raw_vases['x_pix'])

In [None]:
#plot the distribution of the h/w ratio
#explicit figure/axes plus a title and axis labels so the chart can be read on its own
fig, ax = plt.subplots()
raw_vases['ratio'].hist(bins=12, ax=ax)
ax.set(
    title='Image height/width ratio',
    xlabel='ratio (y_pix / x_pix)',
    ylabel='number of images',
)
plt.show()

In [None]:
#plot h and w, together with ratio, will determine what size images are saved as
#both histograms share one axes, so draw them semi-transparent and label them; otherwise the
#second one hides the first and there is no way to tell width from height
fig, ax = plt.subplots()
ax.hist(raw_vases['x_pix'], alpha=0.6, label='width (x_pix)')
ax.hist(raw_vases['y_pix'], alpha=0.6, label='height (y_pix)')
ax.set(
    title='Image dimensions',
    xlabel='pixels',
    ylabel='number of images',
)
ax.legend()
plt.show()

In [None]:
#based on the info provided above, decided to use images at a 5:4 (h:w) ratio with h = 2000, w = 1600
#for ease of use with github, the saved copies are scaled down to w = 384, h = 480 (same 5:4 ratio)
#so that a copy of the data stays under 100 mb
new_dir = '##new_directory##'
img_size = (384, 480)  #PIL's resize takes (width, height)
os.makedirs(new_dir, exist_ok=True)  #without the folder every single save would fail

#walk the path and id columns side by side instead of looking rows up by 0..n-1, so the loop
#does not depend on the dataframe index being a gap-free range (after rows are filtered out it
#is not, and label lookups by row number then skip rows or raise KeyError)
failed = []
for img_path, img_id in tqdm(zip(raw_vases['path'], raw_vases['id']), total=len(raw_vases)):
    try:
        #the context manager closes the source file once the resized copy has been made
        with Image.open(img_path) as img:
            resized = img.resize(img_size)
        #JPEG cannot store alpha or palette images, so convert those modes to RGB first
        if resized.mode not in ('L', 'RGB', 'CMYK'):
            resized = resized.convert('RGB')
        resized.save(os.path.join(new_dir, f'vase_{img_id}.jpg'))
    except Exception as err:
        #best effort: one bad image should not stop the batch, but say which file failed and
        #why; unlike a bare except this still lets an interrupt stop the loop
        failed.append(img_path)
        tqdm.write(f'could not process {img_path}: {err!r}')

if failed:
    print(f'{len(failed)} of {len(raw_vases)} images could not be processed')

In [None]:
import shutil

#bundle the contents of the resized-image folder into a zip archive
new_dir = '##new_directory##'
shutil.make_archive(base_name='##data_directory##', format='zip', root_dir=new_dir)
