In [1]:
from PIL import Image
import pandas as pd
import requests
import psycopg2
import io 
from ratelimiter import RateLimiter
from tqdm import tqdm
from ast import literal_eval

In [2]:
#postgres connection
db_user = 'postgres'
db_password = ''
db_host = 'localhost'
db_port = 5432
database = 'met_data'

conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
conn = psycopg2.connect(conn_str)
conn.autocommit = True

In [3]:
#creates new table to save images into along with size information
command = """CREATE TABLE raw_images(
            id int PRIMARY KEY,
            object_id int,
            x_pix int,
            y_pix int,
            path text);"""

In [4]:
#executes above command
cursor = conn.cursor()
cursor.execute(command)
cursor.close

DuplicateTable: relation "raw_images" already exists


In [5]:
#function to scrape image from internet, get its size information, save image to drive, and save all information into postgres
#only works for first images
#function wrapper to limit scrape speed. Met asks to keep requests to 80 per second
@RateLimiter(max_calls=80, period=1)
def init_image(i, url, object_id):
    r = requests.get(url)
    with io.BytesIO(r.content) as j: 
        image_pil = Image.open(j)
        x_pix, y_pix = image_pil.size
        path = f'##filepath##/img{object_id}_0.jpg'
        image_pil.save(path)
        command = f"""INSERT INTO raw_images(id,
                object_id,
                x_pix,
                y_pix,
                path)
                    VALUES ({i},{object_id}, {x_pix}, {y_pix}, $${path}$$)"""
        cursor = conn.cursor()
        cursor.execute(command)
        cursor.close
    

In [6]:
#bring in urls being stored in sql
query = """SELECT * FROM image_url;"""
df = pd.read_sql(query, con=conn)

In [8]:
#starting with index 0, goes through dataframe and uses init image function on primary images
#tot used to track progress in cases of errors
tot = 0
for i in tqdm(range(len(df.iloc[tot:,:]))):
    init_image(i, df['link'].iloc[i], df['id'].iloc[i])
    tot+=1

  0%|          | 0/232802 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '##filepath##/img34_0.jpg'

In [None]:
#secondary images were presented as lists, meaning additional preprocessing was necessary
#entries were not being identified as lists. literal_eval converted them into lists
df['other_links'] = df.other_links.apply(literal_eval)
#separate out necessary info into second df for ease of use
other_pics_df = df[['id', 'other_links']]
#explode lists of links so each link is its own row
other_pics_df = other_pics_df.explode('other_links')
#drop any duplicate links
other_pics_df= other_pics_df.dropna(subset=['other_links'])


In [None]:
#iterate through other_pics_df and create list of new image IDs based on object id and number of images
image_id_list = []
tot_2 =0
for element in other_pics_df.id.unique():
    for entry in range(len(other_pics_df[other_pics_df.id == element])):
        image_id_list.append(f'{element}_{entry+1}')
    

In [None]:
#add image ID list to df
other_pics_df['image_id'] =image_id_list

In [None]:
#save as csv for backup
other_pics_df.to_csv('secondary_pics_links.csv')

In [None]:
#create second function that operates much like the first with the exception that the name of the image is saved using the new image IDs
@RateLimiter(max_calls=80, period=1)
def init_secondary_images(i, entry, image_id, object_id):
    r = requests.get(entry)
    with io.BytesIO(r.content) as j: 
        image_pil = Image.open(j)
        x_pix, y_pix = image_pil.size
        path = f'##filepath##/img{image_id}.jpg'
        image_pil.save(path)
        command = f"""INSERT INTO raw_images(id,
                object_id,
                x_pix,
                y_pix,
                path)
                    VALUES ({i},{object_id}, {x_pix}, {y_pix}, $${path}$$)"""
        cursor = conn.cursor()
        cursor.execute(command)
        cursor.close
    
    

In [None]:
#counter for restarts
start = 0

In [None]:
#iterate through secondary image links, scrape, save files to hard drive, save info as sql
for i in tqdm(range(len(other_pics_df))[start:]):
    try:
        init_secondary_images(start, other_pics_df['other_links'].iloc[i], other_pics_df['image_id'].iloc[i], other_pics_df.id.iloc[i])
        start+=1
    except:
        print('pass')
        start+=1   