In [14]:
import boto3
import io
import pandas as pd
# Show up to 100 characters of each column value before pandas truncates it.
pd.options.display.max_colwidth = 100

In [19]:
def read_parquet_from_s3(filepath, bucket='s-laion', s3=None):
    """Download one parquet object from S3 into memory and load it as a DataFrame.

    Args:
        filepath: Key of the parquet object inside ``bucket``.
        bucket: Name of the S3 bucket that holds the object.
        s3: Optional boto3 S3 service resource. When ``None``, a new one is
            created with ``boto3.resource('s3')`` at call time.

    Returns:
        The ``pandas.DataFrame`` returned by ``pd.read_parquet``.
    """
    # Create the resource lazily: a ``boto3.resource('s3')`` default in the
    # signature is evaluated once, when the defining cell runs, not per call.
    if s3 is None:
        s3 = boto3.resource('s3')

    buffer = io.BytesIO()
    s3.Object(bucket, filepath).download_fileobj(buffer)
    # Writing into the buffer leaves its cursor at the end of the data;
    # rewind so the reader starts from the first byte.
    buffer.seek(0)

    return pd.read_parquet(buffer)

def get_filepaths(bucket='s-laion', prefix="vit-h-14-embeddings/metadata", s3=None):
    """List the keys of all objects in an S3 bucket whose key starts with ``prefix``.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix passed to the bucket's ``objects.filter``.
        s3: Optional boto3 S3 service resource. When ``None``, a new one is
            created with ``boto3.resource('s3')`` at call time.

    Returns:
        A list with the ``key`` of every object yielded by the filter, in the
        order the filter yields them.
    """
    # Create the resource lazily: a ``boto3.resource('s3')`` default in the
    # signature is evaluated once, when the defining cell runs, not per call.
    if s3 is None:
        s3 = boto3.resource('s3')

    # Keep the bucket *name* and the Bucket *object* under separate names.
    s3_bucket = s3.Bucket(bucket)

    return [obj.key for obj in s3_bucket.objects.filter(Prefix=prefix)]

In [20]:
from tqdm import tqdm

def get_max_size_df_from_s3(bucket, prefix="vit-h-14-embeddings/metadata", s3=None):
    """Return the largest row count among the parquet files under ``prefix``.

    Every key listed by ``get_filepaths`` is downloaded and loaded with
    ``read_parquet_from_s3``; only the number of rows of each frame is kept.

    Args:
        bucket: Name of the S3 bucket to scan.
        prefix: Key prefix selecting which objects to read.
        s3: Optional boto3 S3 service resource. When ``None``, a new one is
            created with ``boto3.resource('s3')`` at call time.

    Returns:
        The maximum ``df.shape[0]`` over all files, or ``0`` when the listing
        is empty.
    """
    # Create the resource lazily: a ``boto3.resource('s3')`` default in the
    # signature is evaluated once, when the defining cell runs, not per call.
    if s3 is None:
        s3 = boto3.resource('s3')

    # Pass by keyword: get_filepaths takes (bucket, prefix, s3), so handing it
    # (s3, bucket, prefix) positionally binds every argument to the wrong name.
    filepaths = get_filepaths(bucket=bucket, prefix=prefix, s3=s3)

    max_size = 0
    for filepath in tqdm(filepaths):
        # Read from the same bucket and resource that produced the listing,
        # not from read_parquet_from_s3's own defaults.
        df = read_parquet_from_s3(filepath, bucket=bucket, s3=s3)
        max_size = max(max_size, df.shape[0])

    return max_size

In [18]:
# Largest row count over the metadata parquet files of the 's-laion' bucket.
max_size = get_max_size_df_from_s3(bucket='s-laion')

100%|██████████| 10/10 [00:32<00:00,  3.25s/it]


In [34]:
# Pick one metadata parquet file by its position in the listing and load it.
index = 10
filepaths = get_filepaths()
df = read_parquet_from_s3(filepaths[index])

In [35]:
def _caption_to_text_list(caption):
    """Return the two-element text list built around one caption value."""
    return ["This", f"has the caption {caption}"]


df['text_list'] = df['caption'].apply(_caption_to_text_list)

In [36]:
def _row_to_image_info(row):
    """Wrap the image fields of one row into a single-element list of dicts."""
    return [
        {
            "image_name": row['key'],
            "raw_url": row['url'],
            # NOTE(review): presumably position 1 of `text_list` (the caption
            # sentence) — confirm against the consumer of this field.
            "matched_text_index": 1,
            "NSFW": row['NSFW'],
            "similarity_matrix": [row['similarity']],
            "width": row['width'],
            "height": row['height'],
        }
    ]


df['image_info'] = df.apply(_row_to_image_info, axis=1)

In [37]:
# Negated id per row: file position * max_size + row position within this file.
row_positions = pd.RangeIndex(len(df))
df["image_index"] = -(index * max_size + row_positions)

In [38]:
# Preview only the three derived columns.
preview_columns = ['text_list', 'image_info', 'image_index']
df[preview_columns].head()

Unnamed: 0,text_list,image_info,image_index
0,"[This, has the caption police captain resume example http www resumecareer info]","[{'image_name': '2082700000', 'raw_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcSQmh7MwpaApmW...",-9389070
1,"[This, has the caption Crock Pot Dressing For Thanksgiving Crockpot Corn Bread Stuffing Recipe A...","[{'image_name': '2082700021', 'raw_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcRtsWXOOj7CDNz...",-9389071
2,"[This, has the caption Ship ashore - Dilapidated old ship run aground.]","[{'image_name': '2082700045', 'raw_url': 'https://cdn.xxl.thumbs.canstockphoto.com/ship-ashore-d...",-9389072
3,"[This, has the caption Portrait Of Young Man Cleaning The Floor With Mop In Office Stok Fotoğraf]","[{'image_name': '2082700022', 'raw_url': 'https://us.123rf.com/450wm/andreypopov/andreypopov1311...",-9389073
4,"[This, has the caption Linon 62 Inch Linen Tufted Bench]","[{'image_name': '2082700047', 'raw_url': 'https://ak1.ostkcdn.com/images/products/9303659/P16465...",-9389074
