In [2]:
import pyarrow.feather as feather
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import numpy as np

In [62]:
df = feather.read_feather('data/artsy_full.feather')

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27577 entries, 0 to 27576
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    27577 non-null  object 
 1   slug                  27577 non-null  object 
 2   title                 27576 non-null  object 
 3   category              27576 non-null  object 
 4   medium                27575 non-null  object 
 5   date                  27577 non-null  object 
 6   iconicity             27577 non-null  float64
 7   sold                  27577 non-null  bool   
 8   image_versions        27469 non-null  object 
 9   dimensions_height_in  14120 non-null  float64
 10  dimensions_width_in   13607 non-null  float64
 11  dimensions_depth_in   836 non-null    float64
 12  permalink             27577 non-null  object 
 13  api_link              27577 non-null  object 
 14  thumb_link            26482 non-null  object 
 15  image_link         

In [65]:
df.dropna(subset=['image_versions'], inplace=True)

In [66]:
def convert_to_list(value):
    value = value.tolist()
    
    return value

df['image_versions'] = df['image_versions'].apply(convert_to_list)

In [67]:
all_versions = dict.fromkeys(('large',
                             'large_rectangle',
                             'larger',
                             'medium',
                             'medium_rectangle',
                             'normalized',
                             'small',
                             'square',
                             'tall'),0)
def get_versions(value):
    for key in value:
        all_versions[key] += 1
    
df['image_versions'].apply(get_versions)

all_versions

{'large': 22671,
 'large_rectangle': 25813,
 'larger': 24826,
 'medium': 26482,
 'medium_rectangle': 23872,
 'normalized': 26878,
 'small': 23501,
 'square': 23563,
 'tall': 23087}

In [68]:
not_large = df[df['image_versions'].apply(lambda x: 'large' not in x[0])].image_versions

first_version_not_large = dict().fromkeys(all_versions, 0)

for version_list in not_large:
    if ('large' and 'medium') not in version_list:
        first_version_not_large[version_list[0]] += 1 
    
print(sum(first_version_not_large.values()))
first_version_not_large



70


{'large': 0,
 'large_rectangle': 0,
 'larger': 0,
 'medium': 0,
 'medium_rectangle': 30,
 'normalized': 36,
 'small': 2,
 'square': 2,
 'tall': 0}

In [77]:
def get_largest_version(row):
    
    image_versions = row['image_versions']
    
    if 'large' in image_versions:
        version = 'large'
    elif 'medium' in image_versions:
        version = 'medium'
    else:
        version = image_versions[0]
    
    row['main_image_version'] = version
    
    return row
    

df = df.apply(get_largest_version, axis=1)

In [78]:
df.main_image_version.value_counts()

large               22671
medium               4463
large_rectangle       201
larger                 64
normalized             36
medium_rectangle       30
square                  2
small                   2
Name: main_image_version, dtype: int64

In [81]:
feather.write_feather(df,'data/artsy_full_main_image_version.feather')