In [1]:
from glob import glob
from os.path import join
import pandas as pd

from modules.metadata import PhotoMetadata
from modules.utilities import str_to_datetime
from templating.post import Post

# --- Data directories (relative to this notebook) ---
location_data_dirpath = '../../data/locations/'
photo_metadata_dirpath = '../../data/photo_metadata'

# --- Location history ---
pings_data_path = join(location_data_dirpath, 'pings.hdf')

# --- HDF stores that live in the photo metadata directory ---
photo_metadata_path = join(photo_metadata_dirpath, 'photo_metadata.hdf')
photo_gps_path = join(photo_metadata_dirpath, 'photo_gps.hdf')
photo_semantic_locations_path = join(
    photo_metadata_dirpath, 'photo_semantic_locations.hdf')
imgur_data_path = join(photo_metadata_dirpath, 'imgur_data.hdf')

# --- Spreadsheets in the photo metadata directory (the 'user_*' files) ---
user_captions_path = join(photo_metadata_dirpath, 'user_captions.xlsx')
user_locations_path = join(photo_metadata_dirpath, 'user_gps.xlsx')
user_exclusions_path = join(photo_metadata_dirpath, 'user_hidden_photos.xlsx')

# IPython magics: enable the autoreload extension in mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Load photo metadata

In [2]:
# Photo metadata
photo_metadata = PhotoMetadata(photo_metadata_path)

# HDF tables, each read from the 'data' key:
# imgur URLs, photo GPS data, and matched semantic locations.
imgur_data = pd.read_hdf(imgur_data_path, key='data')
photo_gps = pd.read_hdf(photo_gps_path, key='data')
photo_semantic_locations = pd.read_hdf(photo_semantic_locations_path, key='data')

# User-curated spreadsheets, all re-indexed on PhotoMetadata.INDEX.
user_captions, user_locations, user_exclusions = (
    pd.read_excel(workbook_path).set_index(PhotoMetadata.INDEX)
    for workbook_path in (
        user_captions_path,
        user_locations_path,
        user_exclusions_path,
    )
)

# Generate post data

In [3]:
# Bring in the ping-derived coordinates and prefer the photo's own GPS tag;
# the ping value is only used where the native one is missing.
posts = photo_metadata.data.join(
    photo_gps,
    on='timestamp',
    lsuffix='_native',
    rsuffix='_ping',
)
posts['latitude'] = posts['latitude_native'].fillna(posts['latitude_ping'])
posts['longitude'] = posts['longitude_native'].fillna(posts['longitude_ping'])

# Drop coordinates for photos without a GPS tag whose ping is deemed unreliable.
# NOTE(review): 3600/8 evaluates to 450 -- 7.5 minutes if dt_ping is in seconds
# (presumably it is) -- whereas the previous comment said "15 minutes".
# Confirm which threshold is intended.
max_dt_ping = 3600 / 8
no_gps_data = (posts['dt_ping'] >= max_dt_ping) & ~posts['geotagged']
posts.loc[no_gps_data, ['latitude', 'longitude']] = float('nan')

# Attach the manually-curated captions, then the nearest semantic location.
# Columns from the semantic-location table that clash with existing ones get
# an '_auto' suffix; nothing is filled in here -- 'caption_auto' is only
# offered later as a suggestion when captions are curated.
posts = (
    posts
    .join(user_captions)
    .join(photo_semantic_locations, how='left', rsuffix='_auto')
)

  raw_cell, store_history, silent, shell_futures)


# Curate captions

In [4]:
from copy import deepcopy
from PIL import Image
from IPython.display import display, clear_output
import clipboard

def parse(s):
    """Resolve the text typed by the user into a caption.

    A blank answer (empty or whitespace only) means "use whatever is on the
    clipboard"; any other answer is returned exactly as typed.
    """
    return s if s.strip() else clipboard.paste()


def prompt(record, prefix, max_attempts=3):
    """Ask the user for a caption, offering the automatic one as the default.

    The suggested caption is shown in the input prompt and copied to the
    clipboard, so that a blank answer (which ``parse`` resolves from the
    clipboard) accepts the suggestion.

    Parameters
    ----------
    record : object with a ``caption_auto`` attribute
        The photo being captioned. A non-string ``caption_auto`` (presumably
        NaN when no semantic location was matched) is replaced by
        ``'PLACEHOLDER_CAPTION'``.
    prefix : str
        Text shown in front of the suggestion, e.g. a progress counter.
    max_attempts : int, optional
        How many times to try before giving up and re-raising the error.

    Returns
    -------
    str
        The caption chosen by the user.

    Raises
    ------
    Exception
        Whatever the clipboard or ``input`` raised on the final attempt.
        ``KeyboardInterrupt`` is never swallowed, so interrupting the kernel
        aborts the prompt immediately.
    """
    suggested_caption = record.caption_auto
    if not isinstance(suggested_caption, str):
        suggested_caption = 'PLACEHOLDER_CAPTION'

    attempts = max(1, int(max_attempts))
    for attempt in range(1, attempts + 1):
        try:
            # Refresh the clipboard on every attempt so a blank answer always
            # resolves to the suggestion shown in the prompt.
            clipboard.copy(suggested_caption)
            return parse(input(prefix + suggested_caption))
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt/SystemExit
            # derive from BaseException and propagate to the caller.
            if attempt == attempts:
                raise
            print('Try again.')
    
    
def GUI(record, prefix=''):
    """Show a thumbnail of the photo and ask the user to caption it.

    Parameters
    ----------
    record : object with ``path`` and ``caption_auto`` attributes
        ``path`` is opened as an image; the record is then handed to
        ``prompt`` to collect the caption.
    prefix : str, optional
        Text shown in front of the suggested caption in the prompt.

    Returns
    -------
    str
        The caption returned by ``prompt``.
    """
    im = Image.open(record.path)
    try:
        # thumbnail() shrinks the image in place to fit within 300x300.
        im.thumbnail((300, 300))
        display(im)
        user_caption = prompt(record, prefix)
    finally:
        # Release the image even when rendering or the prompt fails.
        im.close()
    # Clear the thumbnail and prompt so the next photo starts on a clean cell.
    clear_output()
    return user_caption

In [5]:
# Independent working copy of the caption-related columns, so that curating
# captions never writes back into `posts`.
CAPTION_DATA = posts[['path', 'caption', 'caption_auto']].copy(deep=True)

In [14]:
# Upper bound on photos captioned per run of this cell.
# NOTE(review): the previous loop stopped once its counter exceeded 50, i.e.
# after 51 photos; that count is preserved here -- confirm 50 was not intended.
MAX_PHOTOS_PER_RUN = 51

MISSING_CAPTIONS = CAPTION_DATA[CAPTION_DATA.caption.isna()]
n_missing = len(MISSING_CAPTIONS)

# Collect one record per captioned photo and build the frame once at the end.
USER_RECORDS = []
batch = MISSING_CAPTIONS.head(MAX_PHOTOS_PER_RUN)
for position, (idx, record) in enumerate(batch.iterrows(), start=1):
    # Progress counter is relative to all missing captions, not just this batch.
    prefix = '{:d}/{:d} '.format(position, n_missing)
    USER_CAPTION = GUI(record, prefix)

    # The index is unpacked as (album, filename, source).
    album, filename, source = idx
    USER_RECORDS.append(dict(album=album,
                             filename=filename,
                             source=source,
                             caption=USER_CAPTION))

# Passing the columns explicitly keeps set_index from raising KeyError when no
# captions were missing and USER_RECORDS is therefore empty.
NEW_USER_CAPTIONS = pd.DataFrame(
    USER_RECORDS,
    columns=['album', 'filename', 'source', 'caption'],
).set_index(['album', 'filename', 'source'])

# fillna aligns on the index, so only photos captioned above are updated.
if not NEW_USER_CAPTIONS.empty:
    CAPTION_DATA['caption'] = CAPTION_DATA.caption.fillna(NEW_USER_CAPTIONS.caption)

In [None]:
# save captions
#pd.DataFrame(CAPTION_DATA.caption.to_frame().to_records()).to_excel(user_captions_path, index=False)