In [None]:
# !pip install flickrapi
# !pip install geopy
# !pip install python-datamuse

In [None]:
import flickrapi
from pprint import pprint
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time
import datamuse
import json
from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
import csv
import io
import os
import urllib

# Flickr API credentials. The FLICKR_API_KEY / FLICKR_API_SECRET environment variables
# take precedence so that real keys can be kept out of the notebook; the literals are
# only a backward-compatible fallback.
# NOTE(review): the fallback values are committed in plain text - rotate them and drop
# the fallback once the environment variables are configured.
api_key = os.environ.get('FLICKR_API_KEY', '1107afb83cb2dc63722f34bf5257f908')
secret = os.environ.get('FLICKR_API_SECRET', '589b61bab623b7ba')

# Shared client used by the functions below; they index its responses as
# dictionaries, hence the 'parsed-json' format.
flickr = flickrapi.FlickrAPI(api_key, secret, format = 'parsed-json')

In [None]:
def get_top3_tags(main_tag):
    '''
    Input: A main tag with a type of String

    Output: A list with one entry per Flickr tag cluster related to the main tag.
    Each entry is a list holding (up to) the first 3 tag names of that cluster.
    An empty list is returned when the response contains no clusters.
    '''
    cluster_list = flickr.tags.getClusters(tag = main_tag)["clusters"]["cluster"]

    # Clusters are only ever iterated, never indexed by position, so a tag with a
    # single cluster (or none) is handled like any other.
    return [
        [tag.get("_content") for tag in cluster["tag"][0:3]]
        for cluster in cluster_list
    ]



In [None]:
def get_user_ids(main_tag, top3_tags):
    '''
    Returns a list of user_ids for the top 24 photos returned based on the inputted tag and cluster tags related to it

    Input:
    1) The main tag (String)
    2) A list of tag lists, one per cluster (the output of get_top3_tags)

    Output: A list of dictionaries, one per photo found, with the keys
    'main_tag', 'cluser_tags' (the cluster tags joined with '-') and 'owner'
    (the user id of the photo's owner). The same owner can appear more than once.
    Clusters whose lookup fails are reported and skipped.
    '''
    user_ids = []
    for tags in top3_tags:
        try:
            # The cluster id expected by Flickr is the cluster's tags joined by '-'
            tags_to_string = '-'.join(tags)
            photo_list = flickr.tags.getClusterPhotos(tag=main_tag, cluster_id=tags_to_string)["photos"]["photo"]
        except Exception as error:
            # Skip this cluster entirely so that no photos from a previous cluster
            # get recorded again under this cluster's tags.
            print(f"Skipping cluster {tags} of tag '{main_tag}': {error}")
            continue

        for photo in photo_list:
            # NOTE: the key is spelled 'cluser_tags' (sic); other cells read it by that name.
            user_ids.append({
                'main_tag': main_tag,
                'cluser_tags': tags_to_string,
                'owner': photo['owner'],
            })
    print(f"Number of user_ids: {len(user_ids)}")
    return user_ids

In [None]:
def get_ids_from_groups(group_ids):
    '''
    Input: A list of group_ids
    Output: A list of user_ids found in the group. Only works for groups with public access

    Members are fetched 500 per page. A group whose lookup fails is reported and the
    loop moves on to the next group; members collected before the failure are kept.
    '''
    member_ids = []
#     Obtain user_ids for all members in a list of group_ids
    for group in group_ids:
        try:
            # The first page also tells us how many pages there are, so reuse its
            # members instead of requesting page 1 a second time.
            first_page = flickr.groups.members.getList(group_id = group, per_page = 500, page = 1)["members"]
            num_pages = int(first_page["pages"])
            member_ids.extend(member["nsid"] for member in first_page.get("member", []))

            # Flickr pages are numbered from 1, so the remaining pages are 2..num_pages inclusive
            for page in range(2, num_pages + 1):
                member_list = flickr.groups.members.getList(group_id = group, per_page = 500, page = page)["members"]["member"]
                member_ids.extend(member["nsid"] for member in member_list)
        except Exception as error:
            print(f"Error in getting members from Group {group}: {error}")
    print(f'User_ids from groups: {len(member_ids)}')
    return member_ids

In [None]:
def filter_users(user_ids):
    '''
    Returns a list of user_ids who have been on Flickr for more than 2 years and have posted more than 1000 photos

    Input: A list of dictionaries, each with an "owner" key holding a Flickr user id
    (the records produced by get_user_ids).

    Output: The dictionaries from the input that pass both checks, in input order.
    A user is skipped (with a printed message) when the profile lookup fails or when
    the date of the first photo is missing, has year 0000 or cannot be parsed.
    '''

    start_time = time.time()
    shortlisted_ids = []
    # The cut-off is the same for every user, so compute it once
    two_years_ago = datetime.now() - relativedelta(years=2)

    for user_id in user_ids:

#         Obtain the metadata related to a Flickr user's photographs
        try:
            photos_dictionary = flickr.people.getInfo(user_id=user_id["owner"])["person"]["photos"]
#             Retrieve number of pictures (int() also accepts a count delivered as text)
            num_pictures = int(photos_dictionary["count"]["_content"])
#             Retrieve date of first photo as a string
            date_str = photos_dictionary.get("firstdatetaken", {}).get("_content")
        except Exception as error:
            # Skip the user instead of judging them with the previous user's data
            print(f"Could not retrieve photo info for {user_id}: {error}")
            continue

        if date_str is None:
            print("Date field is empty")
            continue
        if date_str.split('-')[0] == '0000':
            print("Year is 0000")
            continue

#         Convert the date string to a date object
        try:
            first_photo_date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            print(f"Unrecognised date format: {date_str}")
            continue

    #     If more than 2 years AND > 1000 photos, add user_id
        if two_years_ago > first_photo_date and num_pictures > 1000:
            shortlisted_ids.append(user_id)
    print("--- %s seconds ---" % (time.time() - start_time))
    print(f"Shortlisted_ids: {len(shortlisted_ids)}")
    return shortlisted_ids
    

In [None]:
def globetrotters(all_tags, file_name):
    '''
    Input:
    1) A list of tags
    2) A string to name your .json file (without the extension; '.json' is appended)

    Output:
    1) A list of unique user ids with more than 1,000 photos and who have been on Flickr for > 2 years.
       Each entry is the dictionary produced by get_user_ids for the first tag the user was found under.
    2) A list with the user's information according to flickr's API (flickr.people.getInfo)
    Both lists are sorted by number of photos (descending) and are aligned:
    entry i of the first list belongs to entry i of the second list.
    Users whose information could not be retrieved appear in neither list.

    Side effect: writes the retrieved user information (in retrieval order) to '<file_name>.json'.
    '''

    finalised_user_ids = []
    seen_owners = set()

# 1st way of retrieving user ids: By selecting a few tags related to travel and automatically finding related tags and photos with these tags
# Obtain the user_ids from these pictures
    for tag in all_tags:

#         To allow the loop to continue running in the event any tag causes any error in one of the 3 functions.
#         Exception (rather than a bare except) keeps Ctrl-C / kernel interrupts working.
        try:
            top3_tags = get_top3_tags(tag)
            user_ids = get_user_ids(tag, top3_tags)
            shortlisted_ids = filter_users(user_ids)

    #         If user_id has NOT already been appended, then append to finalised_user_ids
            for potential_user_id in shortlisted_ids:
                owner = potential_user_id['owner']
                if owner not in seen_owners:
                    seen_owners.add(owner)
                    finalised_user_ids.append(potential_user_id)

            print(f'Total user_ids: {len(finalised_user_ids)}')

        except Exception as error:
            print(f'Error. Moving over to next loop. ({tag}: {error})')

#     Get all user_info from the respective user_ids, keeping each id paired with its info
    id_info_pairs = []
    for user_id in finalised_user_ids:
        try:
            id_info_pairs.append((user_id, flickr.people.getInfo(user_id=user_id["owner"])))
        except Exception as error:
            print(f'Could not retrieve user info for {user_id["owner"]}: {error}')

    finalised_user_info = [user_info for _, user_info in id_info_pairs]

#     Sort ids and info together by number of photos so that both lists stay aligned
    sorted_pairs = sorted(
        id_info_pairs,
        key = lambda pair: int(pair[1]["person"]["photos"]["count"]["_content"]),
        reverse = True,
    )
    finalised_user_ids_sorted = [user_id for user_id, _ in sorted_pairs]
    finalised_user_info_sorted = [user_info for _, user_info in sorted_pairs]

    with open(f'{file_name}.json', 'w') as json_file:
        json.dump(finalised_user_info, json_file)

    return finalised_user_ids_sorted, finalised_user_info_sorted
#     finalised_user_info = list(map(lambda x : flickr.people.getInfo(user_id=x), finalised_user_ids))                              


In [None]:
# Seed tags picked by hand; they are the starting point for finding related tags and photos.
# main_tags is the long list, main_tags_2 a shorter alternative.
main_tags = [
    'backpacker',
    'tourist',
    'vacation',
    'sightseeing',
    'scenery',
    'holiday resort',
    'excursion',
    'hiking',
    'cruise',
    'globetrotter',
    'adventurer',
    'mountaineer',
    'hotel',
    'amusement park',
    'ryokan',
    'festival',
    'carnival',
]
main_tags_2 = [
    'backpacker',
    'tourist',
    'adventurer',
    'globetrotter',
    'mountaineer',
    'traveller',
]

def get_related_tags(main_tags):
    '''
    Returns top 10 related words according to datamuse API

    Input: A list of tags (Strings)

    Output: A list of the related words of all tags without duplicates. The order is
    the order in which the words were first returned, so repeated runs that get the
    same API answers produce the same list.
    '''
    api = datamuse.Datamuse()
    all_tags = []
    for tag in main_tags:
        # ml = "means like"; at most 10 words are requested per tag
        for word in api.words(ml = tag, max=10):
            all_tags.append(word.get('word'))

#     Remove duplicates from list while keeping first-seen order
    unique_tags = list(dict.fromkeys(all_tags))
    print(f'Number of tags: {len(unique_tags)}')
    return unique_tags

In [None]:
# Expand the hand-picked tags in main_tags_2 into the list of related tags
all_tags = get_related_tags(main_tags = main_tags_2)

In [None]:
# Collect the user ids and the matching user info for every tag in all_tags.
# This takes a while (roughly 15-30 minutes).
finalised_user_ids, finalised_user_info = globetrotters(
    all_tags = all_tags,
    file_name = "user_info_tags_2",
)

In [None]:
# Number of user records returned by globetrotters()
len(finalised_user_ids)

In [None]:
def retrieve_relevant_headers(user_ids, user_info_list, file_name):
    '''
    Input:
    - A list of user_ids. Each user_id should be a dictionary returned from globetrotters()
    - A list of user_info. Each user_info should be a dictionary returned from globetrotters()
      Entry i of this list must describe the user in entry i of user_ids.
    - Name of file WITHOUT extension. Note: '.csv' is appended automatically!

    Output: Writes '<file_name>.csv' (UTF-8), a table with the following headers
    - main_tag
    - cluser_tags (sic; the cluster tags)
    - user_id
    - username
    - location
    - num_photos
    - date_of_first_photo
    If the two lists differ in length, only the entries present in both are written.
    '''
    # newline="" is what the csv module expects for files it writes to; the with-block
    # guarantees the file is flushed and closed even if a row fails.
    with open(f"{file_name}.csv", "w+", encoding = "utf-8", newline = "") as csv_file:
        writer = csv.writer(csv_file)

        # Write CSV Header, If you dont need that, remove this line
        writer.writerow(["main_tag", "cluser_tags", "user_id", "username", "location", "num_photos", "date_of_first_photo"])

        for user_id, user_info in zip(user_ids, user_info_list):
            person = user_info["person"]
        #     Try retrieving location as location may be None
            try:
                location = person["location"]["_content"]
            except (KeyError, TypeError):
                location = None
            writer.writerow([user_id["main_tag"],
                             user_id["cluser_tags"],
                             user_id["owner"],
                             person["username"]["_content"],
                             location,
                             person["photos"]["count"]["_content"],
                             person["photos"]["firstdatetaken"]["_content"]])

In [None]:
# Export the shortlisted users and their profile details to "Relevant User Info 2.csv"
retrieve_relevant_headers(
    user_ids = finalised_user_ids,
    user_info_list = finalised_user_info,
    file_name = "Relevant User Info 2",
)

In [None]:
# Can be ignored:
# Just to dump user_info_from_groups into a separate .json file because there are too many user_ids from groups
# unique_ids_from_group
# finalised_user_info_from_groups = []
# for user_id in shortlisted_ids_2:
#     try:
#         finalised_user_info_from_groups.append(flickr.people.getInfo(user_id=user_id))
#     except:
#         print(f'''
#         Error getting user info.
#         Current user_info length: {len(finalised_user_info_from_groups)}
#         ''')
#         pass

# with open(f'user_info_from_3_travel_groups.json', 'w') as json_file:
#     json.dump(finalised_user_info_from_groups, json_file)

In [None]:
# Obtain user_ids from .txt file
def retrieve_user_ids(path_as_string):
    '''
    Input: Path (String) to a text file with one user_id per line

    Output: A list of the user_ids with surrounding whitespace removed.
    Blank lines are ignored, as an empty string is not a usable user_id.
    '''
    # The with-block closes the file even when reading fails part-way
    with open(path_as_string, "r") as id_file:
        stripped_lines = (line.strip() for line in id_file)
        return [user_id for user_id in stripped_lines if user_id]

In [None]:
# list_of_user_ids = retrieve_user_ids(path_as_string)

In [None]:
def retrieve_location(photo, is_free = True, google_api_key = ''):
    '''
    Retrieves location of a single photo via Nominatim (OpenStreetMaps)
    Note: Choice of geocoder (A) Nominatim & OpenStreetMaps (B) Google Maps
    If (A), it is slower.
    If (B), requires Google API Key

    Input:
    - photo: any object with a .get() method that yields "latitude" and "longitude"
      (numbers or numeric strings)
    - is_free: True for Nominatim, False for Google Maps
    - google_api_key: only needed when is_free is False

    Output: The address as a String, or "NIL" when the photo is not geotagged
    (coordinates missing, not numeric, or both 0) or when the geocoder finds nothing.

    Raises: Exception when is_free is False and no Google API Key is given.
    '''
    lat = photo.get("latitude")
    lon = photo.get("longitude")

#     If image is NOT geotagged, return NIL.
#     The values may arrive as text ("0"), so compare them as numbers; missing or
#     non-numeric values are treated as "not geotagged" as well.
    try:
        is_geotagged = float(lat) != 0 or float(lon) != 0
    except (TypeError, ValueError):
        is_geotagged = False
    if not is_geotagged:
        return "NIL"

    coordinates = f"{lat}, {lon}"

#     Check if using free geocoder or not
    if is_free:
        locator = Nominatim(user_agent="myGeocoder", timeout = 10)
    elif google_api_key != '':
        locator = GoogleV3(api_key=google_api_key)
    else:
        raise Exception("Please fill in Google API Key")
    location = locator.reverse(coordinates, exactly_one = True)

    if location is not None:
        return str(location)
    print("Location field is empty")
    return "NIL"

In [None]:
def retrieve_images_metadata(user_ids, has_geo = True, is_free = True, google_api_key = ''):
    '''
    Input: A list of user_ids
    Output: A list with one dictionary per user:
        {"user_id": <user_id>, "images_metadata": [{"url_1024": ..., "location": ...}, ...]}

    Note:
    1) Choice of geocoder (A) Nominatim & OpenStreetMaps (B) Google Maps
        If (A), it is slower.
        If (B), requires Google API Key
    2) If has_geo = True, returns all photos which have been geo-tagged. Some location fields may still be blank.
    If has_geo = False, returns all photos regardless of whether it has been geo-tagged.
    '''
    start_time = time.time()

#     To use flickr.walk output needs to be in 'etree', not 'parsed-json'.
#     A separate local client is used for that, so the shared 'parsed-json' client is
#     never touched and nothing has to be restored afterwards.
    flickr_etree = flickrapi.FlickrAPI(api_key, secret, format = 'etree')

#     Search options shared by every user; the geo filter is only added on request
    walk_options = {"extras": "geo, url_c, url_l"}
    if has_geo:
        walk_options["has_geo"] = "1"

#     A list of all user's dictionary
    metadata_list = []
    for user_id in user_ids:
#         The metadata of all images of 1 user
        images_metadata = []
        for photo in flickr_etree.walk(user_id = user_id, **walk_options):
            images_metadata.append({
                "url_1024": photo.get("url_l"),
                "location": retrieve_location(photo, is_free, google_api_key),
            })
        metadata_list.append({"user_id": user_id, "images_metadata": images_metadata})
        print(f"Total number of photos for {user_id}: {len(images_metadata)}")

    time_taken = round(time.time() - start_time, 2)
    print(f'''
    Total number of users: {len(metadata_list)}
    Time Taken: {time_taken}s
    ''')

    return metadata_list

def download_images(folder_name, metadata_list):
    '''
    Input:
    - folder_name: name of the folder to create inside the 'images' folder
    - metadata_list: either the output of retrieve_images_metadata (one dictionary
      per user with an "images_metadata" list) or a flat list of photo dictionaries
      that each have a "url_1024" key

    Output: Downloads images as .jpg into a folder labelled 'folder_name' which will be found within an 'images' folder
    in the current working directory. Files are named '<index>.jpg', where index is the
    position of the photo in the (flattened) list; photos without a url are skipped.
    A failed download is reported and the remaining photos are still attempted.
    '''
    # `import urllib` alone does not guarantee that the request submodule is loaded
    import urllib.request

    # Flatten per-user dictionaries into one list of photo dictionaries
    photos = []
    for entry in metadata_list:
        if "images_metadata" in entry:
            photos.extend(entry["images_metadata"])
        else:
            photos.append(entry)
    if not photos:
        return

    # makedirs also creates the parent 'images' folder when it does not exist yet
    directory = os.path.join(f"{os.getcwd()}", "images", f"{folder_name}")
    os.makedirs(directory, exist_ok = True)

    for i, photo in enumerate(photos):
        url = photo.get("url_1024")
        if url is None:
            continue
        try:
            # os.path.join picks the right path separator for the current platform
            urllib.request.urlretrieve(url, os.path.join(directory, f"{i}.jpg"))
        except Exception as e:
            print(f"Could not retrieve image, {str(e)}")


In [None]:
# NOT IN USE: Replaced by flickr.walk
# Retrieve all photos with geo tag from page 1
# Note: For geo queries, only 250 photos can be returned at a time
# first_page = flickr.photos.search(user_id = "11912022@N03", has_geo = "1", extras = "geo, url_c, url_l")

# Retrieve number of pages
# num_pages_of_photos = first_page["photos"]["pages"]
# photos_per_page = first_page["photos"]["photo"]

# For each page of photos

# NOT IN USE: Merged with retrieve_images_metadata
# def retrieve_list_of_locations(list_of_user_ids, is_free = True, google_api_key = ''):
#     flickr = flickrapi.FlickrAPI(api_key, secret, format = 'etree')
#     locations = []
#     for user_id in list_of_user_ids:
# #         first_page = flickr.photos.search(user_id = user_id, has_geo = "1", extras = "geo, url_c, url_l")
# #         num_pages_of_photos = first_page["photos"]["pages"]
# #         photos_per_page = first_page["photos"]["photo"]
        
#         for photo in flickr.walk(user_id = "11912022@N03", has_geo = "1", extras = "geo, url_c, url_l"):
#             locations.append(retrieve_location(photo, is_free, google_api_key))
            
#     print(f"Total number of locations: {len(locations)}")
#     flickr = flickrapi.FlickrAPI(api_key, secret, format = 'parsed-json')
#     return locations
        

In [None]:
# flickr.favorites.getList(user_id = "185575499@N08")
# group_ids = ['1249780@N23', '2620781@N24', '853792@N24']

# NOT IN USE: For testing purposes
# download_images('Paris', photo_data)
# flickr.photos.search(user_id = "11912022@N03", has_geo = "1", extras = "geo, url_c, url_l")
# user_id = first_page["photos"]["photo"][0]["owner"]
# def flickrwalk():
#     photos = []
#     flickr = flickrapi.FlickrAPI(api_key, secret, format = 'etree')
#     for photo in flickr.walk(user_id = "89749977@N00", has_geo = "1", extras = "geo, url_c, url_l"):
#         photos.append(photo.get("url_l"))
#     print(len(photos))
#     flickr = flickrapi.FlickrAPI(api_key, secret, format = 'etree')
#     return photos

# trial = flickr.photos.search(user_id = "36891690@N06", has_geo = "1", extras = "geo, url_c, url_l")

In [None]:
# NOT IN USE: Sample code to download images
# import flickrapi
# import urllib
# from PIL import Image

# # Flickr api access key 
# flickr=flickrapi.FlickrAPI('c6a2c45591d4973ff525042472446ca2', '202ffe6f387ce29b', cache=True)


# keyword = 'siberian husky'

# photos = flickr.walk(text=keyword,
#                      tag_mode='all',
#                      tags=keyword,
#                      extras='url_c',
#                      per_page=100,           # may be you can try different numbers..
#                      sort='relevance')

# urls = []
# for i, photo in enumerate(photos):
#     print (i)
    
#     url = photo.get('url_c')
#     urls.append(url)
    
#     # get 50 urls
#     if i > 50:
#         break

# print (urls)

# # Download image from the url and save it to '00001.jpg'
# urllib.urlretrieve(urls[1], '00001.jpg')

# # Resize the image and overwrite it
# image = Image.open('00001.jpg') 
# image = image.resize((256, 256), Image.ANTIALIAS)
# image.save('00001.jpg')