# Import the data

In [1]:
import timeit
from os import listdir, makedirs
from urllib import request

import pandas as pd
from tqdm import tqdm

## Aggregate metadata on species

The metadata is available on eBird as one csv file per species. We restrict ourselves here to observations that were made in Switzerland and are associated with pictures. We selected the species that have the highest number of pictures.

In [2]:
#load the metadata for each species: one csv export per species, listed in the
#same order as the dataframe names they are unpacked into below
metadata_files = [
    "data/metadata/ML_2020-02-01T13-00_Aquila_chrysaetos_Photo_CH.csv",
    "data/metadata/ML_2020-02-01T17-27_Milvus_milvus_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-28_Podiceps_cristatus_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-31_Fulica_atra_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-32_Chroicocephalus_ridibundus_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-33_Netta_rufina_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-34_Cygnus_olor_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-36_Pyrrhocorax_graculus_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-37_Larus_michahellis_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-38_Mergus_merganser_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-39_Aythya_fuligula_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-40_Buteo_buteo_Photo_CH.csv",
    "data/metadata/ML_2020-02-02T10-42_Ardea_cinerea_Photo_CH.csv",
    "data/metadata/ML_2020-02-04T07-21_Turdus_merula_Photo_CH.csv",
]

#read the files in order and bind each dataframe to its two-letter species name
ac, mm, pc, fa, cr, nr, co, pg, lm, me, af, bb, ar, tm = [
    pd.read_csv(csv_path) for csv_path in metadata_files
]

In [3]:
#stack the per-species dataframes on top of each other (row labels of the originals are kept)
species_frames = [ac, mm, pc, fa, cr, nr, co, pg, lm, me, af, bb, ar, tm]
data = pd.concat(species_frames, axis=0)

We must keep in mind that once we have concatenated 14 dataframes, we now have 14 observations whose index $\color{green}{\text{name}}$ is 0. It means that if we run ```data.loc[0,:]```, we get the first observation of each original dataframe. However, if we run ```data.iloc[0,:]```, we get the first observation of the new concatenated dataframe. Since this feature may be useful, we refrain from reindexing.

In [4]:
#list the column names of the aggregated metadata to get an overview of the available variables
data.columns

Index(['ML Catalog Number', 'Format', 'Scientific Name', 'Common Name',
       'Background Species', 'Recordist', 'Date', 'Year', 'Month', 'Day',
       'Time', 'Country', 'Country-State-County', 'State', 'County',
       'Locality', 'Latitude', 'Longitude', 'Elevation (m)', 'Age/Sex',
       'Behaviors', 'Playback', 'Collected', 'Specimen ID',
       'Home Archive Catalog Number', 'Recorder', 'Microphone', 'Accessory',
       'Partner Institution', 'eBird Checklist ID', 'Unconfirmed?',
       'Air Temp(ºC)', 'Water Temp(ºC)', 'Comments', 'Observation Details',
       'Class', 'Order', 'Family', 'Parent Species', 'eBird Species Code',
       'Taxon Category', 'Taxonomic Sort', 'Recordist 2 Display Name',
       'Recordist 2 Profile URL', 'Average Community Rating',
       'Number of Ratings', 'Asset State', 'Asset Tags',
       'Contributor Profile URL', 'Original Image Height',
       'Original Image Width', 'Specimen Page URL', 'eBird Checklist URL',
       'ebird Species URL', 'isIn

In [5]:
#check that each species has a unique denomination under the variable 'Scientific Name'
#(any subspecies denomination shows up here as an additional name)
data["Scientific Name"].unique()

array(['Aquila chrysaetos', 'Milvus milvus', 'Milvus milvus milvus',
       'Podiceps cristatus', 'Fulica atra', 'Chroicocephalus ridibundus',
       'Netta rufina', 'Cygnus olor', 'Pyrrhocorax graculus',
       'Larus michahellis', 'Larus michahellis michahellis',
       'Mergus merganser', 'Mergus merganser merganser/orientalis',
       'Aythya fuligula', 'Buteo buteo', 'Buteo buteo buteo',
       'Ardea cinerea', 'Ardea cinerea cinerea/jouyi', 'Turdus merula'],
      dtype=object)

In [6]:
#unify scientific names: map each subspecies denomination to the name of its species
subspecies_to_species = {
    'Milvus milvus milvus': 'Milvus milvus',
    'Larus michahellis michahellis': 'Larus michahellis',
    'Mergus merganser merganser/orientalis': 'Mergus merganser',
    'Buteo buteo buteo': 'Buteo buteo',
    'Ardea cinerea cinerea/jouyi': 'Ardea cinerea',
}
for subspecies, species in subspecies_to_species.items():
    #regex=False: the names are literal text, and the default value of `regex`
    #is not the same in every pandas version, so we state it explicitly
    data["Scientific Name"] = data["Scientific Name"].str.replace(subspecies, species, regex=False)

#display the remaining names to confirm that only one denomination per species is left
data["Scientific Name"].unique()

array(['Aquila chrysaetos', 'Milvus milvus', 'Podiceps cristatus',
       'Fulica atra', 'Chroicocephalus ridibundus', 'Netta rufina',
       'Cygnus olor', 'Pyrrhocorax graculus', 'Larus michahellis',
       'Mergus merganser', 'Aythya fuligula', 'Buteo buteo',
       'Ardea cinerea', 'Turdus merula'], dtype=object)

In [7]:
#create an additional variable with the name of the folder in which the images are going to be stored
#folder name = scientific name in lower case, with spaces turned into underscores
species_slug = data["Scientific Name"].str.lower().str.replace(" ", "_")
data["storage"] = "data/images/" + species_slug + "/original/"

#display the resulting folders (one per species)
data["storage"].unique()

array(['data/images/aquila_chrysaetos/original/',
       'data/images/milvus_milvus/original/',
       'data/images/podiceps_cristatus/original/',
       'data/images/fulica_atra/original/',
       'data/images/chroicocephalus_ridibundus/original/',
       'data/images/netta_rufina/original/',
       'data/images/cygnus_olor/original/',
       'data/images/pyrrhocorax_graculus/original/',
       'data/images/larus_michahellis/original/',
       'data/images/mergus_merganser/original/',
       'data/images/aythya_fuligula/original/',
       'data/images/buteo_buteo/original/',
       'data/images/ardea_cinerea/original/',
       'data/images/turdus_merula/original/'], dtype=object)

In [8]:
#extract the Macaulay Library id of a particular observation (first row, first column) as a string
ml = str(data.iloc[0, 0])
ml

'198885451'

## Find the path to the images

We must be able to use the Macaulay Library Catalog number to retrieve the images associated with the metadata. If we search for the ML number '198885451' on the Macaulay Library website, we reach the following page : https://macaulaylibrary.org/asset/198885451

However, this page contains several images, and the image of interest is not necessarily named "198885451.jpg". It would be interesting to directly reach the page containing only the image labelled ML198885451. Inspecting the page, we can find the link we are searching for :

![inspecting the page](data/other/inspecting_Macaulay_Library.png)

This link is composed of a fixed path followed by the ML catalog number. We can use it to download the images.

In [9]:
#set the fixed part of the link; the ML catalog number is appended to it to reach a single image
link = "https://download.ams.birds.cornell.edu/api/v1/asset/"

In [10]:
#get the path to the appropriate folder for a particular observation
#("storage" is the most recently added variable, hence the last column)
data.iloc[0,-1]

'data/images/aquila_chrysaetos/original/'

In [11]:
#download the image with a particular ML catalog number and store it in the folder dedicated to its species
image_url = link + ml
image_file = data.iloc[0,-1] + ml + ".jpg"
request.urlretrieve(image_url, image_file)

('data/images/aquila_chrysaetos/original/198885451.jpg',
 <http.client.HTTPMessage at 0x108bc4400>)

## Download all images

Run only once !

### Very important : some links may have been deleted in the database

In [12]:
#set the fixed part of the link
link = "https://download.ams.birds.cornell.edu/api/v1/asset/"

#create list to collect the ML Catalog Numbers whose download failed
broken_links = []
old = len(data)

#make sure that every species folder exists: otherwise each download would fail
#and every observation would wrongly be counted as a broken link and dropped
for folder in data["storage"].unique():
    makedirs(folder, exist_ok=True)

#download each image and store it in the appropriate folder
start = timeit.default_timer()
#columns are addressed by name rather than by position, so the loop does not depend on the column order
for ml_number, path in tqdm(zip(data["ML Catalog Number"], data["storage"]), total=len(data)):
    ml = str(ml_number) #the ML Catalog Number is used both in the link and in the file name
    try:
        request.urlretrieve(link+ml, path+ml+".jpg") #download the image and store it in the folder
    except Exception:
        #best effort: any failed download is recorded and the loop goes on;
        #unlike a bare except, this still lets KeyboardInterrupt stop the loop
        broken_links.append(ml_number)

stop = timeit.default_timer()

#drop observations with broken links
data = data[~data["ML Catalog Number"].isin(broken_links)]

print('Time: ', stop - start) 
print("Number of broken links : {}".format(len(broken_links)))
print("Original number of observations : {}".format(old))
print("Updated number of observations : {}".format(len(data)))

100%|██████████| 3347/3347 [39:53<00:00,  1.40it/s]

Time:  2393.4517323109994
Number of broken links : 9
Original number of observations : 3347
Updated number of observations : 3338





## Check exhaustivity

We would like to check whether all images have been imported correctly.

In [12]:
#get the number of observations present in the dataframe for each folder
#(count() only counts the rows whose ML Catalog Number is not missing)
data.groupby(by="storage")["ML Catalog Number"].count()

storage
data/images/aquila_chrysaetos/original/              95
data/images/ardea_cinerea/original/                 201
data/images/aythya_fuligula/original/               219
data/images/buteo_buteo/original/                   214
data/images/chroicocephalus_ridibundus/original/    318
data/images/cygnus_olor/original/                   288
data/images/fulica_atra/original/                   322
data/images/larus_michahellis/original/             241
data/images/mergus_merganser/original/              225
data/images/milvus_milvus/original/                 211
data/images/netta_rufina/original/                  303
data/images/podiceps_cristatus/original/            340
data/images/pyrrhocorax_graculus/original/          255
data/images/turdus_merula/original/                 115
Name: ML Catalog Number, dtype: int64

In [13]:
#get the number of entries actually present on disk in each folder
for folder in data["storage"].unique(): #for each unique folder path
    n_entries = len(listdir(folder)) #every directory entry is counted, whatever its type
    print(folder, n_entries) #print the folder path name next to its number of entries

data/images/aquila_chrysaetos/original/ 96
data/images/milvus_milvus/original/ 212
data/images/podiceps_cristatus/original/ 340
data/images/fulica_atra/original/ 323
data/images/chroicocephalus_ridibundus/original/ 318
data/images/netta_rufina/original/ 303
data/images/cygnus_olor/original/ 288
data/images/pyrrhocorax_graculus/original/ 256
data/images/larus_michahellis/original/ 241
data/images/mergus_merganser/original/ 225
data/images/aythya_fuligula/original/ 219
data/images/buteo_buteo/original/ 214
data/images/ardea_cinerea/original/ 201
data/images/turdus_merula/original/ 115


The number of entries returned by ```listdir``` happens to differ from the number of observations for some folders. However, this does not mean that the actual number of image files is different. Indeed, ```listdir``` returns a list that sometimes contains a ```'.ipynb_checkpoints'``` entry.

In order to check that all files have been correctly downloaded, we will aggregate the ```listdir``` entries of all folders and discard the ```'.ipynb_checkpoints'``` entries. Then, we will extract the ML Catalog Numbers from the file names and compare them with the ones of the observations in the metadata dataframe.

In [14]:
#collect the listdir output of every unique folder path
folder_images = [listdir(folder) for folder in data["storage"].unique()]

#the result is a hierarchical list (one list of entry names per folder)

In [15]:
#turn the list of lists into a dataframe (one row per folder) and flatten the hierarchy
#into a single Series of entry names with stack()
folder_images = pd.DataFrame(folder_images).stack()

In [16]:
#keep only .jpg files in the Series
#endswith() compares literal text; used as a pattern, ".jpg" would let the dot match
#any character and would also accept names that merely contain it somewhere
jpg_suffix = ".jpg"
folder_images = folder_images[folder_images.str.endswith(jpg_suffix)]

#delete .jpg suffix to get ML Catalog Number (only the trailing suffix is cut off)
folder_images = folder_images.str[:-len(jpg_suffix)]

In [17]:
#turn ML numbers from folders into a set
#(they come from file names, i.e. text, and are cast to int - presumably to match the dtype of the metadata column)
folder_images = set(folder_images.astype(int))

#turn ML numbers of metadata into a set
metadata_images = set(data["ML Catalog Number"])

In [18]:
#check if all observations present in the metadata are also present in folders
#(ML numbers that are in the metadata but have no file; expected to be empty)
metadata_images - folder_images

set()

In [19]:
#check if all observations present in folders are also present in the metadata
#(ML numbers that have a file but are not in the metadata; expected to be empty)
folder_images - metadata_images

set()

As expected, $(A\setminus B = \emptyset) \land (B\setminus A = \emptyset) \Leftrightarrow A = B$. The two sets are identical, which means that all images have been downloaded correctly.

## Store aggregated metadata

Run only once !

In [20]:
#complete the "storage" variable so that it points to the image file itself:
#folder path + ML number + .jpg suffix
ml_as_text = data["ML Catalog Number"].astype(str)
data["storage"] = data["storage"] + ml_as_text + ".jpg"

#display the path of the first observation as a check
data["storage"].iloc[0]

'data/images/aquila_chrysaetos/original/198885451.jpg'

In [21]:
#write the aggregated metadata (including the "storage" path of each image) to a csv file
#note: the index is written as well; it holds repeated labels because the dataframes were concatenated without reindexing
data.to_csv("data/metadata/aggregate.csv")