## Few definitions:
1. __Study__: A particular scientific study stored on Movebank's database
2. __Individual__: Used to refer to a particular animal (not species, a single unique animal). It has no existence without study
3. __Event__: This refers to a particular log of location (latitude and longitude) of a particular individual. It has no existence without individual

In [2]:
import os
import csv
import hashlib
import io
import pickle
from pprint import pprint
import pandas as pd
import requests
mbauth=(os.environ['mbuser'], os.environ['mbpwd'])

### __callMovebankAPI__: Utility function to request something from Movebank API
#### Parameters:
1. `params`: Tuple of tuples specifying the contents of the request
    * `entity-type`: Type of entity (database entity) to get records of. This can be 'study', 'individual', 'event', 'tag' et cetera.
    * `study_id`: The id of the study to which the data requested belongs. Required for individuals and events
    * Other parameters of the form `('paramName', paramValue)`

#### Returns: 
String containing content of response to GET request

In [3]:
def callMovebankAPI(params):
    print('Requesting...', params)
    try:
        response = requests.get('https://www.movebank.org/movebank/service/direct-read', params = params, auth = mbauth, timeout=(3.05, 21))
    except:
        print(f'Exception occured for params:\n{params}')
        return 'ERR'
    print('Request response:', response.url)
    if response.status_code == 200:
        if 'License Terms:' in str(response.content):
            print('License terms')
            hash = hashlib.md5(response.content).hexdigest()
            params = params + (('license-md5', hash),)
            try:
                response = requests.get('https://www.movebank.org/movebank/service/direct-read', params=params, cookies=response.cookies, auth=mbauth, timeout=(3.05, 21))
            except:
                print(f'Exception occured for params:\n{params}')
                return 'ERR'
            if response.status_code == 403:  # incorrect hash
                print("Incorrect hash")
                return ''
        return response.content.decode('utf-8')
    print(str(response.content))
    return 'ERR'

### __getStudies__: Function to get all records of all studies
#### Returns:
List of OrderedDicts, each containing one record

In [4]:
def getStudies():
	studies = callMovebankAPI((('entity_type', 'study'), ('i_can_see_data', 'true'), ('there_are_data_which_i_cannot_see', 'false')))
	if studies != 'ERR' and len(studies) > 0:
		studies = csv.DictReader(io.StringIO(studies), delimiter=',')
		return [s for s in studies if s['i_can_see_data'] == 'true' and s['there_are_data_which_i_cannot_see'] == 'false']
	return []

### __getStudyIndividuals__: Function to get records of all individuals from a particular study
#### Parameters:
1. `study_id`

#### Returns:
List of OrderedDicts, each containing one record

In [5]:
def getStudyIndividuals(study_id):
    individuals = callMovebankAPI((('entity_type', 'individual'),('study_id', study_id)))
    if individuals != 'ERR' and len(individuals) > 0:
        return list(csv.DictReader(io.StringIO(individuals), delimiter=','))
    return []

### __getEventsOfIndividual__: Function to get records of all events related to one particular individual from one particular study
#### Parameters:
1. `study_id`
2. `individual_id`

#### Returns:
List of OrderedDicts, each containing one record

In [6]:
def getEventsOfIndividual(study_id, individual_id):

    params = (('entity_type', 'event'), ('study_id', study_id),
              ('individual_id', individual_id))
    events = callMovebankAPI(params)
    if events != 'ERR' and len(events) > 0:
        return list(csv.DictReader(io.StringIO(events), delimiter=','))
    return []

### __getEventsOfStudy__: Function to get records of all events of a particular study
#### Parameters:
1. `study_id`

#### Returns:
List of OrderedDicts, each containing one record

In [7]:
def getEventsOfStudy(study_id):

    params = (('entity_type', 'event'), ('study_id', study_id))
    events = callMovebankAPI(params)
    print('Event string:', events[:100])
    if events != 'ERR' and len(events) > 0:
        return list(csv.DictReader(io.StringIO(events), delimiter=','))
    print(f'ERR for {study_id}')
    return []

In [7]:
studies = pd.DataFrame(getStudies())

Requesting... (('entity_type', 'study'), ('i_can_see_data', 'true'), ('there_are_data_which_i_cannot_see', 'false'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=study&i_can_see_data=true&there_are_data_which_i_cannot_see=false


In [8]:
studies = studies[['id', 'acknowledgements', 'taxon_ids', 'sensor_type_ids', 'number_of_individuals']]
studies.head()

Unnamed: 0,id,acknowledgements,taxon_ids,sensor_type_ids,number_of_individuals
0,7006760,,,Argos Doppler Shift,244.0
1,5636685,"Julius Morkunas (captures, logistics, surgerie...","Gavia stellata,Clangula hyemalis,Melanitta fusca",Argos Doppler Shift,17.0
2,910184675,,,,
3,74496970,,Ciconia ciconia,"GPS,Acceleration",72.0
4,492670611,,"Cygnus columbianus,Anser albifrons,Anser fabal...",GPS,228.0


In [9]:
main_table = studies['id'].copy()
print(len(main_table))

685


In [8]:
done_ids = pickle.load(open('study_ids_done.p', 'rb'))
print(done_ids)

0        7006760
1        5636685
2      910184675
3       74496970
4      492670611
5      495044164
6      325569416
7        6946314
8      560041066
9       20106351
10     133992043
11     552663534
12     505156776
13     736029750
14    1070442621
15     467038889
16     178797918
17     890556697
18      49916069
19     190490326
20      20873986
21     657674643
22     926080482
23    1050539212
24     446053425
25     685709959
26     313789633
27      21231406
28     487888187
29     156746015
30       2235650
31       6956518
32        446575
33     403960582
34     625284084
35     434277141
36       9103765
37      20202974
38      49471886
39      49535504
40     178979729
41    1060597624
42      16557786
43       1924571
44      66480086
45      32430844
46     250149802
47     178994931
48     453975008
49     529744543
Name: id, dtype: object


In [9]:
all_ids = pickle.load(open('all_study_ids.p', 'rb'))
print(all_ids.head())

0      7006760
1      5636685
2    910184675
3     74496970
4    492670611
Name: id, dtype: object


In [10]:
done_ids_set = set(done_ids)
all_ids_set = set(all_ids)
remaining_ids_set = all_ids_set.difference(done_ids)
print(len(remaining_ids_set))
remaining_ids = pd.Series(list(remaining_ids_set))

638


In [11]:
today_ids = list(remaining_ids)[:50]

# THIS IS IT

In [12]:
individual_dicts = []
empty = 0
count = 0
for i in today_ids:
    print(count)
    count += 1
    response = getStudyIndividuals(i)
    if len(response) == 0:
        print('Empty response')
        empty+=1
    for d in response:
        d['study_id'] = i
    individual_dicts += response
individuals_table = pd.DataFrame(individual_dicts)[['study_id', 'id', 'taxon_canonical_name', 'sex']]

individuals_table.head()

0
Requesting... (('entity_type', 'individual'), ('study_id', '10763606'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=individual&study_id=10763606
1
Requesting... (('entity_type', 'individual'), ('study_id', '569354581'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=individual&study_id=569354581
2
Requesting... (('entity_type', 'individual'), ('study_id', '506737915'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=individual&study_id=506737915
3
Requesting... (('entity_type', 'individual'), ('study_id', '1078307997'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=individual&study_id=1078307997
Empty response
4
Requesting... (('entity_type', 'individual'), ('study_id', '1518377'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=individual&study_id=1518377
Empty response
5
Requesting... (('entity_typ

Unnamed: 0,study_id,id,taxon_canonical_name,sex
0,10763606,10792426,Ciconia ciconia,
1,10763606,10792411,Ciconia ciconia,
2,10763606,10792418,Ciconia ciconia,
3,10763606,10792386,Ciconia ciconia,
4,10763606,10792401,Ciconia ciconia,


In [13]:
print('Empty studies:', empty)

Empty studies: 25


In [14]:
individuals_table = individuals_table.rename(columns={'id': 'individual_id'})
individuals_table.head()

Unnamed: 0,study_id,individual_id,taxon_canonical_name,sex
0,10763606,10792426,Ciconia ciconia,
1,10763606,10792411,Ciconia ciconia,
2,10763606,10792418,Ciconia ciconia,
3,10763606,10792386,Ciconia ciconia,
4,10763606,10792401,Ciconia ciconia,


In [15]:
event_dicts = []
empty2 = 0
count2 = 0
for i in today_ids:
    print(count2)
    count2 += 1
    response = getEventsOfStudy(i)
    if len(response) == 0:
        empty += 1
    for d in response:
        d['study_id'] = i
    event_dicts += response
events_table = pd.DataFrame(event_dicts)

0
Requesting... (('entity_type', 'event'), ('study_id', '10763606'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=event&study_id=10763606
Event string: timestamp,location_lat,location_long,individual_id,tag_id

1
Requesting... (('entity_type', 'event'), ('study_id', '569354581'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=event&study_id=569354581
Event string: timestamp,location_lat,location_long,individual_id,tag_id

2
Requesting... (('entity_type', 'event'), ('study_id', '506737915'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=event&study_id=506737915
Event string: timestamp,location_lat,location_long,individual_id,tag_id
2002-07-05 07:48:28.000,70.006,-152.846,5
3
Requesting... (('entity_type', 'event'), ('study_id', '1078307997'))
Request response: https://www.movebank.org/movebank/service/direct-read?entity_type=event&study_id=1078307997
Event string: <p>No data

In [17]:
print("Empty=", empty)

Empty= 64


In [18]:
events_table = events_table.drop(columns=['tag_id'])
events_table.head()

Unnamed: 0,timestamp,location_lat,location_long,individual_id,study_id
0,2002-07-05 07:48:28.000,70.006,-152.846,506739543,506737915
1,2002-07-05 09:35:33.000,69.995,-152.857,506739543,506737915
2,2002-07-05 11:15:29.000,70.002,-152.86,506739543,506737915
3,2002-07-06 12:49:52.000,70.007,-152.88,506739543,506737915
4,2002-07-06 13:28:14.000,70.004,-152.878,506739543,506737915


In [19]:
main_table_df = pd.merge(individuals_table, events_table, how='inner', on=['study_id', 'individual_id'])
main_table_df.tail()

Unnamed: 0,study_id,individual_id,taxon_canonical_name,sex,timestamp,location_lat,location_long
1162778,917906795,917983448,Falco rusticolus,f,2002-10-13 20:06:19.000,65.288,-53.00200000000001
1162779,917906795,917983448,Falco rusticolus,f,2002-10-13 20:16:02.000,65.286,-52.95400000000001
1162780,917906795,917983448,Falco rusticolus,f,2002-10-13 20:17:04.000,65.304,-52.971
1162781,917906795,917983448,Falco rusticolus,f,2002-10-13 20:25:15.000,65.299,-52.92399999999998
1162782,917906795,917983448,Falco rusticolus,f,2002-10-13 21:13:49.000,65.09,-54.28499999999997


In [20]:
main_table_df.to_csv('Data_Collected_2.csv')

In [None]:
done_ids_set = done_ids_set.union(set(today_ids))

In [101]:
with open('study_ids_done.p', 'wb') as f:
    pickle.dump(main_table[:50], f)
main_table[:50].to_csv('study_ids_done.csv')

In [102]:
with open('all_study_ids.p', 'wb') as f:
    pickle.dump(main_table, f)
main_table.to_csv('all_study_ids.csv')