## A few definitions:
1. __Study__: A particular scientific study stored on Movebank's database
2. __Individual__: A single, unique animal (not a species). An individual cannot exist without a study
3. __Event__: A single logged location (latitude and longitude) of a particular individual. An event cannot exist without an individual

In [19]:
import os
import csv
import hashlib
import io
import pickle
from pprint import pprint
import pandas as pd
import requests
# (username, password) pair passed as `auth` to every Movebank request.
# Read from the `mbuser` and `mbpwd` environment variables; a KeyError is
# raised here if either one is not set.
mbauth=(os.environ['mbuser'], os.environ['mbpwd'])

### __callMovebankAPI__: Utility function to request something from Movebank API
#### Parameters:
1. `params`: Tuple of tuples specifying the contents of the request
    * `entity_type`: Type of entity (database entity) to get records of. This can be 'study', 'individual', 'event', 'tag', etc.
    * `study_id`: The id of the study to which the data requested belongs. Required for individuals and events
    * Other parameters of the form `('paramName', paramValue)`

#### Returns: 
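String containing the content of the response to the GET request. The string `'ERR'` is returned if the request fails, and an empty string is returned if the license hash is rejected.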

In [28]:
def callMovebankAPI(params):
    """Send a GET request to Movebank's direct-read service and return the body.

    If the first response contains license terms, the request is repeated once
    with the MD5 digest of that response added as the ``license-md5``
    parameter, re-using the cookies of the first response.

    Args:
        params: Tuple of ``(name, value)`` tuples forming the query string,
            e.g. ``(('entity_type', 'study'),)``.

    Returns:
        The response body decoded as UTF-8 on success, ``''`` if the license
        hash is rejected with HTTP 403, or ``'ERR'`` if a request raises or
        answers with any other non-200 status.
    """
    url = 'https://www.movebank.org/movebank/service/direct-read'
    timeout = (3.05, 21)  # (connect, read) timeouts in seconds

    print('Requesting...', params)
    try:
        response = requests.get(url, params=params, auth=mbauth, timeout=timeout)
    except requests.RequestException as exc:
        # Only network/HTTP failures are turned into 'ERR'; KeyboardInterrupt
        # and programming errors are allowed to propagate.
        print(f'Exception occured for params:\n{params}')
        print(repr(exc))
        return 'ERR'
    print('Request response:', response.url)

    if response.status_code != 200:
        print(str(response.content))
        return 'ERR'

    if b'License Terms:' in response.content:
        print('License terms')
        license_md5 = hashlib.md5(response.content).hexdigest()
        params = params + (('license-md5', license_md5),)
        try:
            response = requests.get(
                url,
                params=params,
                cookies=response.cookies,
                auth=mbauth,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print(f'Exception occured for params:\n{params}')
            print(repr(exc))
            return 'ERR'
        if response.status_code == 403:  # incorrect hash
            print("Incorrect hash")
            return ''
        if response.status_code != 200:
            # Do not hand an error page back to the caller as if it were data.
            print(str(response.content))
            return 'ERR'

    return response.content.decode('utf-8')

### __getStudies__: Function to get all records of all studies
#### Returns:
List of OrderedDicts, each containing one record

In [29]:
def getStudies():
    """Return the study records that pass the visibility filter.

    Requests all entities of type ``study`` and keeps the rows whose
    ``i_can_see_data`` column is ``'true'`` and whose
    ``there_are_data_which_i_cannot_see`` column is ``'false'``.

    Returns:
        A list with one mapping per kept CSV row, or an empty list when the
        request fails or the response is empty.
    """
    query = (
        ('entity_type', 'study'),
        ('i_can_see_data', 'true'),
        ('there_are_data_which_i_cannot_see', 'false'),
    )
    raw_csv = callMovebankAPI(query)
    if raw_csv == 'ERR' or not raw_csv:
        return []

    visible = []
    for row in csv.DictReader(io.StringIO(raw_csv), delimiter=','):
        if row['i_can_see_data'] != 'true':
            continue
        if row['there_are_data_which_i_cannot_see'] != 'false':
            continue
        visible.append(row)
    return visible

### __getStudyIndividuals__: Function to get records of all individuals from a particular study
#### Parameters:
1. `study_id`

#### Returns:
List of OrderedDicts, each containing one record

In [30]:
def getStudyIndividuals(study_id):
    """Return the records of all individuals belonging to one study.

    Args:
        study_id: Id of the study whose individuals are requested.

    Returns:
        A list with one mapping per CSV row of the response, or an empty list
        when the request fails or the response is empty.
    """
    query = (('entity_type', 'individual'), ('study_id', study_id))
    raw_csv = callMovebankAPI(query)
    if raw_csv == 'ERR' or not raw_csv:
        return []
    reader = csv.DictReader(io.StringIO(raw_csv), delimiter=',')
    return list(reader)

### __getEventsOfIndividual__: Function to get records of all events related to one particular individual from one particular study
#### Parameters:
1. `study_id`
2. `individual_id`

#### Returns:
List of OrderedDicts, each containing one record

In [31]:
def getEventsOfIndividual(study_id, individual_id):
    """Return the event records of one individual within one study.

    Args:
        study_id: Id of the study the individual belongs to.
        individual_id: Id of the individual whose events are requested.

    Returns:
        A list with one mapping per CSV row of the response, or an empty list
        when the request fails or the response is empty.
    """
    query = (
        ('entity_type', 'event'),
        ('study_id', study_id),
        ('individual_id', individual_id),
    )
    raw_csv = callMovebankAPI(query)
    if raw_csv == 'ERR' or not raw_csv:
        return []
    reader = csv.DictReader(io.StringIO(raw_csv), delimiter=',')
    return list(reader)

### __getEventsOfStudy__: Function to get records of all events of a particular study
#### Parameters:
1. `study_id`

#### Returns:
List of OrderedDicts, each containing one record

In [56]:
def getEventsOfStudy(study_id):
    """Return the event records of every individual in one study.

    The first 100 characters of the raw response are printed for inspection,
    and a short message is printed when no records can be returned.

    Args:
        study_id: Id of the study whose events are requested.

    Returns:
        A list with one mapping per CSV row of the response, or an empty list
        when the request fails or the response is empty.
    """
    query = (('entity_type', 'event'), ('study_id', study_id))
    raw_csv = callMovebankAPI(query)
    print('Event string:', raw_csv[:100])
    if raw_csv == 'ERR' or not raw_csv:
        print(f'ERR for {study_id}')
        return []
    reader = csv.DictReader(io.StringIO(raw_csv), delimiter=',')
    return list(reader)

In [None]:
# Fetch the study records and load them into a DataFrame (one row per record).
studies = pd.DataFrame(getStudies())

In [36]:
# Narrow the studies table to these five columns, then preview the first rows.
studies = studies[['id', 'acknowledgements', 'taxon_ids', 'sensor_type_ids', 'number_of_individuals']]
studies.head()

Unnamed: 0,id,acknowledgements,taxon_ids,sensor_type_ids,number_of_individuals
0,7006760,,,Argos Doppler Shift,244.0
1,5636685,"Julius Morkunas (captures, logistics, surgerie...","Gavia stellata,Clangula hyemalis,Melanitta fusca",Argos Doppler Shift,17.0
2,910184675,,,,
3,74496970,,Ciconia ciconia,"GPS,Acceleration",72.0
4,492670611,,"Cygnus columbianus,Anser albifrons,Anser fabal...",GPS,228.0


In [37]:
# Independent copy of the study ids; the print shows how many studies there are.
main_table = studies['id'].copy()
print(len(main_table))

688


In [None]:
# Collect the individuals of the first 50 studies, tagging every record with
# the id of the study it was fetched for.
individual_columns = ['study_id', 'id', 'taxon_canonical_name', 'sex']
individual_dicts = []
for study_id in main_table.head(50):
    study_individuals = getStudyIndividuals(study_id)
    for individual in study_individuals:
        individual['study_id'] = study_id
    individual_dicts.extend(study_individuals)
# Passing `columns=` selects the wanted columns and still yields an (empty)
# table with those columns when nothing was fetched, instead of a KeyError.
individuals_table = pd.DataFrame(individual_dicts, columns=individual_columns)
individuals_table.head()

In [79]:
# Rename `id` to `individual_id`, the column name used as a merge key with the
# events table; then preview the first rows.
individuals_table = individuals_table.rename(columns={'id': 'individual_id'})
individuals_table.head()

Unnamed: 0,study_id,individual_id,taxon_canonical_name,sex
0,74496970,75618402,Ciconia ciconia,
1,74496970,75618403,Ciconia ciconia,
2,74496970,75618404,Ciconia ciconia,
3,74496970,75618405,Ciconia ciconia,
4,74496970,75618406,Ciconia ciconia,


In [None]:
# Collect the events of the first 50 studies, tagging every record with the
# id of the study it was fetched for.
event_dicts = []
for study_id in main_table.head(50):
    study_events = getEventsOfStudy(study_id)
    for event in study_events:
        event['study_id'] = study_id
    event_dicts.extend(study_events)
events_table = pd.DataFrame(event_dicts)

In [81]:
# Remove the `tag_id` column before merging, then preview the first rows.
events_table = events_table.drop(columns=['tag_id'])
events_table.head()

Unnamed: 0,timestamp,location_lat,location_long,individual_id,study_id
0,2006-03-22 05:00:00.000,51.828,5.2705,133992600,133992043
1,2006-03-22 07:00:00.000,51.83683,6.16433,133992600,133992043
2,2006-03-22 09:00:00.000,51.96417,7.72917,133992600,133992043
3,2006-03-22 11:00:00.000,52.1975,9.39417,133992600,133992043
4,2006-03-22 13:00:00.000,52.37667,11.25267,133992600,133992043


In [83]:
# Inner join: keep only rows whose (study_id, individual_id) pair appears in
# both the individuals table and the events table. tail() previews the last rows.
main_table_df = pd.merge(individuals_table, events_table, how='inner', on=['study_id', 'individual_id'])
main_table_df.tail()

Unnamed: 0,study_id,individual_id,taxon_canonical_name,sex,timestamp,location_lat,location_long
509863,66480086,66485643,Limosa limosa,f,2014-05-30 03:21:39.000,52.635,5.109
509864,66480086,66485643,Limosa limosa,f,2014-05-30 04:24:54.000,52.635,5.11
509865,66480086,66485643,Limosa limosa,f,2014-05-30 04:46:03.000,52.615,5.05
509866,66480086,66485643,Limosa limosa,f,2014-05-30 05:27:22.000,52.641,5.068
509867,66480086,66485643,Limosa limosa,f,2014-05-30 05:27:22.000,52.639,5.082


In [89]:
# Save the merged table. No `index` argument is given, so the DataFrame index
# is written as the first, unnamed column of the CSV.
main_table_df.to_csv('Data_Collected_1.csv')

In [101]:
# Record the first 50 study ids (the ones looped over above) as both a pickle
# and a CSV file.
with open('study_ids_done.p', 'wb') as f:
    pickle.dump(main_table[:50], f)
main_table[:50].to_csv('study_ids_done.csv')

In [102]:
# Save the complete list of study ids as both a pickle and a CSV file.
with open('all_study_ids.p', 'wb') as f:
    pickle.dump(main_table, f)
main_table.to_csv('all_study_ids.csv')