# __Camino de Santiago__ 
## Pilgrimage Data Retrieval from the Cathedral RESTful Server 
### Notebook uses async calls to avoid DoS defensive timeouts by the Server



#### __Step 1:__ Load libraries

In [1]:

import time            ### To work with time objects
import json            ### To Process returned JSON data
import pandas as pd    ### To work with dataframes
import asyncio         ### To make async request
import httpx           ### To make request to Cathedral URL
import datetime        ### To work with datetime
import openpyxl        ### To same to Excel files


#### __Step 2:__ Initialize pandas DataFrames to be filled

In [3]:
############## Dataframes to fill

# Spanish field / table names; not referenced by the other cells visible here.
tablelist = [
    'Anho', 'Mes', 'Identificador', 'TotalRegistros',
    'Autonomias', 'Caminos', 'Continentes', 'Edades', 'Grupos',
    'Medios', 'Motivos', 'Paises', 'Procedencias', 'Sexos',
]

# Two column layouts: one for the monthly summary table, and one shared by
# every breakdown table.
TOTALS_COLUMNS = ['Anho', 'Mes', 'Identificador', 'TotalRegistros']
BREAKDOWN_COLUMNS = ['id', 'year', 'month', 'Nombre', 'Total', 'Porcentaje']

# Each name gets its own, initially empty, dataframe.
totals_df = pd.DataFrame(columns=TOTALS_COLUMNS)
autonomous_coms_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
routes_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
continents_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
ages_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
groups_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
transportation_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
motives_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
countries_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
origin_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)
gender_df = pd.DataFrame(columns=BREAKDOWN_COLUMNS)


#### __Step 3:__ Define functions that calculate the valid range of dates that may be retrieved from the server

In [4]:
############# Function that calculates the range of valid months
def range_of_months(start_date, end_date):
    """List the [year, month] pairs between two dates.

    The range starts at the month of ``start_date`` and stops just before the
    month of ``end_date`` (the end month itself is not included). Only the
    ``year`` and ``month`` attributes of both arguments are read.

    Returns a list of two-element lists ``[year, month]`` in chronological
    order; the list is empty when the end month is not after the start month.
    """
    # Months are counted on a single running scale so the loop can cross year
    # boundaries without special cases.
    first_serial = start_date.year * 12 + start_date.month
    stop_serial = end_date.year * 12 + end_date.month

    pairs = []
    for serial in range(first_serial, stop_serial):
        # Shift by one so that December maps to index 11 of its own year
        # rather than index 0 of the next one.
        year_part, month_zero_based = divmod(serial - 1, 12)
        first_of_month = datetime.date(year_part, month_zero_based + 1, 1)
        pairs.append([first_of_month.year, first_of_month.month])
    return pairs


# Retrieval window: January 2004 up to the moment the cell is run.
# range_of_months stops before the end date's month, so the month in progress
# is not part of the list.
data_start_date = datetime.date(2004, 1, 1)
data_end_date = datetime.datetime.now()  # or a fixed date, e.g. datetime.date(2011, 2, 1)
year_months = range_of_months(data_start_date, data_end_date)

#### __Step 4:__ Define functions that concatenate newly retrieved data to previously retrieved data

In [6]:
def add_prefix(idx, yr, mnth, df):
    """Put "id", "year" and "month" columns at the front of ``df``.

    The three columns are inserted at positions 0, 1 and 2 and filled with
    ``idx``, ``yr`` and ``mnth`` respectively. ``df`` is modified in place and
    the same object is returned.
    """
    leading_columns = (("id", idx), ("year", yr), ("month", mnth))
    for position, (label, value) in enumerate(leading_columns):
        df.insert(position, label, value)
    return df


def add_to_dataframe(df_running_totals, yr, mnth, df_to_add):
    """Tag a month's rows with their period and append them to the running table.

    ``df_to_add`` receives "id", "year" and "month" columns via ``add_prefix``
    (so it is modified in place); the id is the year followed by the month
    zero-padded to two digits. The return value is a new dataframe holding the
    rows of ``df_running_totals`` followed by the tagged rows, with a fresh
    0..n-1 index. ``df_running_totals`` itself is not modified.
    """
    period_id = "".join((str(yr), str(mnth).zfill(2)))
    add_prefix(period_id, yr, mnth, df_to_add)
    return pd.concat([df_running_totals, df_to_add], ignore_index=True)




#### __Step 5:__ Define the function that splits the retrieved JSON data and converts it into the various dataframes

In [7]:
############# Async function that requests data from the Cathedral's server ################
############# Each call is awaited before the next request is made, to avoid empty responses
############# from the server (a "denial of service" defense mechanism?)


async def get_data(mes, anho):
    """Download one month of statistics and add it to the global dataframes.

    Parameters
    ----------
    mes : month, sent to the server as the ``eMes`` query parameter.
    anho : year, sent to the server as the ``eAnho`` query parameter.

    Side effects
    ------------
    Appends one row to ``totals_df`` and rebinds each breakdown dataframe
    (``autonomous_coms_df`` ... ``gender_df``) to a copy extended with the
    month's rows. Prints a progress line for the month.

    An empty response body adds nothing; a note is printed so that the missing
    month is visible in the cell output.

    Raises
    ------
    json.JSONDecodeError if the body is not valid JSON, KeyError if an
    expected key is absent; errors raised by httpx propagate unchanged.
    """
    global totals_df, autonomous_coms_df, routes_df, continents_df, ages_df
    global groups_df, transportation_df, motives_df, countries_df, origin_df
    global gender_df

    eMes = str(mes)
    eAnho = str(anho)
    url = 'https://catedral.df-server.info/ws/wsCatedral.asmx/ObtenerEstadisticasMes?eAnho=' + eAnho + '&eMes=' + eMes

    # The client is used as a context manager so that its connections are
    # closed after every request instead of accumulating over the whole run.
    async with httpx.AsyncClient() as client:
        async with client.stream('GET', url) as response:
            # A chunk boundary can fall anywhere inside the JSON document, so
            # the pieces are joined before parsing rather than parsed one by one.
            body = ''.join([chunk async for chunk in response.aiter_text()])

    if not body.strip():
        print('Mes:', eMes, 'Anho:', eAnho, '- empty response, no data added')
        return

    json_data = json.loads(body)

    totals_df.loc[len(totals_df.index)] = [
        anho, mes, json_data['Identificador'], json_data['TotalRegistros']]

    # Concatenate new results and running total dataframes, split JSON by key
    autonomous_coms_df = add_to_dataframe(autonomous_coms_df, anho, mes, pd.DataFrame(json_data['Autonomias']))
    routes_df = add_to_dataframe(routes_df, anho, mes, pd.DataFrame(json_data['Caminos']))
    continents_df = add_to_dataframe(continents_df, anho, mes, pd.DataFrame(json_data['Continentes']))
    ages_df = add_to_dataframe(ages_df, anho, mes, pd.DataFrame(json_data['Edades']))
    groups_df = add_to_dataframe(groups_df, anho, mes, pd.DataFrame(json_data['Grupos']))
    transportation_df = add_to_dataframe(transportation_df, anho, mes, pd.DataFrame(json_data['Medios']))
    motives_df = add_to_dataframe(motives_df, anho, mes, pd.DataFrame(json_data['Motivos']))
    countries_df = add_to_dataframe(countries_df, anho, mes, pd.DataFrame(json_data['Paises']))
    origin_df = add_to_dataframe(origin_df, anho, mes, pd.DataFrame(json_data['Procedencias']))
    gender_df = add_to_dataframe(gender_df, anho, mes, pd.DataFrame(json_data['Sexos']))

    print('Mes:', eMes, 'Anho:', eAnho)


data_start_date = datetime.date(2004, 1, 1)
#data_end_date = datetime.date(2004, 3, 1)
data_end_date = datetime.datetime.now()  # datetime.date(2011, 2, 1)
year_months = []
year_months = range_of_months(data_start_date, data_end_date)
#print(year_months)


for year_month in range(len(year_months)):
    await get_data(year_months[year_month][1],year_months[year_month][0])

  

Mes: 1 Anho: 2004
Mes: 2 Anho: 2004
Mes: 3 Anho: 2004
Mes: 4 Anho: 2004
Mes: 5 Anho: 2004
Mes: 6 Anho: 2004
Mes: 7 Anho: 2004
Mes: 8 Anho: 2004
Mes: 9 Anho: 2004
Mes: 10 Anho: 2004
Mes: 11 Anho: 2004
Mes: 12 Anho: 2004
Mes: 1 Anho: 2005
Mes: 2 Anho: 2005
Mes: 3 Anho: 2005
Mes: 4 Anho: 2005
Mes: 5 Anho: 2005
Mes: 6 Anho: 2005
Mes: 7 Anho: 2005
Mes: 8 Anho: 2005
Mes: 9 Anho: 2005
Mes: 10 Anho: 2005
Mes: 11 Anho: 2005
Mes: 12 Anho: 2005
Mes: 1 Anho: 2006
Mes: 2 Anho: 2006
Mes: 3 Anho: 2006
Mes: 4 Anho: 2006
Mes: 5 Anho: 2006
Mes: 6 Anho: 2006
Mes: 7 Anho: 2006
Mes: 8 Anho: 2006
Mes: 9 Anho: 2006
Mes: 10 Anho: 2006
Mes: 11 Anho: 2006
Mes: 12 Anho: 2006
Mes: 1 Anho: 2007
Mes: 2 Anho: 2007
Mes: 3 Anho: 2007
Mes: 4 Anho: 2007
Mes: 5 Anho: 2007
Mes: 6 Anho: 2007
Mes: 7 Anho: 2007
Mes: 8 Anho: 2007
Mes: 9 Anho: 2007
Mes: 10 Anho: 2007
Mes: 11 Anho: 2007
Mes: 12 Anho: 2007
Mes: 1 Anho: 2008
Mes: 2 Anho: 2008
Mes: 3 Anho: 2008
Mes: 4 Anho: 2008
Mes: 5 Anho: 2008
Mes: 6 Anho: 2008
Mes: 7 Anho: 200

#### __Step 6:__ Rename columns to English equivalents

In [8]:
# Original column label -> English heading.
column_names = {
    "id": "ID",
    "year": "Year",
    "month": "Month",
    "Nombre": "Description",
    "Total": "Total",
    "Porcentaje": "Percent",
}

# Relabel every breakdown table in place (totals_df keeps its own labels).
for breakdown_df in (
    autonomous_coms_df,
    routes_df,
    continents_df,
    ages_df,
    groups_df,
    transportation_df,
    motives_df,
    countries_df,
    origin_df,
    gender_df,
):
    breakdown_df.rename(columns=column_names, inplace=True)


#### __Step 7:__ The retrieved totals contain an error; use the gender table to generate a true total with the pandas `groupby` and `sum()` functions

In [9]:
# Add up the gender rows of each (ID, Year, Month) to get that month's total.
# Selecting a single column before sum() yields a Series indexed by the three keys.
totals_true_df = (
    gender_df
    .groupby(['ID', 'Year', 'Month'])['Total']
    .sum()
)


#### __Step 8:__ Output Excel files

In [11]:
### Path to use to save the dataframes to Excel files  ###
# NOTE: absolute, machine-specific folder; adjust it before running elsewhere.
save_path="C:/Sites/CaminoAwait/Data/"

# Output file name -> table to write, listed in the order the files are written.
excel_outputs = {
    "camino_totals.xlsx": totals_df,
    "camino_autonomous_comm.xlsx": autonomous_coms_df,
    "camino_routes.xlsx": routes_df,
    "camino_continents.xlsx": continents_df,
    "camino_ages.xlsx": ages_df,
    "camino_groups.xlsx": groups_df,
    "camino_transportation.xlsx": transportation_df,
    "camino_motives.xlsx": motives_df,
    "camino_countries.xlsx": countries_df,
    "camino_origin.xlsx": origin_df,
    "camino_gender.xlsx": gender_df,
    "camino_totals_true.xlsx": totals_true_df,
}

for file_name, table in excel_outputs.items():
    table.to_excel(save_path + file_name, engine="openpyxl")

#### __END__