# __Camino de Santiago__ 
## Pilgrimage Data Retrieval from the Cathedral RESTful Server
### Notebook uses async calls to avoid DoS defensive timeouts by the Server


#### __Step 1:__ Load libraries

In [1]:
import asyncio
import httpx
import time
import json
import pandas as pd
import datetime
import openpyxl


#### __Step 2:__ Initialize pandas DataFrames to be filled

In [32]:
############## Dataframes to fill

# Names of the four totals columns followed by the ten breakdown tables
# that are read from each monthly JSON payload.
tablelist = [
    'Anho', 'Mes', 'Identificador', 'TotalRegistros',
    'Autonomias', 'Caminos', 'Continentes', 'Edades', 'Grupos',
    'Medios', 'Motivos', 'Paises', 'Procedencias', 'Sexos',
]

# One row per retrieved month.
totals_df = pd.DataFrame(
    columns=['Anho', 'Mes', 'Identificador', 'TotalRegistros'])

# Every breakdown table shares the same layout: the period prefix
# (id, year, month) followed by the three per-entry fields.
breakdown_columns = ['id', 'year', 'month', 'Nombre', 'Total', 'Porcentaje']

# Ten independent, empty dataframes, one per breakdown table.
(
    autonomous_coms_df,
    routes_df,
    continents_df,
    ages_df,
    groups_df,
    transportation_df,
    motives_df,
    countries_df,
    origin_df,
    gender_df,
) = (pd.DataFrame(columns=breakdown_columns) for _ in range(10))


#### __Step 3:__ Define functions that calculate the valid range of dates that may be retrieved from the server

In [34]:
############# Function that calculates the range of valid months
def range_of_months(start_date, end_date):
    """Return the [year, month] pairs from start_date up to, but excluding, end_date.

    Only the ``year`` and ``month`` attributes of the two arguments are read.
    The month that end_date falls in is not included in the result, and an
    end_date that is not after start_date's month yields an empty list.
    """
    # Count months on a single running scale so that year boundaries need
    # no special handling: count = year * 12 + month.
    first = start_date.year * 12 + start_date.month
    stop = end_date.year * 12 + end_date.month

    # Shifting by one before dividing maps month 12 to the correct year.
    return [
        [(count - 1) // 12, (count - 1) % 12 + 1]
        for count in range(first, stop)
    ]


# Retrieval window: from January 2004 up to the present moment.
# range_of_months excludes the month of the end date, so the current
# month is not part of year_months.
data_start_date = datetime.date(2004, 1, 1)
data_end_date = datetime.datetime.now()

year_months = range_of_months(data_start_date, data_end_date)

#### __Step 4:__ Define functions that concatenate newly retrieved data to previously retrieved data

In [35]:
def add_prefix(idx, yr, mnth, df):
    """Prepend "id", "year" and "month" columns to df in place and return df.

    Each new column holds one value repeated on every row: idx, yr and
    mnth respectively. The existing columns are shifted to the right.
    """
    leading_columns = (("id", idx), ("year", yr), ("month", mnth))
    for position, (label, value) in enumerate(leading_columns):
        df.insert(position, label, value)
    return df


def add_to_dataframe(df_running_totals, yr, mnth, df_to_add):
    """Return a new dataframe: df_running_totals followed by the rows of df_to_add.

    df_to_add is first tagged, in place, with "id", "year" and "month"
    columns via add_prefix; the id is the year followed by the month
    zero-padded to two characters. df_running_totals itself is not
    modified, so the caller must keep the returned dataframe.
    """
    period_id = f"{yr!s}{str(mnth).zfill(2)}"
    add_prefix(period_id, yr, mnth, df_to_add)

    # ignore_index renumbers the rows 0..n-1 across both inputs.
    return pd.concat([df_running_totals, df_to_add], ignore_index=True)




#### __Step 5:__ Define function that splits and converts retrieved JSON data into the various dataframes

In [36]:



def json_to_dataframes(json_data, mes, anho, tot_df, auto_coms_df, routs_df,
                       contis_df, ags_df, grps_df, trans_df, mots_df,
                       countis_df, orgs_df, gens_df):
    """Append one month of parsed JSON data to the given dataframes, in place.

    A single row [anho, mes, Identificador, TotalRegistros] is added to
    tot_df. For each of the ten breakdown keys of json_data, every entry
    is added as a row to the matching dataframe, prefixed with "id"
    (year followed by the two-character zero-padded month), "year" and
    "month".

    All dataframes are modified in place and nothing is returned. Rebinding
    the parameter names to new dataframes would leave the caller's objects
    untouched, so rows are written into the objects that were passed in.

    Parameters:
        json_data: mapping with the keys 'Identificador', 'TotalRegistros'
            and the ten breakdown keys listed below. Each breakdown value
            is handed to pd.DataFrame unchanged (presumably a list of
            records; confirm against the server's payload).
        mes: month of the payload.
        anho: year of the payload.
        tot_df ... gens_df: destination dataframes. New rows are labelled
            len(df.index), which assumes each keeps a default 0..n-1 index.

    Raises:
        KeyError: if json_data lacks one of the expected keys.
        ValueError: if a breakdown already contains an "id", "year" or
            "month" column.
    """
    tot_df.loc[len(tot_df.index)] = [
        anho, mes, json_data['Identificador'], json_data['TotalRegistros']]

    period_id = str(anho) + str(mes).zfill(2)

    destinations = (
        ('Autonomias', auto_coms_df),
        ('Caminos', routs_df),
        ('Continentes', contis_df),
        ('Edades', ags_df),
        ('Grupos', grps_df),
        ('Medios', trans_df),
        ('Motivos', mots_df),
        ('Paises', countis_df),
        ('Procedencias', orgs_df),
        ('Sexos', gens_df),
    )

    for key, target in destinations:
        incoming = pd.DataFrame(json_data[key])
        incoming.insert(0, 'id', period_id)
        incoming.insert(1, 'year', anho)
        incoming.insert(2, 'month', mes)

        # Keep fields the destination has not seen before instead of
        # silently dropping them; earlier rows get None for such a field.
        for column in incoming.columns:
            if column not in target.columns:
                target[column] = None

        # Line the new rows up with the destination's column order; fields
        # missing from the payload become NaN.
        aligned = incoming.reindex(columns=target.columns)
        for values in aligned.itertuples(index=False, name=None):
            # Assigning to a label that does not exist yet appends a row
            # to the existing dataframe object.
            target.loc[len(target.index)] = list(values)


#### __Step 6:__ Define the async function that requests data from the server, then retrieve every month in the valid range

In [37]:
############# Async function that requests data from the Cathedral's server ################
############# waits for response from server before it continues with requests to avoid empty responses (a "denial of service" defense mechanism? )


async def get_data(mes, anho):
    """Request one month of statistics from the server and store the result.

    Builds the ObtenerEstadisticasMes URL from the given month and year,
    reads the whole response body, parses it as JSON and passes it to
    json_to_dataframes together with the module-level dataframes. An empty
    response body is skipped without recording anything.

    Parameters:
        mes: month to request (sent as the eMes query parameter).
        anho: year to request (sent as the eAnho query parameter).

    Raises:
        json.JSONDecodeError: if a non-empty body is not valid JSON.
    """
    eMes = str(mes)
    eAnho = str(anho)
    url = 'https://catedral.df-server.info/ws/wsCatedral.asmx/ObtenerEstadisticasMes?eAnho=' + \
        eAnho + '&eMes=' + eMes

    # Using the client as a context manager closes it (and its
    # connections) even when the request or the parsing fails.
    async with httpx.AsyncClient() as client:
        async with client.stream('GET', url) as response:
            # The text iterator may split the body at arbitrary points, so
            # collect every chunk before parsing: a single chunk is not
            # guaranteed to be a complete JSON document.
            body = ''.join([chunk async for chunk in response.aiter_text()])

    if not body:
        # The server answered with an empty body; nothing to record.
        return

    json_data = json.loads(body)
    json_to_dataframes(json_data, mes, anho, totals_df, autonomous_coms_df,
                       routes_df, continents_df, ages_df, groups_df,
                       transportation_df, motives_df, countries_df,
                       origin_df, gender_df)
    print('Mes:', eMes, 'Anho:', eAnho)
    print("start:", json_data, ":end", len(body))


data_start_date = datetime.date(2004, 1, 1)
#data_end_date = datetime.date(2004, 3, 1)
data_end_date = datetime.datetime.now()  # datetime.date(2011, 2, 1)
year_months = []
year_months = range_of_months(data_start_date, data_end_date)
#print(year_months)


for year_month in range(len(year_months)):
    print("year:", year_months[year_month][0]," month:", year_months[year_month][1])
    await get_data(year_months[year_month][0],year_months[year_month][1])

   


year: 2004  month: 1
year: 2004  month: 2
year: 2004  month: 3
year: 2004  month: 4
year: 2004  month: 5
year: 2004  month: 6
year: 2004  month: 7
year: 2004  month: 8
year: 2004  month: 9
year: 2004  month: 10
year: 2004  month: 11
year: 2004  month: 12
year: 2005  month: 1
year: 2005  month: 2
year: 2005  month: 3
year: 2005  month: 4
year: 2005  month: 5
year: 2005  month: 6
year: 2005  month: 7
year: 2005  month: 8
year: 2005  month: 9
year: 2005  month: 10
year: 2005  month: 11
year: 2005  month: 12
year: 2006  month: 1
year: 2006  month: 2
year: 2006  month: 3
year: 2006  month: 4
year: 2006  month: 5
year: 2006  month: 6
year: 2006  month: 7
year: 2006  month: 8
year: 2006  month: 9
year: 2006  month: 10
year: 2006  month: 11
year: 2006  month: 12
year: 2007  month: 1
year: 2007  month: 2
year: 2007  month: 3
year: 2007  month: 4
year: 2007  month: 5
year: 2007  month: 6
year: 2007  month: 7
year: 2007  month: 8
year: 2007  month: 9
year: 2007  month: 10
year: 2007  month: 11
ye

#### __Step 7:__ Output Excel files

In [31]:

# Destination folder for the generated workbooks.
save_path="C:/Sites/camino_xlsx/"

# Each dataframe is written to its own workbook, in this order.
workbooks = (
    ("camino_total.xlsx", totals_df),
    ("camino_autonomous_comm.xlsx", autonomous_coms_df),
    ("camino_routes.xlsx", routes_df),
    ("camino_continents.xlsx", continents_df),
    ("camino_ages.xlsx", ages_df),
    ("camino_groups.xlsx", groups_df),
    ("camino_transportation.xlsx", transportation_df),
    ("camino_motives.xlsx", motives_df),
    ("camino_countries.xlsx", countries_df),
    ("camino_origin.xlsx", origin_df),
    ("camino_gender.xlsx", gender_df),
)

for file_name, frame in workbooks:
    frame.to_excel(save_path + file_name, engine="openpyxl")

#### __END__