In [81]:
import pandas as pd
import re
from datetime import datetime, timezone, timedelta

# Extraction

In [82]:
# Kedro Catalog: load the two input datasets used by the rest of the notebook.
# NOTE(review): `catalog` is not defined in any visible cell; presumably it is
# injected by the Kedro notebook session -- confirm before running standalone.
spreadsheets = catalog.load('spreadsheets')
tangaras = catalog.load('tangaras')

2022-06-06 15:40:14,168 - kedro.io.data_catalog - INFO - Loading data from `spreadsheets` (CSVDataSet)...
2022-06-06 15:40:14,172 - kedro.io.data_catalog - INFO - Loading data from `tangaras` (CSVDataSet)...


In [83]:
# Tangaras: display the sensor inventory loaded from the catalog.
# Its 'Label_ID' column is what get_df_sensors later uses to select sheet columns.
tangaras

Unnamed: 0,MAC,Label_ID,Geolocation,Status
0,D29ESP32DED36FA,Tangara_36FA,,Offline
1,D29ESP32DED1CE2,Tangara_1CE2,,Offline
2,D29ESP32DED1FCA,Tangara_1FCA,,Offline
3,D29ESP32DED14D6,Tangara_14D6,,Offline
4,D29ESP32DED2FF6,Tangara_2FF6,,Offline
5,D29ESP32DED2492,Tangara_2492,,Offline
6,D29TTGOT7D4D7A,Tangara_4D7A,,Offline
7,D29TTGOT7D48C6,CanAirIO_48C6,3.446018 -76.541824,Online
8,D29TTGOT7D532E,CanAirIO_532E,3.446018 -76.541824,Online


In [84]:
# Spreadsheets: display the list of Google Sheets to download.
# get_df_sensors reads the 'ID', 'Name' and 'URL' columns of this frame.
spreadsheets

Unnamed: 0,ID,Name,URL
0,1,Week 1,https://docs.google.com/spreadsheets/d/1pSX8Fg...
1,2,Week 2,https://docs.google.com/spreadsheets/d/1Anihf9...
2,3,Week 3,https://docs.google.com/spreadsheets/d/1fiy3aJ...
3,4,Week 4,https://docs.google.com/spreadsheets/d/1v90xfe...
4,5,Week 5,https://docs.google.com/spreadsheets/d/15-CwLf...


In [85]:
# This function will convert the url to a download link
def convert_gsheets_url(url):
    """Convert a Google Sheets URL into its CSV export (download) URL.

    The document part of the URL (``https://docs.google.com/spreadsheets/d/<id>/``)
    is kept and ``export?format=csv`` is appended. When the URL carries a
    ``#gid=<worksheet>`` fragment, that worksheet id is forwarded as ``&gid=...``
    so the export targets the same tab.

    Args:
        url (str): Google Sheets URL, e.g. ``.../spreadsheets/d/<id>/edit#gid=0``.

    Returns:
        str: The CSV export URL.

    Raises:
        TypeError: If ``url`` is not a string.
        ValueError: If ``url`` does not contain a Google Sheets document path.
    """
    if not isinstance(url, str):
        raise TypeError(
            'url must be a string, got {}'.format(type(url).__name__)
        )

    # Text after the first '#gid=' marker, if any; an empty value is treated
    # the same as a missing one (no gid parameter is appended).
    pieces = url.split('#gid=')
    worksheet_id = pieces[1] if len(pieces) > 1 else None

    # Dots are escaped so that only the literal docs.google.com host matches.
    match = re.search(r'https://docs\.google\.com/spreadsheets/d/.*?/', url)
    if match is None:
        raise ValueError('Not a Google Sheets URL: {!r}'.format(url))

    export_url = match.group(0) + 'export' + '?format=csv'
    if worksheet_id:
        export_url += '&gid={}'.format(worksheet_id)
    return export_url

In [86]:
# Get Data Frame Sensors
def get_df_sensors(spreadsheets, tangaras):
    """Download every listed spreadsheet and keep only the sensor columns.

    Args:
        spreadsheets (pd.DataFrame): One row per sheet, with 'ID', 'Name' and
            'URL' columns.
        tangaras (pd.DataFrame): Sensor inventory; its 'Label_ID' values are
            the column names to keep from each sheet.

    Returns:
        dict: Maps each sheet's 'Name' to a DataFrame restricted to the 'Time'
        column plus the sensor label columns that exist in that sheet. Sheets
        that fail to download or parse are reported and left out.
    """
    # Loop-invariant: same column selection for every sheet.
    wanted_columns = ['Time'] + tangaras['Label_ID'].to_list()

    df_sensors = {}
    for _, row in spreadsheets.iterrows():
        try:
            url = convert_gsheets_url(row['URL'])
            # filter(items=...) silently skips labels missing from this sheet.
            df_sensors[row['Name']] = pd.read_csv(url).filter(items=wanted_columns)
            print('From', row['Name'], 'read successfully')
        except Exception as error:
            # Best effort: one bad sheet must not abort the others, but the
            # cause is reported so the failure can be diagnosed.
            print('Could not read any data from', row['ID'], row['Name'], row['URL'])
            print('  Reason:', repr(error))
    return df_sensors

In [87]:
# Data Frame Sensors: download every sheet; the result is a dict keyed by the
# sheet 'Name'. The keys are displayed to confirm which sheets were read.
df_sensors = get_df_sensors(spreadsheets, tangaras)
df_sensors.keys()

From Week 1 read successfully
From Week 2 read successfully
From Week 3 read successfully
From Week 4 read successfully
From Week 5 read successfully


dict_keys(['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5'])

In [88]:
# Merge Data Frames Sensors
def merge_df_sensors(df_sensors, utc_offset="-05:00"):
    """Concatenate the per-sheet sensor frames into a single DataFrame.

    The 'Time' column is renamed to 'Datetime', parsed, and rewritten as an
    ISO 8601 string with ``utc_offset`` appended. Every other column is cast
    to the nullable 'Int64' dtype.

    Args:
        df_sensors (dict): Maps a sheet name to its DataFrame, as returned by
            get_df_sensors. Each frame is expected to have 'Time' as its first
            column.
        utc_offset (str): Offset suffix appended to every timestamp. Defaults
            to "-05:00".
            NOTE(review): appending assumes the parsed timestamps are naive
            local times -- confirm against the sheets.

    Returns:
        pd.DataFrame: The merged frame. The original row index of each sheet
        is kept, so index values repeat across sheets.

    Raises:
        ValueError: From pd.concat when ``df_sensors`` is empty.
    """
    total_rows = 0
    for sheet_name, frame in df_sensors.items():
        print(sheet_name, 'shape', frame.shape)
        total_rows += frame.shape[0]
    print('Total Rows:', total_rows)

    merged = pd.concat(list(df_sensors.values())).rename(columns={'Time': 'Datetime'})

    timestamps = pd.to_datetime(merged['Datetime'])
    # Missing timestamps stay missing instead of becoming the string "NaT-05:00".
    merged['Datetime'] = timestamps.apply(
        lambda ts: ts.isoformat() + utc_offset if pd.notna(ts) else None
    )

    # Everything after the first ('Datetime') column is a sensor reading.
    sensor_columns = merged.columns.to_list()[1:]
    if sensor_columns:
        merged[sensor_columns] = merged[sensor_columns].astype('Int64')

    print('Columns:', list(merged.columns))
    print('Data Frame Sensors Shape:', merged.shape)
    return merged

In [89]:
# Data Frame Sensors: merge the per-sheet frames into one DataFrame and preview it.
# NOTE(review): this rebinds `df_sensors` from the dict returned by
# get_df_sensors to a DataFrame, so re-running this cell on its own passes the
# already-merged frame back in; re-run the get_df_sensors cell first.
df_sensors = merge_df_sensors(df_sensors)
df_sensors.head()

Week 1 shape (14395, 10)
Week 2 shape (20159, 9)
Week 3 shape (20093, 9)
Week 4 shape (20082, 9)
Week 5 shape (23040, 9)
Total Rows: 97769
Columns: ['Datetime', 'Tangara_36FA', 'Tangara_1CE2', 'Tangara_1FCA', 'Tangara_14D6', 'Tangara_2FF6', 'Tangara_2492', 'Tangara_4D7A', 'CanAirIO_48C6', 'CanAirIO_532E']
Data Frame Sensors Shape: (97769, 10)


Unnamed: 0,Datetime,Tangara_36FA,Tangara_1CE2,Tangara_1FCA,Tangara_14D6,Tangara_2FF6,Tangara_2492,Tangara_4D7A,CanAirIO_48C6,CanAirIO_532E
0,2022-03-30T00:00:00-05:00,5,4.0,,,3,,2,,
1,2022-03-30T00:00:30-05:00,5,4.0,5.0,4.0,4,,2,3.0,
2,2022-03-30T00:01:00-05:00,4,,5.0,5.0,4,,2,2.0,
3,2022-03-30T00:01:30-05:00,5,4.0,5.0,4.0,4,,1,3.0,
4,2022-03-30T00:02:00-05:00,5,4.0,4.0,4.0,3,,2,1.0,


In [90]:
# Check Data Types: 'Datetime' is expected to be object (ISO 8601 strings) and
# every sensor column the nullable 'Int64' dtype set by merge_df_sensors.
df_sensors.dtypes

Datetime         object
Tangara_36FA      Int64
Tangara_1CE2      Int64
Tangara_1FCA      Int64
Tangara_14D6      Int64
Tangara_2FF6      Int64
Tangara_2492      Int64
Tangara_4D7A      Int64
CanAirIO_48C6     Int64
CanAirIO_532E     Int64
dtype: object

In [91]:
# Save df_sensors into Catalog: persist the merged frame under the
# 'raw_data_sensors_csv' catalog entry.
catalog.save('raw_data_sensors_csv', df_sensors)

2022-06-06 15:40:30,225 - kedro.io.data_catalog - INFO - Saving data to `raw_data_sensors_csv` (CSVDataSet)...
