# ETL Process Tomography Data

Tomography data are available in two folders:
- data/L2 -> STD, SNR, sincos
- data/L3 -> temperature and current

In [1]:
%load_ext dotenv
%dotenv

In [19]:
import os
import re
import glob
import sys
import pymysql.cursors
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sql_queries import *

In [20]:
conn = pymysql.connect(host=os.getenv("MYSQL_HOST"),
                       port=int(os.getenv("MYSQL_PORT")),
                       user=os.getenv("MYSQL_USER"),
                       password=os.getenv("MYSQL_PASS"),
                       database=os.getenv("MYSQL_DB"))
cur = conn.cursor()

If you want to see all the df, you can set max_rows setting as below

In [21]:
# pd.set_option('display.max_rows', 100)
# pd.reset_option('display.max_rows')

# Stations

Because station information is not provided from the data in the folder, we need to assign station information manually.

Station is grouped in a region which identified from the ID, for example GI-01 and GI-02 are both Gili Iyang stations. New stations need to be assigned this ID which agreed by the team.

As mentioned before, data is separated into two folder: L2 and L3. So each station is also mapped to named folder.

In [22]:
stations_df = pd.read_csv('stations.csv')
stations_df

Unnamed: 0,station_id,name,lat,lon,l2_folder,l3_folder
0,GI-01,Gili Iyang 1,,,GI_01,GI01
1,GI-02,Gili Iyang 2,,,GI_02,GI02
2,GI-03,Gili Iyang 3,,,GI_03,GI03
3,GI-04,Gili Iyang 4,,,GI_04,GI04
4,USL-00,Karang Asem-Bali,-8.392,115.713,USL_00,USL00
5,USL-01,Gili Trawangan-Bali,-8.3488,116.024,USL_01,USL01
6,USL-02,Senggigi-NTB,-8.495,116.038,USL_02,USL02


Insert into Stations table

In [23]:
# Replace np.nan with None for MySQL insert to work
stations_df_2 = stations_df.replace({np.nan: None})

for index, row in stations_df_2.iterrows():
    cur.execute(stations_table_insert, list(row))
    conn.commit()

Run `test.ipynb` to see the inserted stations data

# Process L2 Files

First create regex for finding all csv files with specified format.

- sincos_file_format = '1210619_04_30_00.csv'
- snr_file_format = '1210618_20_40_00_SNR.csv'
- stack_file_format = '1210619_04_10_00to1210619_00_00_00.csv'
- std_file_format = '1210619_04_20_00to1210619_00_00_00_std.csv'



In [24]:
# eg: "USL02 2210913_13:50:23 to 2210913_18:20:23 _SDE.csv"
regex_sde = re.compile('.*_SDE.csv')
regex_date = '(?P<{}>[0-9]{{6}}_[0-9]{{2}}(:|_|\uf03a)[0-9]{{2}}(:|_|\uf03a)[0-9]{{2}})?'
regex_std = re.compile('(?P<stname>[A-Za-z0-9]{2,5})?( )?' + 
                       '(?P<st1>[0-9])?' + regex_date.format("date_start") + '( to )?' +
                       '(?P<st2>[0-9])?' + regex_date.format("date_end") + '( )?_SDE.csv')


In [25]:
def get_files(filepath, regex):
    all_files = []
    count_match = 0
    count_not_match = 0

    for root, dirs, files in os.walk(filepath):
        glob_files = glob.glob(os.path.join(root, '*.csv'))
        for f in glob_files:
            head, tail = os.path.split(f)
            if(regex.match(tail)):
                all_files.append(os.path.abspath(f))
                count_match +=1
            else:
                count_not_match+=1
                
    print('Found {} matching files from total {} files'.format(count_match, count_match+count_not_match))
    return all_files

### Get daily SDE (Previously Max SNR/STD) files from all stations

Get all std files from all stations

In [26]:
# std_files = get_files('data/historical_data/L2', regex_std)
# std_files = get_files('data/historical_all_stations', regex_std)
std_files = get_files('data/historical_all_stations', regex_sde)

std_files[:5]

Found 1021 matching files from total 14435 files


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\USL00 0210903_08\uf03a00\uf03a27 to 0210903_14\uf03a40\uf03a27 _SDE.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\USL00 0210903_08\uf03a10\uf03a27 to 0210903_14\uf03a50\uf03a30 _SDE.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\USL00 0210903_08\uf03a20\uf03a27 to 0210903_15\uf03a00\uf03a28 _SDE.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\USL00 0210903_08\uf03a30\uf03a27 to 0210903_15\uf03a10\uf03a27 _SDE.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\USL00 0210903_08\uf03a40\uf03a27 to 0210903_15\uf03a20\uf03a27 _SDE.csv']

Check the std files headers

In [27]:
std_df = pd.read_csv(std_files[0], sep=',')
std_df.columns

Index(['Unnamed: 0', 'df.Time_day', 'df.std.esn'], dtype='object')

Apparently after there are two different type of std csv as we will check further below. 
The one that are we are using is those with `Max_SNR` and `day_in_decimal` header.

Now we are going to build csv files dataframe to easily query which file to ingest

In [28]:
def get_parent_station_folder(filepath):
    return os.path.basename(os.path.dirname(filepath))
def day_decimal_to_timestamp(datetime, day_decimal_ser):
    timestamp_ser = pd.to_datetime(datetime.date()).value + day_decimal_ser%1*86400*1e9
    return pd.to_datetime(timestamp_ser, format='%Y-%m-%d %H:%M:%S')
def parse_date_by_os(str, format):
    if sys.platform == "win32":
        std_date = datetime.strptime(str, format.replace(':', '\uf03a'))
    else:
        std_date = datetime.strptime(str, format)
    return std_date

In [29]:
header_df = pd.DataFrame(columns=['station', 'stfolder', 'date', 'file', 'columns', 'nrows', 'filepath'])
for file in std_files:
    std_df = pd.read_csv(file, sep=',')
    head, filename = os.path.split(file)
    m = regex_std.match(filename)
#     print(filename)
    std_date = parse_date_by_os(m.group("date_start"), '%y%m%d_%H:%M:%S')
#     print(std_date)
    columns_list = list(std_df)
    header_df = header_df.append(
        {
            'file': filename, 
            'columns':''.join(map(str, columns_list)),
            'nrows': len(std_df.index),
            'date': str(std_date.date()),
            'stfolder': get_parent_station_folder(file),
            'filepath': file
        }, ignore_index=True)

# header_df['columns'].unique()


In [30]:
header_df['columns'].unique()

array(['Unnamed: 0df.Time_daydf.std.esn'], dtype=object)

In [31]:
std_files_df = header_df.loc[header_df['columns'] == 'Unnamed: 0df.Time_daydf.std.esn'].copy()
std_files_df

Unnamed: 0,station,stfolder,date,file,columns,nrows,filepath
0,,USL_00,2021-09-03,USL00 0210903_080027 to 0210903_144027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
1,,USL_00,2021-09-03,USL00 0210903_081027 to 0210903_145030 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
2,,USL_00,2021-09-03,USL00 0210903_082027 to 0210903_150028 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
3,,USL_00,2021-09-03,USL00 0210903_083027 to 0210903_151027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
4,,USL_00,2021-09-03,USL00 0210903_084027 to 0210903_152027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
...,...,...,...,...,...,...,...
1016,,USL_02,2021-09-14,USL02 2210914_145022 to 2210914_184022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1017,,USL_02,2021-09-14,USL02 2210914_150022 to 2210914_185022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1018,,USL_02,2021-09-14,USL02 2210914_151022 to 2210914_190022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1019,,USL_02,2021-09-14,USL02 2210914_152022 to 2210914_191022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...


Get all maximum number of rows per day as the main **daily aggregated data**

In [32]:
idx = std_files_df.groupby(["stfolder", "date"])['nrows'].transform(max) == std_files_df['nrows']
std_files_df[idx]

Unnamed: 0,station,stfolder,date,file,columns,nrows,filepath
0,,USL_00,2021-09-03,USL00 0210903_080027 to 0210903_144027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
1,,USL_00,2021-09-03,USL00 0210903_081027 to 0210903_145030 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
2,,USL_00,2021-09-03,USL00 0210903_082027 to 0210903_150028 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
3,,USL_00,2021-09-03,USL00 0210903_083027 to 0210903_151027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
4,,USL_00,2021-09-03,USL00 0210903_084027 to 0210903_152027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
...,...,...,...,...,...,...,...
1016,,USL_02,2021-09-14,USL02 2210914_145022 to 2210914_184022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1017,,USL_02,2021-09-14,USL02 2210914_150022 to 2210914_185022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1018,,USL_02,2021-09-14,USL02 2210914_151022 to 2210914_190022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
1019,,USL_02,2021-09-14,USL02 2210914_152022 to 2210914_191022 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...


In [33]:
# This shows max() of each column which return the largest nrows but it also return the 
# "latest" file name which may not be the one with largest nrows. Use this to get the latest data per day. 
std_files_df = std_files_df.groupby(["stfolder", "date"], as_index=False).max()
std_files_df

Unnamed: 0,stfolder,date,station,file,columns,nrows,filepath
0,USL_00,2021-09-03,,USL00 0210903_180027 to 0210903_235027 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
1,USL_00,2021-09-04,,USL00 0210904_153029 to 0210904_235033 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
2,USL_00,2021-09-05,,USL00 0210905_123039 to 0210905_235034 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
3,USL_00,2021-09-06,,USL00 0210906_180030 to 0210906_235035 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
4,USL_00,2021-09-07,,USL00 0210907_180037 to 0210907_235035 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
5,USL_00,2021-09-08,,USL00 0210908_180036 to 0210908_235040 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
6,USL_00,2021-09-09,,USL00 0210909_190026 to 0210909_234028 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...
7,USL_00,2021-09-10,,USL00 0210910_164024 to 0210910_230025 _SD...,Unnamed: 0df.Time_daydf.std.esn,24,C:\workspace\tomo\tomo-etl\data\historical_all...
8,USL_00,2021-09-11,,USL00 2210911_015024 to 2210911_050024 _SD...,Unnamed: 0df.Time_daydf.std.esn,17,C:\workspace\tomo\tomo-etl\data\historical_all...
9,USL_02,2021-09-04,,USL02 2210904_180027 to 2210904_235028 _SD...,Unnamed: 0df.Time_daydf.std.esn,36,C:\workspace\tomo\tomo-etl\data\historical_all...


In [34]:
# Script to check nrows of each file on a given stfolder and date
# std_files_df.loc[(std_files_df['stfolder']=='GI_01') & (std_files_df['date']==pd.to_datetime('2021-06-22').date())]

### Process std file

Test with one of the file as an example

In [35]:
std_file = std_files_df.iloc[3].filepath
std_date = std_files_df.iloc[3].date

cur.execute(get_station_id_l2_sql, (std_files_df.iloc[3].stfolder))
std_station, = cur.fetchone()

std_data_df = pd.read_csv(std_file, sep=',')
std_data_df.head()

Unnamed: 0.1,Unnamed: 0,df.Time_day,df.std.esn
0,1,18:01:45,59.78995
1,2,18:11:45,63.132871
2,3,18:21:44,68.828904
3,4,18:31:45,66.536077
4,5,18:41:45,66.850467


In [36]:
std_data_df['date'] = std_date
std_data_df['timestamp'] = pd.to_datetime(std_data_df['date'] + ' ' + std_data_df['df.Time_day'])
std_data_df['station'] = std_station

std_data_df

Unnamed: 0.1,Unnamed: 0,df.Time_day,df.std.esn,date,timestamp,station
0,1,18:01:45,59.78995,2021-09-06,2021-09-06 18:01:45,USL-00
1,2,18:11:45,63.132871,2021-09-06,2021-09-06 18:11:45,USL-00
2,3,18:21:44,68.828904,2021-09-06,2021-09-06 18:21:44,USL-00
3,4,18:31:45,66.536077,2021-09-06,2021-09-06 18:31:45,USL-00
4,5,18:41:45,66.850467,2021-09-06,2021-09-06 18:41:45,USL-00
5,6,18:51:45,67.395243,2021-09-06,2021-09-06 18:51:45,USL-00
6,7,19:01:45,70.04025,2021-09-06,2021-09-06 19:01:45,USL-00
7,8,19:11:45,70.890581,2021-09-06,2021-09-06 19:11:45,USL-00
8,9,19:21:45,70.215772,2021-09-06,2021-09-06 19:21:45,USL-00
9,10,19:31:45,68.641919,2021-09-06,2021-09-06 19:31:45,USL-00


Insert into Max SNR/STD table

In [37]:
for index, row in std_data_df[['station', 'timestamp', 'df.std.esn']].iterrows():
    cur.execute(max_snr_table_insert, list(row))
    conn.commit()

If all ok then repeat to all the files.

In [38]:
for index, file in std_files_df.iterrows():
    std_file = file.filepath
    std_date = file.date
    
    cur.execute(get_station_id_l2_sql, (file.stfolder))
    std_station, = cur.fetchone()
    
    std_data_df = pd.read_csv(std_file, sep=',')
    std_data_df['date'] = std_date
    std_data_df['timestamp'] = pd.to_datetime(std_data_df['date'] + ' ' + std_data_df['df.Time_day'])
    std_data_df['station'] = std_station
    for index, row in std_data_df[['station', 'timestamp', 'df.std.esn']].iterrows():
        cur.execute(max_snr_table_insert, list(row))
        conn.commit()

### SDR 

In [39]:
#eg: "0210906_01:00:26to0210906_00:50:25_SDR.csv"
regex_sdr_0 = re.compile('.*(_SDR).csv')
regex_sdr = re.compile('(?P<st1>[0-9])?' + regex_date.format("date_start") + 'to' +
                       '(?P<st2>[0-9])?' + regex_date.format("date_end") + '_SDR.csv')

In [40]:
sdr_files = get_files('data/historical_all_stations', regex_sdr_0)

sdr_files[:5]

Found 1757 matching files from total 14435 files


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\0210905_02\uf03a00\uf03a25to0210905_01\uf03a00\uf03a25_SDR.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\0210905_02\uf03a10\uf03a25to0210905_01\uf03a00\uf03a25_SDR.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\0210905_02\uf03a20\uf03a25to0210905_01\uf03a00\uf03a25_SDR.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\0210905_02\uf03a30\uf03a25to0210905_01\uf03a00\uf03a25_SDR.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\USL_00\\0210905_02\uf03a40\uf03a24to0210905_01\uf03a00\uf03a25_SDR.csv']

In [41]:
header_df = pd.DataFrame(columns=['station', 'stfolder', 'date', 'file', 'columns', 'nrows', 'filepath'])
for file in sdr_files:
    sdr_df = pd.read_csv(file, sep=' ')
    head, filename = os.path.split(file)
    m = regex_sdr.match(filename)
#     print(filename)
    sdr_date = parse_date_by_os(m.group("date_start"), '%y%m%d_%H:%M:%S')
#     print(sdr_date)
    columns_list = list(sdr_df)
    header_df = header_df.append(
        {
            'file': filename, 
            'columns':''.join(map(str, columns_list)),
            'nrows': len(sdr_df.index),
            'datetime': sdr_date,
            'date': sdr_date.date(),
            'stfolder': get_parent_station_folder(file),
            'filepath': file
        }, ignore_index=True)

# header_df['columns'].unique()


In [42]:
header_df['columns'].unique()

array(['Max_SNRday_in_decimal', 'Max_SNRTime_day'], dtype=object)

In [48]:
# For older files 5-10 Sept 2021
# sdr_files_df = header_df.loc[header_df['columns'] == 'Max_SNRday_in_decimal'].copy()
# sdr_files_df

sdr_files_df = header_df.loc[header_df['columns'] == 'Max_SNRTime_day'].copy()
sdr_files_df

Unnamed: 0,station,stfolder,date,file,columns,nrows,filepath,datetime
588,,USL_00,2021-09-10,0210910_050024to0210910_005023_SDR.csv,Max_SNRTime_day,26,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 05:00:24
591,,USL_00,2021-09-10,0210910_053022to0210910_005023_SDR.csv,Max_SNRTime_day,29,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 05:30:22
592,,USL_00,2021-09-10,0210910_054023to0210910_005023_SDR.csv,Max_SNRTime_day,30,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 05:40:23
593,,USL_00,2021-09-10,0210910_055022to0210910_005023_SDR.csv,Max_SNRTime_day,31,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 05:50:22
594,,USL_00,2021-09-10,0210910_060023to0210910_005023_SDR.csv,Max_SNRTime_day,32,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 06:00:23
...,...,...,...,...,...,...,...,...
1752,,USL_02,2021-09-14,2210914_184020to2210914_005021_SDR.csv,Max_SNRTime_day,108,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 18:40:20
1753,,USL_02,2021-09-14,2210914_185020to2210914_005021_SDR.csv,Max_SNRTime_day,109,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 18:50:20
1754,,USL_02,2021-09-14,2210914_190020to2210914_005021_SDR.csv,Max_SNRTime_day,110,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 19:00:20
1755,,USL_02,2021-09-14,2210914_191020to2210914_005021_SDR.csv,Max_SNRTime_day,111,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 19:10:20


Get all maximum number of rows per day as the main **daily aggregated data**

In [49]:
idx = sdr_files_df.groupby(["stfolder", "date"])['nrows'].transform(max) == sdr_files_df['nrows']
sdr_files_df[idx]

Unnamed: 0,station,stfolder,date,file,columns,nrows,filepath,datetime
664,,USL_00,2021-09-10,0210910_230022to0210910_005023_SDR.csv,Max_SNRTime_day,102,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 23:00:22
1247,,USL_02,2021-09-10,2210910_235022to2210910_000022_SDR.csv,Max_SNRTime_day,93,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 23:50:22
1377,,USL_02,2021-09-11,2210911_235021to2210911_000021_SDR.csv,Max_SNRTime_day,136,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-11 23:50:21
1511,,USL_02,2021-09-12,2210912_235021to2210912_000021_SDR.csv,Max_SNRTime_day,140,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-12 23:50:21
1644,,USL_02,2021-09-13,2210913_233019to2210913_003021_SDR.csv,Max_SNRTime_day,136,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-13 23:30:19
1756,,USL_02,2021-09-14,2210914_192020to2210914_005021_SDR.csv,Max_SNRTime_day,112,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 19:20:20


In [50]:
# This shows max() of each column which return the largest nrows but it also return the 
# "latest" file name which may not be the one with largest nrows. Use this to get the latest data per day. 
sdr_files_df = sdr_files_df.groupby(["stfolder", "date"], as_index=False).max()
sdr_files_df

Unnamed: 0,stfolder,date,station,file,columns,nrows,filepath,datetime
0,USL_00,2021-09-10,,0210910_230022to0210910_005023_SDR.csv,Max_SNRTime_day,102,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 23:00:22
1,USL_02,2021-09-10,,2210910_235022to2210910_005022_SDR.csv,Max_SNRTime_day,93,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-10 23:50:22
2,USL_02,2021-09-11,,2210911_235021to2210911_005022_SDR.csv,Max_SNRTime_day,136,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-11 23:50:21
3,USL_02,2021-09-12,,2210912_235021to2210912_005021_SDR.csv,Max_SNRTime_day,140,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-12 23:50:21
4,USL_02,2021-09-13,,2210913_233019to2210913_005020_SDR.csv,Max_SNRTime_day,136,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-13 23:30:19
5,USL_02,2021-09-14,,2210914_192020to2210914_005021_SDR.csv,Max_SNRTime_day,112,C:\workspace\tomo\tomo-etl\data\historical_all...,2021-09-14 19:20:20


In [51]:
# Script to check nrows of each file on a given stfolder and date
# sdr_files_df.loc[(sdr_files_df['stfolder']=='GI_01') & (sdr_files_df['date']==pd.to_datetime('2021-06-22').date())]

In [56]:
for index, file in sdr_files_df.iterrows():
    sdr_file = file.filepath
    sdr_date = file.datetime
    
    cur.execute(get_station_id_l2_sql, (file.stfolder))
    sdr_station, = cur.fetchone()
    
    sdr_data_df = pd.read_csv(sdr_file, sep=' ')
    sdr_data_df['station'] = sdr_station
    
#     sdr_data_df['timestamp'] = day_decimal_to_timestamp(sdr_date, sdr_data_df['day_in_decimal'])
    sdr_data_df['date'] = str(sdr_date)
    sdr_data_df['timestamp'] = pd.to_datetime(sdr_data_df['date'] + ' ' + sdr_data_df['Time_day'])
    
#     print(sdr_data_df.head())
#     break

    for index, row in sdr_data_df[['station', 'timestamp', 'Max_SNR']].iterrows():
        cur.execute(sdr_table_insert, list(row))
        conn.commit()

# Process L3 Files

Next, we want to create regex for L3 csv files with specified format.

- current_file_format = `Curr_G101-GI02.csv`
- temperature_file_format = `Temp_G101-GI03.csv`

In [10]:
regex_current = re.compile('Curr_(?P<src_station>[0-9A-Za-z]{4})-(?P<dest_station>[0-9A-Za-z]{4}).csv')
regex_temp    = re.compile('Temp_(?P<src_station>[0-9A-Za-z]{4})-(?P<dest_station>[0-9A-Za-z]{4}).csv')
regex_test    = re.compile('.*.csv')

### Get All Temperature files

In [11]:
temp_files = get_files('data/historical_data/L3', regex_temp)
temp_files

Found 4 matching files from total 39 files


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Temp_GI01-GI02.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Temp_GI01-GI03.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Temp_GI01-GI04.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st02\\Temp_GI03-GI04.csv']

Take a look on one of the data

In [12]:
temp_df = pd.read_csv(temp_files[0], usecols=['Date_In_Decimal', 'degree_(C)'])
temp_df.columns = ['day_in_decimal', 'temperature']
temp_df

Unnamed: 0,day_in_decimal,temperature
0,24.208333,29.238612
1,24.201389,29.219702
2,24.145833,29.224371
3,24.138889,29.223039


### Process Temperature files

In [15]:
def parse_l3_file(file, regex):
    head, tail = os.path.split(file)
    m = regex.match(tail)
    return m.group('src_station'), m.group('dest_station')

def first_or_create_station_link(cur, src_station, dest_station):
    # Query station id from parsed filename and check file naming consistency
    cur.execute(get_station_id_l3_sql, (src_station))
    src_station_result = cur.fetchone()
    cur.execute(get_station_id_l3_sql, (dest_station))
    dest_station_result = cur.fetchone()
    
    if ((src_station_result is None) or (dest_station_result is None)):
        raise ValueError('Unknown station, please check your L3 file naming')
    else:
        src_station_id, = src_station_result
        dest_station_id, = dest_station_result
        
    # Get or create new station link
    cur.execute(get_station_link_sql, (src_station_id, dest_station_id, 
                                       dest_station_id, src_station_id))
    link_result = cur.fetchone()
        
    if link_result is None:
        link_name = f'{src_station_id}_{dest_station_id}'
        cur.execute(station_link_id_insert, (link_name, src_station_id, dest_station_id))
        conn.commit()
        print(f'Created new station link {link_name}')
        
    cur.execute(get_station_link_sql, (src_station_id, dest_station_id, 
                                       dest_station_id, src_station_id))
    return cur.fetchone()

In [16]:
files_count = 0
total_count = len(temp_files)
for file in temp_files:
    print(f'Processing temperature files: {files_count+1}/{total_count} files')
    
    src_station, dest_station = parse_l3_file(file, regex_temp)
    link_id, link_name, src_id, dest_id = first_or_create_station_link(cur, src_station, dest_station)
    
    # Insert temperature data
    temp_df = pd.read_csv(file, usecols=['Date_In_Decimal', 'degree_(C)'])
    temp_df.columns = ['day_in_decimal', 'temperature']
    temp_df['link_id'] = link_id
    # TODO confirm how to get the day in decimal day
    temp_df['timestamp'] = pd.to_datetime(
        pd.to_datetime('20210531', format="%Y%m%d").value + temp_df['day_in_decimal']*86400*1e9, 
        format='%Y-%m-%d %H:%M:%S'
    )
    for index, row in temp_df[['link_id', 'timestamp', 'temperature']].iterrows():
        cur.execute(temp_table_insert, list(row))
        conn.commit()
    
    files_count += 1

Processing temperature files: 1/4 files
Processing temperature files: 2/4 files
Processing temperature files: 3/4 files
Processing temperature files: 4/4 files


### Get All Current files

In [17]:
current_files = get_files('data/historical_data/L3', regex_current)
current_files

Found 4 matching files from total 39 files


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Curr_GI01-GI02.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Curr_GI01-GI03.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st00\\Curr_GI01-GI04.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_data\\L3\\st02\\Curr_GI03-GI04.csv']

Take a look on one of the data

In [18]:
current_df = pd.read_csv(current_files[0], usecols = ['Date_In_Decimal', 'Current(m/s)'])
current_df.columns = ['day_in_decimal', 'current']
current_df

Unnamed: 0,day_in_decimal,current
0,24.208333,0.118521
1,24.201389,-0.03879
2,24.145833,0.001246
3,24.138889,-0.009708


### Process Current files

In [21]:
files_count = 0
total_count = len(current_files)
for file in current_files:
    print(f'Processing current files: {files_count+1}/{total_count} files')
    
    src_station, dest_station = parse_l3_file(file, regex_current)
    link_id, link_name, src_id, dest_id = first_or_create_station_link(cur, src_station, dest_station)
    
    # Insert current data
    current_df = pd.read_csv(file, usecols=['Date_In_Decimal', 'Current(m/s)'])
    current_df.columns = ['day_in_decimal', 'current']
    current_df['link_id'] = link_id
    # TODO confirm how to get the day in decimal day
    current_df['timestamp'] = pd.to_datetime(
        pd.to_datetime('20210531', format="%Y%m%d").value + current_df['day_in_decimal']*86400*1e9, 
        format='%Y-%m-%d %H:%M:%S'
    )
    current_df['direction'] = None
    for index, row in current_df[['link_id', 'timestamp', 'current', 'direction']].iterrows():
        cur.execute(current_table_insert, list(row))
        conn.commit()
    
    files_count += 1

Processing current files: 1/4 files
Processing current files: 2/4 files
Processing current files: 3/4 files
Processing current files: 4/4 files


# Close connection to db

In [18]:
cur.close()
conn.close()