# ETL Process Tomography Data

Tomography data are available in two folders:
- data/L2 -> STD, SNR, sincos
- data/L3 -> temperature and current

In [1]:
import os
import re
import glob
import pymysql.cursors
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sql_queries import *

In [42]:
conn = pymysql.connect(host='localhost', user='root', password='mypassword', database='tomo')
cur = conn.cursor()

In [113]:
# pd.set_option('display.max_rows', 100)
# pd.reset_option('display.max_rows')

# Stations

Because station information is not provided from the data in the folder, we need to assign station information manually.

Station is grouped in a region which identified from the ID, for example GI-01 and GI-02 are both Gili Iyang stations. New stations need to be assigned this ID which agreed by the team.

As mentioned before, data is separated into two folder: L2 and L3. So each station is also mapped to named folder.

In [3]:
stations_id = ['GI-01', 'GI-02', 'GI-03', 'GI-04']
stations_name = [None, None, None, None]
stations_l2_folder = ['GI_01', 'GI_02', 'GI_03', 'GI_04']
stations_l3_folder = ['st00', 'st01', 'st02', 'st03']
stations_current_folder = ['st00', 'st01', 'st02', 'st03']
stations_lat = [None, None, None, None]
stations_lon = [None, None, None, None]

stations_df = pd.DataFrame({
    'station_id': stations_id, 
    'name': stations_name,
    'l2_folder': stations_l2_folder,
    'l3_folder': stations_l3_folder,
    'lat': stations_lat,
    'lon': stations_lon
})

stations_df

Unnamed: 0,station_id,name,l2_folder,l3_folder,lat,lon
0,GI-01,,GI_01,st00,,
1,GI-02,,GI_02,st01,,
2,GI-03,,GI_03,st02,,
3,GI-04,,GI_04,st03,,


In [4]:
def get_station_id(stations_df, stfolder):
    result = stations_df.loc[(stations_df['l2_folder']==stfolder) | 
                             (stations_df['l3_folder']==stfolder)]
    if len(result)>1:
        raise ValueError('Multiple station folders found') 
    return result.iloc[0].station_id

In [5]:
get_station_id(stations_df, 'GI_01')

'GI-01'

Insert into Stations table

In [43]:
for index, row in stations_df[['station_id', 'name', 'lat', 'lon']].iterrows():
    cur.execute(stations_table_insert, list(row))
    conn.commit()

Run `test.ipynb` to see the inserted stations data

# Process L2 Files

First create regex for finding all csv files with specified format.

- sincos_file_format = '1210619_04_30_00.csv'
- snr_file_format = '1210618_20_40_00_SNR.csv'
- stack_file_format = '1210619_04_10_00to1210619_00_00_00.csv'
- std_file_format = '1210619_04_20_00to1210619_00_00_00_std.csv'



In [7]:
regex_date      = '([0-9]{6}_[0-9]{2}_[0-9]{2}_[0-9]{2})'
regex_sincos    = re.compile('(?P<station>[0-9])' + regex_date + '.csv')
regex_snr       = re.compile('(?P<station>[0-9])' + regex_date + '_SNR.csv')
regex_stack     = re.compile('(?P<station>[0-9])' + regex_date + 'to' +
                             '(?P<station2>[0-9])' + regex_date + '.csv')
regex_std       = re.compile('(?P<station>[0-9])' + regex_date + 'to' +
                             '(?P<station2>[0-9])' + regex_date + '_std.csv')

In [8]:
def get_files(filepath, regex):
    all_files = []
    count_match = 0
    count_not_match = 0

    for root, dirs, files in os.walk(filepath):
        glob_files = glob.glob(os.path.join(root, '*.csv'))
        for f in glob_files:
            head, tail = os.path.split(f)
            if(regex.match(tail)):
                all_files.append(os.path.abspath(f))
                count_match +=1
            else:
                count_not_match+=1
    
    print('Found {} matching files from total {} files'.format(count_match, count_match+count_not_match))
    return all_files

If you want to see all the df, you can set max_rows setting as below

### Get daily Max SNR/STD files from all stations

Get all std files from all stations

In [31]:
std_files = get_files('data/historical_all_stations', regex_std)
# std_files = get_files('data/historical_data', regex_std)

std_files[:5]

Found 2604 matching files from total 11656 files


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_30_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_40_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_50_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_05_00_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_05_10_00to1210618_00_00_00_std.csv']

Check the std files headers

In [32]:
std_df = pd.read_csv(std_files[0], sep=' ')
std_df.columns

Index(['Date_l', 'Time_l', 'snr_l'], dtype='object')

In [33]:
std_df_2 = pd.read_csv(std_files[134], sep=' ')
std_df_2.columns

Index(['Max_SNR', 'day_in_decimal'], dtype='object')

Apparently after there are two different type of std csv as we will check further below. 
The one that are we are using is those with `Max_SNR` and `day_in_decimal` header.

Now we are going to build csv files dataframe to easily query which file to ingest

In [34]:
def get_std_station_folder(filepath):
    return os.path.basename(os.path.dirname(filepath))
def get_std_datetime(filename):
    return datetime.strptime(filename[1:16], '%y%m%d_%H_%M_%S')

In [None]:
header_df = pd.DataFrame(columns=['station', 'stfolder', 'date', 'datetime', 'file', 'columns', 'nrows', 'filepath'])
for file in std_files:
    std_df = pd.read_csv(file, sep=' ')
    filename = os.path.split(file)[1]
    columns_list = list(std_df)
    header_df = header_df.append(
        {
            'file': filename, 
            'columns':''.join(map(str, columns_list)),
            'nrows': len(std_df.index),
            'datetime': get_std_datetime(filename),
            'date': get_std_datetime(filename).date(),
            'stfolder': get_std_station_folder(file),
            'filepath': file
        }, ignore_index=True)

header_df['columns'].unique()


array(['Date_lTime_lsnr_l', 'Max_SNRday_in_decimal'], dtype=object)

In [114]:
std_files_df = header_df.loc[header_df['columns'] == 'Max_SNRday_in_decimal'].copy()
std_files_df

Unnamed: 0,station,stfolder,date,datetime,file,columns,nrows,filepath
134,,GI_01,2021-06-19,2021-06-19 03:40:00,1210619_03_40_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,22,C:\workspace\tomo\tomo-etl\data\historical_all...
135,,GI_01,2021-06-19,2021-06-19 03:50:00,1210619_03_50_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,23,C:\workspace\tomo\tomo-etl\data\historical_all...
136,,GI_01,2021-06-19,2021-06-19 04:00:00,1210619_04_00_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,24,C:\workspace\tomo\tomo-etl\data\historical_all...
137,,GI_01,2021-06-19,2021-06-19 04:10:00,1210619_04_10_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,25,C:\workspace\tomo\tomo-etl\data\historical_all...
138,,GI_01,2021-06-19,2021-06-19 04:20:00,1210619_04_20_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,26,C:\workspace\tomo\tomo-etl\data\historical_all...
...,...,...,...,...,...,...,...,...
2599,,GI_04,2021-06-24,2021-06-24 13:40:00,4210624_13_40_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,23,C:\workspace\tomo\tomo-etl\data\historical_all...
2600,,GI_04,2021-06-24,2021-06-24 13:50:00,4210624_13_50_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,23,C:\workspace\tomo\tomo-etl\data\historical_all...
2601,,GI_04,2021-06-24,2021-06-24 14:00:00,4210624_14_00_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,24,C:\workspace\tomo\tomo-etl\data\historical_all...
2602,,GI_04,2021-06-24,2021-06-24 14:10:00,4210624_14_10_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,24,C:\workspace\tomo\tomo-etl\data\historical_all...


In [None]:
std_files_df = header_df.loc[header_df['columns'] == 'Max_SNRday_in_decimal'].copy()
std_files_df

Get all maximum number of rows per day as the main **daily aggregated data**

In [115]:
idx = std_files_df.groupby(["stfolder", "date"])['nrows'].transform(max) == std_files_df['nrows']
std_files_df[idx]

Unnamed: 0,station,stfolder,date,datetime,file,columns,nrows,filepath
252,,GI_01,2021-06-19,2021-06-19 23:40:00,1210619_23_40_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,140,C:\workspace\tomo\tomo-etl\data\historical_all...
356,,GI_01,2021-06-20,2021-06-20 17:30:00,1210620_17_30_00to1210620_00_00_00_std.csv,Max_SNRday_in_decimal,106,C:\workspace\tomo\tomo-etl\data\historical_all...
438,,GI_01,2021-06-21,2021-06-21 23:40:00,1210621_23_40_00to1210621_09_50_00_std.csv,Max_SNRday_in_decimal,83,C:\workspace\tomo\tomo-etl\data\historical_all...
522,,GI_01,2021-06-22,2021-06-22 14:00:00,1210622_14_00_00to1210622_00_00_00_std.csv,Max_SNRday_in_decimal,85,C:\workspace\tomo\tomo-etl\data\historical_all...
569,,GI_01,2021-06-23,2021-06-23 22:40:00,1210623_22_40_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
570,,GI_01,2021-06-23,2021-06-23 22:50:00,1210623_22_50_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
571,,GI_01,2021-06-23,2021-06-23 23:00:00,1210623_23_00_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
572,,GI_01,2021-06-23,2021-06-23 23:10:00,1210623_23_10_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
573,,GI_01,2021-06-23,2021-06-23 23:20:00,1210623_23_20_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
574,,GI_01,2021-06-23,2021-06-23 23:30:00,1210623_23_30_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...


In [116]:
# This shows max() of each column which return the largest nrows but it also return the 
# "latest" file name which may not be the one with largest nrows. Use this to get the latest data per day. 
std_files_df = std_files_df.groupby(["stfolder", "date"], as_index=False).max()
std_files_df

Unnamed: 0,stfolder,date,station,datetime,file,columns,nrows,filepath
0,GI_01,2021-06-19,,2021-06-19 23:40:00,1210619_23_40_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,140,C:\workspace\tomo\tomo-etl\data\historical_all...
1,GI_01,2021-06-20,,2021-06-20 17:30:00,1210620_17_30_00to1210620_00_00_00_std.csv,Max_SNRday_in_decimal,106,C:\workspace\tomo\tomo-etl\data\historical_all...
2,GI_01,2021-06-21,,2021-06-21 23:40:00,1210621_23_40_00to1210621_09_50_00_std.csv,Max_SNRday_in_decimal,83,C:\workspace\tomo\tomo-etl\data\historical_all...
3,GI_01,2021-06-22,,2021-06-22 14:20:00,1210622_14_20_00to1210622_00_00_00_std.csv,Max_SNRday_in_decimal,85,C:\workspace\tomo\tomo-etl\data\historical_all...
4,GI_01,2021-06-23,,2021-06-23 23:40:00,1210623_23_40_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,C:\workspace\tomo\tomo-etl\data\historical_all...
5,GI_01,2021-06-24,,2021-06-24 13:20:00,1210624_13_20_00to1210624_00_00_00_std.csv,Max_SNRday_in_decimal,21,C:\workspace\tomo\tomo-etl\data\historical_all...
6,GI_02,2021-06-21,,2021-06-21 23:40:00,2210621_23_40_00to2210621_10_50_00_std.csv,Max_SNRday_in_decimal,78,C:\workspace\tomo\tomo-etl\data\historical_all...
7,GI_02,2021-06-22,,2021-06-22 23:40:00,2210622_23_40_00to2210622_00_00_00_std.csv,Max_SNRday_in_decimal,84,C:\workspace\tomo\tomo-etl\data\historical_all...
8,GI_02,2021-06-23,,2021-06-23 23:40:00,2210623_23_40_00to2210623_00_00_00_std.csv,Max_SNRday_in_decimal,15,C:\workspace\tomo\tomo-etl\data\historical_all...
9,GI_02,2021-06-24,,2021-06-24 13:00:00,2210624_13_00_00to2210624_00_00_00_std.csv,Max_SNRday_in_decimal,17,C:\workspace\tomo\tomo-etl\data\historical_all...


In [110]:
# Script to check nrows of each file on a given stfolder and date
# std_files_df.loc[(std_files_df['stfolder']=='GI_01') & (std_files_df['date']==pd.to_datetime('2021-06-22').date())]

### Process std file

Test with one of the file as an example

In [118]:
std_file = std_files_df.iloc[3].filepath
std_date = std_files_df.iloc[3].datetime
std_station = get_station_id(stations_df, std_files_df.iloc[3].stfolder)

std_data_df = pd.read_csv(std_file, sep=' ')
std_data_df.head()

Unnamed: 0,Max_SNR,day_in_decimal
0,3.50052,7.212963
1,3.59045,7.2125
2,3.869221,7.21169
3,3.595714,7.211574
4,3.659676,7.211227


NOTE: day_in_decimal is time in UTC parsed as 'x.y' where x is the day in given month (from filename) and y time in 24 hour.
eg: For file '1210623_23_40_00to1210623_00_00_00_std.csv' and day_in_decimal 23.979167, it means 
the data date is 23 June 2021 and the time is 0.979167*24=23.500008 in hour or 23:30:00.0288 

But due to some issues, some early files might have differences between '<x>' and the date from the filename. In that case get day from the file name. 

In [39]:
timestamp = pd.to_datetime(std_date.date()).value + std_data_df['day_in_decimal']%1*86400*1e9
std_data_df['timestamp'] = pd.to_datetime(timestamp, format='%Y-%m-%d %H:%M:%S')
std_data_df['station'] = std_station
std_data_df

Unnamed: 0,Max_SNR,day_in_decimal,timestamp,station
0,3.537569,23.979167,2021-06-23 23:30:00,GI-02
1,3.520687,23.875,2021-06-23 21:00:00,GI-02
2,3.611715,23.854167,2021-06-23 20:30:00,GI-02
3,3.59768,23.847222,2021-06-23 20:20:00,GI-02
4,3.550611,23.75,2021-06-23 18:00:00,GI-02
5,3.569091,23.701389,2021-06-23 16:50:00,GI-02
6,3.614786,23.458333,2021-06-23 11:00:00,GI-02
7,3.648344,23.395833,2021-06-23 09:30:00,GI-02
8,3.58318,23.270833,2021-06-23 06:30:00,GI-02
9,3.543587,23.263889,2021-06-23 06:20:00,GI-02


Insert into Max SNR/STD table

In [44]:
for index, row in std_data_df[['station', 'timestamp', 'Max_SNR']].iterrows():
    cur.execute(max_snr_table_insert, list(row))
    conn.commit()

['GI-02', Timestamp('2021-06-23 23:30:00'), 3.53756913144492]
['GI-02', Timestamp('2021-06-23 21:00:00'), 3.52068690446643]
['GI-02', Timestamp('2021-06-23 20:30:00'), 3.61171494415718]
['GI-02', Timestamp('2021-06-23 20:20:00'), 3.59768004276514]
['GI-02', Timestamp('2021-06-23 18:00:00'), 3.55061148394746]
['GI-02', Timestamp('2021-06-23 16:50:00'), 3.56909080494229]
['GI-02', Timestamp('2021-06-23 11:00:00'), 3.61478573863387]
['GI-02', Timestamp('2021-06-23 09:30:00'), 3.6483442613954]
['GI-02', Timestamp('2021-06-23 06:30:00'), 3.58318012576627]
['GI-02', Timestamp('2021-06-23 06:20:00'), 3.54358732753596]
['GI-02', Timestamp('2021-06-23 05:00:00'), 3.65365883812563]
['GI-02', Timestamp('2021-06-23 04:00:00'), 3.5199921255672]
['GI-02', Timestamp('2021-06-23 02:20:00'), 3.53654817456685]
['GI-02', Timestamp('2021-06-23 00:30:00'), 3.67024881153892]
['GI-02', Timestamp('2021-06-23 00:20:00'), 3.54843547875514]


If all ok then repeat to all the files.

In [46]:
for index, file in std_files_df.iterrows():
    std_file = file.filepath
    std_date = file.datetime
    std_station = get_station_id(stations_df, file.stfolder)
    std_data_df = pd.read_csv(std_file, sep=' ')
    timestamp = pd.to_datetime(std_date.date()).value + std_data_df['day_in_decimal']%1*86400*1e9
    std_data_df['timestamp'] = pd.to_datetime(timestamp, format='%Y-%m-%d %H:%M:%S')
    std_data_df['station'] = std_station
    for index, row in std_data_df[['station', 'timestamp', 'Max_SNR']].iterrows():
        cur.execute(max_snr_table_insert, list(row))
        conn.commit()

# Close connection to db

In [23]:
cur.close()
conn.close()

Error: Already closed