# ETL Process Tomography Data

In [1]:
import os
import re
import glob
import pandas as pd
from datetime import datetime

First, create regular expressions for finding all CSV files whose names follow the formats below.

sincos_file_format = '1210619_04_30_00.csv'
snr_file_format = '1210618_20_40_00_SNR.csv'
stack_file_format = '1210619_04_10_00to1210619_00_00_00.csv'
std_file_format = '1210619_04_20_00to1210619_00_00_00_std.csv'



In [2]:
# Timestamp embedded in every file name: six digits, then three two-digit fields,
# separated by underscores (e.g. 210619_04_30_00).
regex_date      = r'([0-9]{6}_[0-9]{2}_[0-9]{2}_[0-9]{2})'

# Each pattern starts with a one-digit station id and is meant to be used with
# `.match()` on the bare file name. The extension dot is escaped (an unescaped `.`
# matches any character) and the pattern is anchored with `$` so that names with
# extra trailing text such as `....csv.bak` are rejected.
regex_sincos    = re.compile(r'(?P<station>[0-9])' + regex_date + r'\.csv$')
regex_snr       = re.compile(r'(?P<station>[0-9])' + regex_date + r'_SNR\.csv$')
regex_stack     = re.compile(r'(?P<station>[0-9])' + regex_date + 'to' +
                             r'(?P<station2>[0-9])' + regex_date + r'\.csv$')
regex_std       = re.compile(r'(?P<station>[0-9])' + regex_date + 'to' +
                             r'(?P<station2>[0-9])' + regex_date + r'_std\.csv$')

In [3]:
def get_files(filepath, regex):
    """Recursively collect the CSV files under `filepath` whose name matches `regex`.

    Parameters
    ----------
    filepath : str
        Root directory to walk. A missing directory yields an empty result.
    regex : compiled regular expression
        Applied with `.match()` to the bare file name (no directory part).

    Returns
    -------
    list of str
        Absolute paths of the matching files, in a deterministic order
        (directories and files are visited in sorted order).

    Side effects
    ------------
    Prints the number of matching CSV files, then the number of non-matching ones.
    """
    all_files = []
    count_match = 0
    count_not_match = 0

    for root, dirs, _ in os.walk(filepath):
        # Sorting in place makes os.walk descend in a stable order, so positional
        # indexing into the result is reproducible across machines.
        dirs.sort()
        # glob.escape keeps directory names containing [, ], * or ? from being
        # interpreted as glob patterns (which would silently skip their files).
        csv_paths = sorted(glob.glob(os.path.join(glob.escape(root), '*.csv')))
        for csv_path in csv_paths:
            if regex.match(os.path.basename(csv_path)):
                all_files.append(os.path.abspath(csv_path))
                count_match += 1
            else:
                count_not_match += 1

    print(count_match)
    print(count_not_match)
    return all_files

If you want to see every row of a DataFrame, you can raise the `display.max_rows` option as shown below.

In [141]:
# pd.set_option('display.max_rows', 100)

# Max SNR/Standard Deviation

### Get daily STD files from all stations

Get all std files from all stations

In [6]:
# Walk the all-stations folder and keep only the CSV files whose name matches regex_std.
# get_files also prints the number of matching and non-matching CSV files.
std_files = get_files('data/historical_all_stations', regex_std)
# Alternative data root (disabled):
# std_files = get_files('data/historical_data', regex_std)

# Preview the first few matched paths.
std_files[:5]

2604
9052


['C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_30_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_40_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_04_50_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_05_00_00to1210618_00_00_00_std.csv',
 'C:\\workspace\\tomo\\tomo-etl\\data\\historical_all_stations\\GI_01\\1210618_05_10_00to1210618_00_00_00_std.csv']

Check the std files headers

In [7]:
# Read the first matched std file (space-separated) and show its column names.
std_df = pd.read_csv(std_files[0], sep=' ')
std_df.columns

Index(['Date_l', 'Time_l', 'snr_l'], dtype='object')

In [8]:
# Read a second std file to compare its header with the first one.
# NOTE(review): 134 is a hard-coded position in std_files, so this check depends on
# the order in which get_files returned the paths — confirm before reusing on other data.
std_df_2 = pd.read_csv(std_files[134], sep=' ')
std_df_2.columns

Index(['Max_SNR', 'day_in_decimal'], dtype='object')

There are apparently two different types of std CSV files, as we will check further below. 
The ones we are using are those with the `Max_SNR` and `day_in_decimal` headers.

Now we are going to build a DataFrame describing the CSV files so that we can easily query which files to ingest.

In [9]:
def get_station_std(filename):
    """Return the first character of an std file name (the station digit)."""
    station_char = filename[0]
    return station_char

def get_datetime_std(filename):
    """Parse the first timestamp of an std file name into a `datetime`.

    Characters 1..15 of the name (right after the leading station character)
    are read with the format `%y%m%d_%H_%M_%S`.
    """
    timestamp_text = filename[1:16]
    return datetime.strptime(timestamp_text, '%y%m%d_%H_%M_%S')

In [10]:
# Build one summary record per std file (name, joined header, row count, station,
# timestamp, date, path) so that the files to ingest can be queried later.
# Records are collected in a list and turned into a DataFrame once: growing a frame
# with DataFrame.append inside a loop is quadratic, and the method no longer exists
# in pandas 2.x.
header_records = []
for file in std_files:
    std_df = pd.read_csv(file, sep=' ')
    filename = os.path.split(file)[1]
    file_datetime = get_datetime_std(filename)  # parsed once, reused for 'date'
    header_records.append(
        {
            'file': filename,
            # Column names concatenated without a separator; later cells compare
            # against this exact joined string.
            'columns': ''.join(map(str, list(std_df))),
            'nrows': len(std_df.index),
            'station': get_station_std(filename),
            'datetime': file_datetime,
            'date': file_datetime.date(),
            'filepath': file,
        })

# Passing `columns` fixes the column order and keeps the frame well-formed even
# when no files were found.
header_df = pd.DataFrame(
    header_records,
    columns=['file', 'columns', 'nrows', 'station', 'datetime', 'date', 'filepath'])

# Distinct header layouts found across all std files.
header_df['columns'].unique()


array(['Date_lTime_lsnr_l', 'Max_SNRday_in_decimal'], dtype=object)

In [11]:
# Keep only the files whose joined header is `Max_SNR` + `day_in_decimal`;
# `.copy()` detaches the result from header_df.
std_files_df = header_df[header_df['columns'].eq('Max_SNRday_in_decimal')].copy()
std_files_df

Unnamed: 0,file,columns,nrows,station,datetime,date,filepath
134,1210619_03_40_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,22,1,2021-06-19 03:40:00,2021-06-19,C:\workspace\tomo\tomo-etl\data\historical_all...
135,1210619_03_50_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,23,1,2021-06-19 03:50:00,2021-06-19,C:\workspace\tomo\tomo-etl\data\historical_all...
136,1210619_04_00_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,24,1,2021-06-19 04:00:00,2021-06-19,C:\workspace\tomo\tomo-etl\data\historical_all...
137,1210619_04_10_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,25,1,2021-06-19 04:10:00,2021-06-19,C:\workspace\tomo\tomo-etl\data\historical_all...
138,1210619_04_20_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,26,1,2021-06-19 04:20:00,2021-06-19,C:\workspace\tomo\tomo-etl\data\historical_all...
...,...,...,...,...,...,...,...
2599,4210624_13_40_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,23,4,2021-06-24 13:40:00,2021-06-24,C:\workspace\tomo\tomo-etl\data\historical_all...
2600,4210624_13_50_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,23,4,2021-06-24 13:50:00,2021-06-24,C:\workspace\tomo\tomo-etl\data\historical_all...
2601,4210624_14_00_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,24,4,2021-06-24 14:00:00,2021-06-24,C:\workspace\tomo\tomo-etl\data\historical_all...
2602,4210624_14_10_00to4210624_00_00_00_std.csv,Max_SNRday_in_decimal,24,4,2021-06-24 14:10:00,2021-06-24,C:\workspace\tomo\tomo-etl\data\historical_all...


For each station and day, keep the file with the maximum number of rows; it serves as the main daily aggregated data.

In [12]:
# For every (station, date) keep the single file that has the most rows.
# A plain groupby(...).max() takes the maximum of each column independently, so the
# reported file/filepath (lexicographic maximum) is not guaranteed to be the file
# that actually holds the maximum `nrows`. Sorting and keeping the last row of each
# group selects one complete, consistent row instead; ties on `nrows` resolve to the
# latest `datetime`.
std_files_df = (
    std_files_df
    .sort_values(['station', 'date', 'nrows', 'datetime'])
    .drop_duplicates(subset=['station', 'date'], keep='last')
    .reset_index(drop=True)
    # Same column order as the previous groupby output: keys first.
    .loc[:, ['station', 'date', 'file', 'columns', 'nrows', 'datetime', 'filepath']]
)
std_files_df

Unnamed: 0,station,date,file,columns,nrows,datetime,filepath
0,1,2021-06-19,1210619_23_40_00to1210619_00_00_00_std.csv,Max_SNRday_in_decimal,140,2021-06-19 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
1,1,2021-06-20,1210620_17_30_00to1210620_00_00_00_std.csv,Max_SNRday_in_decimal,106,2021-06-20 17:30:00,C:\workspace\tomo\tomo-etl\data\historical_all...
2,1,2021-06-21,1210621_23_40_00to1210621_09_50_00_std.csv,Max_SNRday_in_decimal,83,2021-06-21 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
3,1,2021-06-22,1210622_14_20_00to1210622_00_00_00_std.csv,Max_SNRday_in_decimal,85,2021-06-22 14:20:00,C:\workspace\tomo\tomo-etl\data\historical_all...
4,1,2021-06-23,1210623_23_40_00to1210623_00_00_00_std.csv,Max_SNRday_in_decimal,41,2021-06-23 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
5,1,2021-06-24,1210624_13_20_00to1210624_00_00_00_std.csv,Max_SNRday_in_decimal,21,2021-06-24 13:20:00,C:\workspace\tomo\tomo-etl\data\historical_all...
6,2,2021-06-21,2210621_23_40_00to2210621_10_50_00_std.csv,Max_SNRday_in_decimal,78,2021-06-21 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
7,2,2021-06-22,2210622_23_40_00to2210622_00_00_00_std.csv,Max_SNRday_in_decimal,84,2021-06-22 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
8,2,2021-06-23,2210623_23_40_00to2210623_00_00_00_std.csv,Max_SNRday_in_decimal,15,2021-06-23 23:40:00,C:\workspace\tomo\tomo-etl\data\historical_all...
9,2,2021-06-24,2210624_13_00_00to2210624_00_00_00_std.csv,Max_SNRday_in_decimal,17,2021-06-24 13:00:00,C:\workspace\tomo\tomo-etl\data\historical_all...


### Process std file

In [16]:
# Take the path stored in the first row of the daily file table ...
std_file = std_files_df['filepath'].iloc[0]

# ... and load that space-separated file for inspection.
std_data_df = pd.read_csv(std_file, sep=' ')
std_data_df

Unnamed: 0,Max_SNR,day_in_decimal
0,3.467842,7.094907
1,3.061764,7.094792
2,3.203945,7.094676
3,4.263965,7.094560
4,3.241288,7.094444
...,...,...
135,3.271563,7.054051
136,4.036592,7.053935
137,3.370520,7.053819
138,3.689657,7.053704
