# Prepare

In [1]:
# import libraries
import json
import os
import os.path
import pandas as pd

In [2]:
# Root directory of the raw dataset; the conversion loop reads every file
# found in its 'GHL' subdirectory with pd.read_csv.
SOURCE = 'Y:\\ZBU\\_Datasets\\GHL\\GHL'  # dataset source dir
# Output root; the 'train' and 'test' subdirectories are created under it.
TARGET = 'E:\\Datasets\\GHL'
# DTYPES_FILE = 'dtypes.json'  # dtypes of columns (saving them is currently disabled)

# Convert data

In [4]:
# There are tags in the dataset that do not relate to sensor or control signals.
# For realism, let's drop them.

# There are duplicate and non-integer timestamps.
# For realism, let's fix that.

# It is impossible to get values every second for a process like this.
# For realism, let's resample to 1 min.

def clear_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Select, rename and downsample the raw signals to one row per minute.

    Steps, in order:

    1. Keep only the tags listed in ``rename_dict`` and rename them.
    2. Cast ``time`` to integer and subtract 273.0 from the four
       temperature columns.
    3. Add an ``attack`` column copied from ``df['ATTACK']`` when that
       column exists, otherwise filled with 0.
    4. Drop rows with a repeated integer timestamp (first one wins), keep
       only the rows whose timestamp is a non-negative multiple of 60 and
       re-index them by ``time // 60``.

    Args:
        df: Raw frame; must contain every key of ``rename_dict`` as a column.

    Returns:
        A new frame indexed by ``time // 60``; ``df`` itself is not modified.

    Raises:
        KeyError: If one of the expected raw columns is missing.
    """
    # raw tag -> output column name; every tag not listed here is dropped
    rename_dict = {
        'Time': 'time',
        'limiter.y': 'input_temp_gc',  # let's assume we have this sensor
        'RT_temperature.T': 'rt_temp_gc',
        'HT_temperature.T': 'ht_temp_gc',
        'C_temperature.T': 'ct_temp_gc',
        'RT_level': 'rt_level_m',
        'HT_level': 'ht_level_m',
        'C_level': 'ct_level_m',
        'inj_valve_act': 'input_flow_state',
        'dir_valve_act': 'supply_flow_state',
        'inv_valve_act': 'return_flow_state',
        'out_valve_act': 'output_flow_state',
        'heater_act': 'heater_state',  # device with its own regulator
        # Tags intentionally left out:
        #   'RT_level_ini'        level in RT at start of input
        #   'dT_rand'             random fluctuations of input temperature
        #   'dt_rand'             random fluctuations of relaxing time
        #   'dL_rand'             random fluctuations of stop level in RT
        #   'limiter1.y'          resulting relaxing time
        #   'Relaxing.active'     relax state is control logic value
        #   'boundary.m_flow_in'  nominal input flow
    }
    # A plain list is the documented column-selection key (a dict view is
    # only accepted incidentally).
    clear_df = df[list(rename_dict)].rename(columns=rename_dict)
    clear_df['time'] = clear_df['time'].astype('int')

    # change temperature from kelvin to celsius
    # NOTE(review): the offset is 273.0, not 273.15; kept as-is so that
    # converted values stay identical to previously produced data.
    zero_celsius = 273.0
    for column in ('input_temp_gc', 'rt_temp_gc', 'ht_temp_gc', 'ct_temp_gc'):
        clear_df[column] = clear_df[column] - zero_celsius

    # add attack tag
    if 'ATTACK' in df.columns:
        clear_df['attack'] = df['ATTACK']
    else:
        clear_df['attack'] = 0

    clear_df = (clear_df
                .drop_duplicates(subset='time', keep='first')
                .set_index('time')
               )
    # Pick rows by timestamp value, not by row position: taking every 60th
    # row drifts off the minute boundary as soon as a single second is
    # missing from the data.
    on_minute = (clear_df.index >= 0) & (clear_df.index % 60 == 0)
    clear_df = clear_df.loc[on_minute].copy()
    clear_df.index = clear_df.index // 60
    return clear_df

In [5]:
def optimize_dtypes(df: pd.DataFrame) -> None:
    """Reduce the memory usage of *df* in place by downcasting its columns.

    The split is positional: the first seven columns are downcast as
    floats, every column after them as unsigned integers. Nothing is
    returned; the frame passed in is modified.

    Saving the resulting dtype mapping to a JSON file (DTYPES_FILE) for
    later use with pandas.read_csv is currently disabled.
    """
    n_float = 7  # number of leading columns that must stay float
    all_columns = df.columns.values

    # (columns, downcast target) pairs, processed in this order
    downcast_plan = (
        (all_columns[n_float:], 'unsigned'),  # these columns can be uint
        (all_columns[:n_float], 'float'),  # the others must be float
    )
    for columns, target in downcast_plan:
        df[columns] = df[columns].apply(pd.to_numeric, downcast=target)

In [6]:
# Convert every raw file under SOURCE/GHL to a snappy-compressed parquet file.
# Files containing at least one attack row go to TARGET/test, the rest to
# TARGET/train.
print_memusage('Before loading')
print()

inp_dir = 'GHL'
train_dir = 'train'
test_dir = 'test'

# os.listdir returns entries in arbitrary order; sort them so the processing
# (and log) order is deterministic.
files = sorted(os.listdir(os.path.join(SOURCE, inp_dir)))

# makedirs(exist_ok=True) also creates TARGET itself when it is missing and
# avoids the check-then-create race of isdir() + mkdir().
for out_dir in (train_dir, test_dir):
    os.makedirs(os.path.join(TARGET, out_dir), exist_ok=True)

for f in files:
    data = pd.read_csv(os.path.join(SOURCE, inp_dir, f))
    print_memusage('After reading ' + f)

    data_ = clear_tags(data)

    # downcasts the columns of data_ in place
    optimize_dtypes(data_)

    # same base name as the input file, with the '.snappy' extension
    f_ = os.path.splitext(f)[0] + '.snappy'
    if data_['attack'].max() > 0:
        data_.to_parquet(os.path.join(TARGET, test_dir, f_), compression='snappy')
    else:
        data_.to_parquet(os.path.join(TARGET, train_dir, f_), compression='snappy')
    print()

Before loading 105.59 MB

After reading 01_Lev_fault_Temp_corr_seed_11_vars_23.csv 142.46 MB

After reading 02_Lev_fault_Temp_corr_seed_17_vars_23.csv 148.28 MB

After reading 03_Lev_fault_Temp_corr_seed_19_vars_23.csv 148.74 MB

After reading 04_Lev_fault_Temp_corr_seed_23_vars_23.csv 149.04 MB

After reading 05_Lev_fault_Temp_corr_seed_27_vars_23.csv 149.13 MB

After reading 06_Lev_fault_Temp_corr_seed_29_vars_23.csv 149.43 MB

After reading 07_Lev_fault_Temp_corr_seed_31_vars_23.csv 149.24 MB

After reading 08_Lev_fault_Temp_corr_seed_33_vars_23.csv 149.55 MB

After reading 09_Lev_fault_Temp_corr_seed_37_vars_23.csv 149.36 MB

After reading 10_Lev_fault_Temp_corr_seed_39_vars_23.csv 149.37 MB

After reading 11_Lev_fault_Temp_corr_seed_41_vars_23.csv 149.73 MB

After reading 12_Lev_fault_Temp_corr_seed_43_vars_23.csv 149.82 MB

After reading 13_Lev_fault_Temp_corr_seed_666_vars_23.csv 149.71 MB

After reading 14_Lev_fault_Temp_corr_seed_47_vars_23.csv 149.70 MB

After reading 15_Lev_