# Prepare

In [1]:
# import libraries
import json
import os
import os.path
import pandas as pd
import psutil

In [2]:
# Dataset root directory. The GHL_DIR environment variable overrides it so the
# notebook can run on another machine; when unset, the original path is used.
DIR = os.environ.get('GHL_DIR', 'E:\\Datasets\\GHL')
# Name of the JSON file (stored inside DIR) that records the column dtypes.
DTYPES_FILE = 'dtypes.json'

In [3]:
# Handle to the current (kernel) process, queried by print_memusage().
proc = psutil.Process(os.getpid())


def print_memusage(prefix=''):
    """Print `prefix` followed by this process's RSS memory in MB (2 decimals)."""
    rss_mb = proc.memory_info().rss / 1024**2  # bytes -> MB
    print(prefix, f'{rss_mb:0.2f} MB')

# Convert data

In [4]:
# There are tags in the dataset that do not relate to sensor or control signals.
# To keep the data realistic, let's drop them.

# There are duplicate and non-integer timestamps.
# To keep the data realistic, let's fix them.

# It is impossible to get values every second for a process like this.
# To keep the data realistic, let's resample to 1 min.

def clear_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Keep sensor/control tags, convert units and downsample to one row per minute.

    Steps:
      1. Select the source columns listed in ``rename_dict`` and rename them.
      2. Subtract 273.0 from the four temperature columns (kelvin -> celsius).
      3. Add an ``attack`` column copied from ``ATTACK`` (0 when it is absent).
      4. Truncate ``time`` to whole seconds, bucket it into minutes
         (``time // 60``) and keep the first row of every bucket.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame. Must contain every key of ``rename_dict``; ``ATTACK`` is
        optional. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame indexed by ``time`` (minute number) with the 12 renamed
        signal columns followed by ``attack``.

    Raises
    ------
    KeyError
        If one of the required source columns is missing from ``df``.
    """
    # source tag -> output column name
    rename_dict = {
        'Time': 'time',
        'limiter.y': 'input_temp_gc',  # treated as if an input temperature sensor exists
        'RT_temperature.T': 'rt_temp_gc',
        'HT_temperature.T': 'ht_temp_gc',
        'C_temperature.T': 'ct_temp_gc',
        'RT_level': 'rt_level_m',
        'HT_level': 'ht_level_m',
        'C_level': 'ct_level_m',
        'inj_valve_act': 'input_flow_state',
        'dir_valve_act': 'supply_flow_state',
        'inv_valve_act': 'return_flow_state',
        'out_valve_act': 'output_flow_state',
        'heater_act': 'heater_state',  # device with its own regulator
    }
    # Tags that are intentionally NOT kept:
    #   'RT_level_ini'        - level in RT at start of input
    #   'dT_rand'             - random fluctuations of input temperature
    #   'dt_rand'             - random fluctuations of relaxing time
    #   'dL_rand'             - random fluctuations of stop level in RT
    #   'limiter1.y'          - resulting relaxing time
    #   'Relaxing.active'     - relax state is a control logic value
    #   'boundary.m_flow_in'  - nominal input flow

    # list(...) instead of dict_keys: an explicit list-like column selection
    clear_df = df[list(rename_dict)].rename(columns=rename_dict)

    # change temperature from kelvin to celsius
    # NOTE(review): the exact offset is 273.15; 273.0 is kept so that already
    # generated train/test files stay reproducible - confirm which is intended.
    zero_celsius = 273.0
    temp_columns = ['input_temp_gc', 'rt_temp_gc', 'ht_temp_gc', 'ct_temp_gc']
    clear_df[temp_columns] = clear_df[temp_columns] - zero_celsius

    # add attack tag
    if 'ATTACK' in df.columns:
        clear_df['attack'] = df['ATTACK']
    else:
        clear_df['attack'] = 0

    # Resample to 1 minute.
    # The previous `.set_index('time').loc[0::60]` stepped over row *positions*,
    # so a single missing second shifted every later sample off the minute
    # boundary. Bucketing by minute and keeping the first row of each bucket
    # gives the same result on gap-free data and stays aligned otherwise.
    # It also covers the duplicate-timestamp clean-up: the first row of a
    # minute is necessarily the first row of its (truncated) second.
    minute = clear_df['time'].astype('int') // 60
    clear_df = (clear_df
                .assign(time=minute)        # replaces 'time' in place, column order kept
                .loc[minute >= 0]           # as before, nothing earlier than time 0 is kept
                .drop_duplicates(subset='time', keep='first')
                .set_index('time')
               )
    return clear_df

In [5]:
def optimize_dtypes(df: pd.DataFrame, dtypes_file=None, n_float_columns: int = 7) -> None:
    """Downcast the columns of ``df`` in place and record the resulting dtypes.

    The first ``n_float_columns`` columns are downcast with
    ``pd.to_numeric(..., downcast='float')``, all remaining columns with
    ``downcast='unsigned'``. If the dtypes file does not exist yet, a JSON
    mapping ``{column name: dtype name}`` is written to it; an existing file
    is never overwritten.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to optimize; modified in place.
    dtypes_file : str, optional
        Path of the JSON dtypes description. Defaults to
        ``os.path.join(DIR, DTYPES_FILE)``.
    n_float_columns : int, default 7
        Number of leading columns that must stay floating point.

    Returns
    -------
    None
    """
    float_columns = list(df.columns[:n_float_columns])  # must be float
    uint_columns = list(df.columns[n_float_columns:])   # these columns can be uint

    # Skip empty groups so a frame with few columns does not trip the assignment.
    if uint_columns:
        df[uint_columns] = df[uint_columns].apply(pd.to_numeric, downcast='unsigned')
    if float_columns:
        df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast='float')

    # saving our dtypes description for further use
    if dtypes_file is None:
        # resolved lazily so the module-level configuration is only needed here
        dtypes_file = os.path.join(DIR, DTYPES_FILE)
    if not os.path.isfile(dtypes_file):
        # we need to create it: {column name: dtype name}, usable as dtype= in pandas.read_csv
        dtypes_dict = {name: dtype.name for name, dtype in df.dtypes.items()}
        with open(dtypes_file, 'w', encoding='utf-8') as f:
            json.dump(dtypes_dict, f)

    return

In [6]:
print_memusage('Before loading')
print()

# Sub-directories of DIR: raw input files and the two output splits.
inp_dir = 'GHL'
train_dir = 'train'
test_dir = 'test'

# os.listdir returns entries in arbitrary order and may contain non-CSV
# entries; sort for a reproducible run and keep only the CSV files.
files = sorted(name for name in os.listdir(os.path.join(DIR, inp_dir))
               if name.lower().endswith('.csv'))

# exist_ok=True replaces the isdir()+mkdir() pair (no check-then-create race).
for out_dir in (train_dir, test_dir):
    os.makedirs(os.path.join(DIR, out_dir), exist_ok=True)

for f in files:
    data = pd.read_csv(os.path.join(DIR, inp_dir, f))
    print_memusage('After reading ' + f)

    data_ = clear_tags(data)
    optimize_dtypes(data_)

    # Files with at least one attack-labelled row go to test, the rest to train.
    split_dir = test_dir if data_['attack'].max() > 0 else train_dir
    data_.to_csv(os.path.join(DIR, split_dir, f))
    print()

Before loading 96.48 MB

After reading 01_Lev_fault_Temp_corr_seed_11_vars_23.csv 132.85 MB

After reading 02_Lev_fault_Temp_corr_seed_17_vars_23.csv 136.04 MB

After reading 03_Lev_fault_Temp_corr_seed_19_vars_23.csv 135.88 MB

After reading 04_Lev_fault_Temp_corr_seed_23_vars_23.csv 136.13 MB

After reading 05_Lev_fault_Temp_corr_seed_27_vars_23.csv 136.93 MB

After reading 06_Lev_fault_Temp_corr_seed_29_vars_23.csv 136.62 MB

After reading 07_Lev_fault_Temp_corr_seed_31_vars_23.csv 136.14 MB

After reading 08_Lev_fault_Temp_corr_seed_33_vars_23.csv 135.73 MB

After reading 09_Lev_fault_Temp_corr_seed_37_vars_23.csv 137.60 MB

After reading 10_Lev_fault_Temp_corr_seed_39_vars_23.csv 136.59 MB

After reading 11_Lev_fault_Temp_corr_seed_41_vars_23.csv 136.94 MB

After reading 12_Lev_fault_Temp_corr_seed_43_vars_23.csv 137.71 MB

After reading 13_Lev_fault_Temp_corr_seed_666_vars_23.csv 135.97 MB

After reading 14_Lev_fault_Temp_corr_seed_47_vars_23.csv 137.23 MB

After reading 15_Lev_f

In [7]:
# `data_` still holds the cleaned frame of the last file handled by the loop above.
data_.info()  # last file is train dataset

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25001 entries, 0 to 25000
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   input_temp_gc      25001 non-null  float32
 1   rt_temp_gc         25001 non-null  float32
 2   ht_temp_gc         25001 non-null  float32
 3   ct_temp_gc         25001 non-null  float32
 4   rt_level_m         25001 non-null  float32
 5   ht_level_m         25001 non-null  float32
 6   ct_level_m         25001 non-null  float32
 7   input_flow_state   25001 non-null  uint8  
 8   supply_flow_state  25001 non-null  uint8  
 9   return_flow_state  25001 non-null  uint8  
 10  output_flow_state  25001 non-null  uint8  
 11  heater_state       25001 non-null  uint8  
 12  attack             25001 non-null  uint8  
dtypes: float32(7), uint8(6)
memory usage: 1.0 MB


In [8]:
# Summary statistics of the same frame, transposed so that each column is one row.
data_.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
input_temp_gc,25001.0,11.605475,7.948647,1.0,4.290588,11.343414,17.714172,30.731842
rt_temp_gc,25001.0,40.180851,12.439198,10.033295,29.534515,41.373596,51.736267,58.136566
ht_temp_gc,25001.0,49.310143,9.839715,10.149994,43.406952,53.073364,57.031525,60.0
ct_temp_gc,25001.0,56.721848,4.493768,0.149994,57.151245,57.243164,57.281342,57.302612
rt_level_m,25001.0,1.762536,0.514036,0.5,1.417623,1.820883,2.179197,2.7
ht_level_m,25001.0,0.434892,0.229787,0.1,0.1,0.6,0.6,0.6
ct_level_m,25001.0,6.846371,3.93999,0.01,3.438636,6.900535,10.287628,13.634927
input_flow_state,25001.0,0.113755,0.31752,0.0,0.0,0.0,0.0,1.0
supply_flow_state,25001.0,0.029239,0.168479,0.0,0.0,0.0,0.0,1.0
return_flow_state,25001.0,0.031559,0.174826,0.0,0.0,0.0,0.0,1.0
