In [1]:
import json
import glob
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

트랜스포머 구현 참고자료:
- https://www.tensorflow.org/tutorials/text/transformer
- http://jalammar.github.io/illustrated-transformer/

In [2]:
# Collect every raw CSV one folder below ../data/raw_data. The list is sorted
# because glob returns entries in filesystem-dependent order, which would make
# every later step depend on the machine the notebook runs on.
files = sorted(Path(x) for x in glob.glob('../data/raw_data/*/*.csv'))
# files = [Path(x) for x in glob.glob('../../*/*.csv')]
# Run identifier: "<folder name>_<file stem>".
names = [f'{x.parent.name}_{x.stem}' for x in files]
# First 7 characters of the folder name (compared against values such as
# '0.5inch' when subsets are selected further down).
loca_types = [x.parent.name[:7] for x in files]
# Integer column labels of the header-less CSVs that load_data extracts.
feature_cols = [40,46,47]

In [4]:
def load_data(filepath, name, cols=None):
    """Read one header-less CSV and split it into run parameters and series data.

    Parameters
    ----------
    filepath : str or path-like
        CSV file without a header row; every value is cast to float32.
    name : str
        Identifier stored under the ``'name'`` key of the returned parameters.
    cols : sequence of int, optional
        Integer column labels to extract as the series data. When omitted,
        the notebook-level ``feature_cols`` list is used (original behaviour).

    Returns
    -------
    param : dict
        Values of columns 1..8 of the first row, keyed by column number,
        plus ``'name'``.
    df_data : pandas.DataFrame
        The selected columns for all rows, renamed to ``col_<label>``.
    """
    if cols is None:
        # Fall back to the notebook-level selection so existing calls keep working.
        cols = feature_cols

    df = pd.read_csv(filepath, header=None).astype(np.float32)

    # First row, columns at positions 1..8 (position 0 is skipped).
    param = df.iloc[0, 1:9].to_dict()
    param['name'] = name

    df_data = df[list(cols)]
    df_data.columns = [f'col_{i}' for i in df_data.columns]
    return param, df_data

In [7]:
# Keep only the runs whose folder prefix is 0.5, 1.0 or 1.5 inch.
_MAIN_LOCA_TYPES = {'0.5inch', '1.0inch', '1.5inch'}
# Sort before shuffling: the incoming order comes from glob and is
# filesystem-dependent, so a seed alone would not pin the result.
datasets = sorted((f, name) for f, name, loca_type in zip(files, names, loca_types)
                  if loca_type in _MAIN_LOCA_TYPES)
# Private, seeded RNG: the train/val/test membership derived from this order
# is then identical on every re-run and leaves the global NumPy RNG untouched.
np.random.RandomState(42).shuffle(datasets)
print("Number of files: ", len(datasets))

Number of files:  6003


In [8]:
# Fixed-size split of the shuffled file list:
# first 4800 entries -> train, next 600 -> validation, remainder -> test.
_train_end, _val_end = 4800, 5400
trainsets = datasets[:_train_end]
valsets = datasets[_train_end:_val_end]
testsets = datasets[_val_end:]

In [9]:
%time traindata = [load_data(path,name) for path,name in trainsets]
%time valdata = [load_data(path,name) for path,name in valsets]
%time testdata = [load_data(path,name) for path,name in testsets]

CPU times: user 5min 57s, sys: 46.2 s, total: 6min 43s
Wall time: 6min 53s
CPU times: user 42 s, sys: 5.18 s, total: 47.2 s
Wall time: 48.5 s
CPU times: user 42.3 s, sys: 4.81 s, total: 47.1 s
Wall time: 48.4 s


In [10]:
SEQ_LEN = 10000  # number of leading rows kept from every run


def _keep_full_length(pairs, n_rows=SEQ_LEN):
    """Drop runs with fewer than ``n_rows`` rows and cut the rest to their first ``n_rows`` rows."""
    return [(param, df.iloc[:n_rows]) for param, df in pairs if len(df) >= n_rows]


_n_loaded = (len(traindata), len(valdata), len(testdata))
traindata = _keep_full_length(traindata)
valdata = _keep_full_length(valdata)
testdata = _keep_full_length(testdata)
# Short runs are discarded above; report the counts so the loss is visible.
print("Runs kept (train, val, test): ", (len(traindata), len(valdata), len(testdata)),
      " of ", _n_loaded)

In [11]:
def _to_tables(pairs):
    """Turn (param, df_data) pairs into a parameter table and a stacked series table.

    The parameter table has one row per run, is indexed by ``name`` and has its
    columns prefixed with ``param_``. The series table is the plain
    concatenation of all per-run frames, each keeping its own row index.
    """
    param_rows = [param for param, _ in pairs]
    series_frames = [frame for _, frame in pairs]
    features = pd.DataFrame(param_rows).set_index('name').add_prefix("param_")
    labels = pd.concat(series_frames)
    return features, labels


train_features, train_labels = _to_tables(traindata)
val_features, val_labels = _to_tables(valdata)
test_features, test_labels = _to_tables(testdata)

In [18]:
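# Disabled shortcut: reloads the raw *training* tables from the parquet files
# this notebook writes further down (presumably to skip re-parsing the CSVs).
# Only the train split is restored by these two lines.
# train_features = pd.read_parquet('../data/raw.params.train.parquet')
# train_labels = pd.read_parquet('../data/raw.graphs.train.parquet')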

In [19]:
# One min-max scaler per table, both fitted on the training split only;
# the same fitted scalers are applied to every other split afterwards.
feature_scaler, label_scaler = MinMaxScaler(), MinMaxScaler()
feature_scaler.fit(train_features)
label_scaler.fit(train_labels)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [20]:
def apply_scaler(df, scaler):
    """Transform ``df`` with a fitted scaler and return the result as a float32 DataFrame.

    ``scaler.transform`` is called on the frame; its output is wrapped back
    into a DataFrame carrying the index and column labels of ``df``.
    """
    transformed = scaler.transform(df)
    rebuilt = pd.DataFrame(data=transformed, columns=df.columns, index=df.index)
    return rebuilt.astype(np.float32)

In [13]:
# Scale the parameter tables of all three splits with the feature scaler ...
train_features_scaled, val_features_scaled, test_features_scaled = (
    apply_scaler(split, feature_scaler)
    for split in (train_features, val_features, test_features)
)

# ... and the series tables with the label scaler.
train_labels_scaled, val_labels_scaled, test_labels_scaled = (
    apply_scaler(split, label_scaler)
    for split in (train_labels, val_labels, test_labels)
)

In [14]:
# Preview the first five rows of the scaled training parameters.
train_features_scaled.head(n=5)

Unnamed: 0_level_0,param_1,param_2,param_3,param_4,param_5,param_6,param_7,param_8
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0inch-#2000_csv_Run-1978,0.202015,0.202015,0.399885,0.600115,0.620863,0.160431,0.160431,0.0
0.5inch-#2000_csv_Run-1675,0.240198,0.240198,0.642439,0.35756,0.815345,0.179042,0.179042,0.0
1.0inch-#2000_csv_Run-1806,0.286265,0.286265,0.701312,0.298688,0.852294,0.215236,0.215236,0.0
1.5inch-#2000_csv_Run-1891,0.226914,0.226914,0.287934,0.712066,0.498429,0.065139,0.065139,0.0
1.5inch-#2000_csv_Run-478,0.277137,0.277138,0.461141,0.538859,0.67774,0.063758,0.063758,0.0


In [15]:
# Preview the first five rows of the scaled training series.
train_labels_scaled.head(n=5)

Unnamed: 0,col_40,col_46,col_47
0,0.995508,1.0,0.122066
1,0.993052,1.0,0.121121
2,0.990525,1.0,0.121057
3,0.990033,1.0,0.120934
4,0.990525,1.0,0.12094


In [16]:
# Write the scaled tables to parquet, in the order listed.
_scaled_outputs = [
    ('../data/params.train.parquet', train_features_scaled),
    ('../data/params.val.parquet', val_features_scaled),
    ('../data/params.test.parquet', test_features_scaled),
    ('../data/graphs.train.parquet', train_labels_scaled),
    ('../data/graphs.val.parquet', val_labels_scaled),
    ('../data/graphs.test.parquet', test_labels_scaled),
]
for _out_path, _table in _scaled_outputs:
    _table.to_parquet(_out_path)

In [17]:
# Write the unscaled tables to parquet as well, in the order listed.
_raw_outputs = [
    ('../data/raw.params.train.parquet', train_features),
    ('../data/raw.params.val.parquet', val_features),
    ('../data/raw.params.test.parquet', test_features),
    ('../data/raw.graphs.train.parquet', train_labels),
    ('../data/raw.graphs.val.parquet', val_labels),
    ('../data/raw.graphs.test.parquet', test_labels),
]
for _out_path, _table in _raw_outputs:
    _table.to_parquet(_out_path)

In [41]:
# Extra ("ext") set: runs whose folder prefix is none of 0.5/1.0/1.5/2.0 inch.
_EXCLUDED_LOCA_TYPES = {'0.5inch', '1.0inch', '1.5inch', '2.0inch'}
# Sort before shuffling: the incoming order comes from glob and is
# filesystem-dependent, so a seed alone would not pin the result.
extsets = sorted((f, name) for f, name, loca_type in zip(files, names, loca_types)
                 if loca_type not in _EXCLUDED_LOCA_TYPES)
# Private, seeded RNG so the row order of the written ext tables is reproducible.
np.random.RandomState(42).shuffle(extsets)
# The label lists 2.0 inch too, because the filter above excludes it as well.
print("Number of non-0.5,1.0,1.5,2.0 inch data: ", len(extsets))

Number of non-0.5,1.0,1.5 inch data:  404


In [42]:
%time extdata = [load_data(path,name) for path,name in extsets]

CPU times: user 15 s, sys: 2.3 s, total: 17.3 s
Wall time: 17.7 s


In [43]:
# Keep only runs with at least 10000 rows, cut to their first 10000 rows,
# then show how many runs are left.
extdata = [(params, series[:10000]) for params, series in extdata if len(series) >= 10000]
print(len(extdata))

In [45]:
# Parameter table: one row per run, indexed by name, columns prefixed with "param_".
_ext_param_rows = [params for params, _ in extdata]
ext_features = pd.DataFrame(_ext_param_rows).set_index('name').add_prefix("param_")
# Series table: all per-run frames stacked, each keeping its own row index.
_ext_series_frames = [series for _, series in extdata]
ext_labels = pd.concat(_ext_series_frames)

In [46]:
# Reuse the scalers fitted on the training split; they are not refitted on the ext data.
ext_features_scaled, ext_labels_scaled = (
    apply_scaler(ext_features, feature_scaler),
    apply_scaler(ext_labels, label_scaler),
)

In [47]:
# Write the ext tables to parquet: scaled versions first, then the raw ones.
_ext_outputs = [
    ('../data/params.ext.parquet', ext_features_scaled),
    ('../data/graphs.ext.parquet', ext_labels_scaled),
    ('../data/raw.params.ext.parquet', ext_features),
    ('../data/raw.graphs.ext.parquet', ext_labels),
]
for _out_path, _table in _ext_outputs:
    _table.to_parquet(_out_path)