# OUO Prepare EDC Data

Katherine Goode  

This document contains code for preparing the H-CT simulated data to be used for the analysis in the jfPCA-PFI paper.

In [1]:
from datetime import date
# Stamp the notebook with the date of this run (ISO format, YYYY-MM-DD)
print("Last updated:", date.today().isoformat())

Last updated: 2022-01-17


In [2]:
# Load packages
import fdasrsf as fs
import numpy as np
import pandas as pd
import pickle
import random

## Load Data

In [3]:
# Open the archive of extracted curves; `data` is indexed by array name below
data = np.load(file="../data/hct_raw/extracted_curves.npz")

# List the names of the arrays stored in the archive
data.files

['f_h2o',
 'f_rdx',
 'f_hp100',
 'f_hp90',
 'f_hp80',
 'f_hp70',
 'f_hp60',
 'f_hp50',
 'f_hp40',
 'f_hp30',
 'f_hp20',
 'f_hp10',
 'f_acryl',
 'f_teflon',
 'f_slt',
 'f_phen',
 'f_nylon',
 'f_nylatron',
 'f_mg',
 'f_lexan',
 'f_delrin',
 'f_al',
 'time',
 'h2o_lbl',
 'rdx_lbl',
 'hp100_lbl',
 'hp90_lbl',
 'hp80_lbl',
 'hp70_lbl',
 'hp60_lbl',
 'hp50_lbl',
 'hp40_lbl',
 'hp30_lbl',
 'hp20_lbl',
 'hp10_lbl',
 'acryl_lbl',
 'teflon_lbl',
 'slt_lbl',
 'phen_lbl',
 'nylon_lbl',
 'nylatron_lbl',
 'mg_lbl',
 'lexan_lbl',
 'delrin_lbl']

## Clean Data and Train/Test Split

In [4]:
# Report the (rows, columns) shape of the arrays for the five materials used below,
# one shape per line
print(
    *(data[key].shape for key in ("f_h2o", "f_rdx", "f_hp100", "f_hp50", "f_hp10")),
    sep="\n",
)

(128, 674073)
(128, 668298)
(128, 338533)
(128, 157539)
(128, 147851)


In [106]:
# For each material of interest, count the columns (functions) that contain only zeros.
# `.any(axis=0)` is False exactly for an all-zero column, so negate it and count the hits.
h2o_zeros = np.count_nonzero(~data["f_h2o"].any(axis=0))
rdx_zeros = np.count_nonzero(~data["f_rdx"].any(axis=0))
hp100_zeros = np.count_nonzero(~data["f_hp100"].any(axis=0))
hp50_zeros = np.count_nonzero(~data["f_hp50"].any(axis=0))
hp10_zeros = np.count_nonzero(~data["f_hp10"].any(axis=0))

# Per-material counts followed by the overall total
print("H2O: ", h2o_zeros, sep="")
print("RDX: ", rdx_zeros, sep="")
print("HP100: ", hp100_zeros, sep="")
print("HP50: ", hp50_zeros, sep="")
print("HP10: ", hp10_zeros, sep="")
print("Total: ", sum((h2o_zeros, rdx_zeros, hp100_zeros, hp50_zeros, hp10_zeros)), sep="")

H2O: 0
RDX: 0
HP100: 0
HP50: 3015
HP10: 2870
Total: 5885


In [94]:
# Drop the all-zero functions: keep only the columns that have a non-zero entry in
# at least one row (the mask is computed down axis 0 and applied to the column axis)
data_f_hp50_no_zeros = data["f_hp50"][:, np.any(data["f_hp50"] != 0, axis=0)]
data_f_hp10_no_zeros = data["f_hp10"][:, np.any(data["f_hp10"] != 0, axis=0)]

In [95]:
# Build the labelled, train/test-tagged data frame for a single material
def prepare_dfs(data_raw, material, train_prop, seed):
    """Turn one material's raw array into a labelled train/test data frame.

    Parameters
    ----------
    data_raw : 2-D array
        Raw values for one material. It is transposed before use, so each
        column of ``data_raw`` becomes one row of the result.
    material : str
        Label written to the ``'material'`` column of every row.
    train_prop : float
        Fraction of rows drawn at random for the training set.
    seed : int
        ``random_state`` passed to the sampler so the split is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled training rows (in sampled order, tagged ``"train"``)
        followed by the remaining rows (in their original order, tagged
        ``"test"``). Columns are the original value columns, then
        ``'material'``, then ``'dataset'``. The original row labels are kept.
    """
    # Transpose so that each column of the raw array becomes a row
    curves = pd.DataFrame(data_raw.T)
    curves['material'] = material

    # Randomly draw the training rows; whatever was not drawn is the test set
    train_part = curves.sample(frac=train_prop, random_state=seed)
    test_part = curves.drop(index=train_part.index)

    # Tag each part with the name of the split it belongs to
    for split_name, part in (("train", train_part), ("test", test_part)):
        part['dataset'] = split_name

    # Training rows first, test rows after
    return pd.concat([train_part, test_part])

In [96]:
# Build one labelled train/test frame per material, each with an 80/20 split and the
# same seed. hp50 and hp10 use the arrays from which the all-zero functions were removed.
df_h2o = prepare_dfs(data["f_h2o"], material="h2o", train_prop=0.8, seed=2021)
df_exp = prepare_dfs(data["f_rdx"], material="explosive", train_prop=0.8, seed=2021)
df_hp100 = prepare_dfs(data["f_hp100"], material="hp100", train_prop=0.8, seed=2021)
df_hp50 = prepare_dfs(data_f_hp50_no_zeros, material="hp50", train_prop=0.8, seed=2021)
df_hp10 = prepare_dfs(data_f_hp10_no_zeros, material="hp10", train_prop=0.8, seed=2021)

In [97]:
# Stack the per-material frames on top of each other (row labels are kept as they are)
df_hct = pd.concat(
    objs=[df_h2o, df_exp, df_hp100, df_hp50, df_hp10],
    axis="index",
)

# Number the stacked rows 1..N so every row has a unique id
df_hct['id'] = range(1, len(df_hct) + 1)
df_hct

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,material,dataset,id
262644,0.010923,0.011853,0.010923,0.010923,0.010923,0.010923,0.010923,0.010923,0.011690,0.023156,...,0.012973,0.021373,0.016898,0.017258,0.008628,0.015651,0.014590,h2o,train,1
620816,0.009764,0.010560,0.009764,0.009764,0.009764,0.009764,0.009764,0.009764,0.009764,-0.015494,...,0.013476,0.011841,0.023680,0.004918,0.019260,0.009620,0.002493,h2o,train,2
449609,0.011744,0.012373,0.011744,0.011744,0.011744,0.011744,0.011744,0.011744,0.012579,0.018720,...,0.011406,0.011894,0.007976,0.013348,0.005521,0.013274,0.024924,h2o,train,3
184706,0.016043,0.017146,0.016043,0.016043,0.016043,0.016043,0.016043,0.016043,0.016043,0.033334,...,0.012249,0.008025,0.022068,0.013417,0.012072,0.019976,0.013151,h2o,train,4
167802,0.014890,0.015675,0.014890,0.014890,0.014890,0.014890,0.014890,0.014890,0.014890,-0.002147,...,0.017573,0.011615,0.013764,0.015336,0.022390,0.009379,0.012749,h2o,train,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144953,0.009811,0.010419,0.009811,0.009811,0.009811,0.009811,0.009811,0.009811,0.009713,-0.055027,...,0.020907,0.032344,0.025010,-0.000169,0.012652,0.007830,0.014222,hp10,test,1980405
144960,0.009810,0.010385,0.009810,0.009810,0.009810,0.009810,0.009810,0.009810,0.009712,0.009234,...,0.008773,0.024287,0.017653,0.012189,0.007298,0.018862,0.009252,hp10,test,1980406
144970,0.009815,0.010392,0.009815,0.009815,0.009815,0.009815,0.009815,0.009815,0.009717,0.026272,...,0.007851,0.010493,0.025821,0.020036,0.019343,-0.000997,0.015072,hp10,test,1980407
144971,0.009815,0.010374,0.009815,0.009815,0.009815,0.009815,0.009815,0.009815,0.009717,0.028076,...,0.008619,0.009297,0.021775,0.017380,0.022831,0.026201,0.018316,hp10,test,1980408


In [98]:
# Reshape wide -> long: one row per (id, value column). The former column label goes
# into "frequency" and the cell content into "value".
df_hct_melted = pd.melt(
    df_hct,
    id_vars=["id", "dataset", "material"],
    var_name="frequency",
    value_name="value",
)
df_hct_melted

Unnamed: 0,id,dataset,material,frequency,value
0,1,train,h2o,0,0.010923
1,2,train,h2o,0,0.009764
2,3,train,h2o,0,0.011744
3,4,train,h2o,0,0.016043
4,5,train,h2o,0,0.014890
...,...,...,...,...,...
253492347,1980405,test,hp10,127,0.014222
253492348,1980406,test,hp10,127,0.009252
253492349,1980407,test,hp10,127,0.015072
253492350,1980408,test,hp10,127,0.018316


In [99]:
# Attach the stored 'time' values to the long frame.
# The melted frame lists all rows of df_hct for the first value column, then all rows
# for the second, and so on, so each 'time' entry is repeated once per row of df_hct.
freq_norm = data['time']
df_hct_melted['frequency_norm'] = np.repeat(freq_norm, len(df_hct))
df_hct_melted

Unnamed: 0,id,dataset,material,frequency,value,frequency_norm
0,1,train,h2o,0,0.010923,0.0
1,2,train,h2o,0,0.009764,0.0
2,3,train,h2o,0,0.011744,0.0
3,4,train,h2o,0,0.016043,0.0
4,5,train,h2o,0,0.014890,0.0
...,...,...,...,...,...,...
253492347,1980405,test,hp10,127,0.014222,1.0
253492348,1980406,test,hp10,127,0.009252,1.0
253492349,1980407,test,hp10,127,0.015072,1.0
253492350,1980408,test,hp10,127,0.018316,1.0


In [100]:
# Put the columns in their final order (all rows kept, columns selected by name)
df_hct_ordered = df_hct_melted.loc[
    :, ['dataset', 'id', 'material', 'frequency', 'frequency_norm', 'value']
]
df_hct_ordered

Unnamed: 0,dataset,id,material,frequency,frequency_norm,value
0,train,1,h2o,0,0.0,0.010923
1,train,2,h2o,0,0.0,0.009764
2,train,3,h2o,0,0.0,0.011744
3,train,4,h2o,0,0.0,0.016043
4,train,5,h2o,0,0.0,0.014890
...,...,...,...,...,...,...
253492347,test,1980405,hp10,127,1.0,0.014222
253492348,test,1980406,hp10,127,1.0,0.009252
253492349,test,1980407,hp10,127,1.0,0.015072
253492350,test,1980408,hp10,127,1.0,0.018316


In [None]:
# Order the rows by dataset, then by id, then by frequency (all ascending)
df_hct_clean = df_hct_ordered.sort_values(
    ['dataset', 'id', 'frequency'],
    ascending=True,
)
df_hct_clean

## Save Data

In [12]:
# Save the cleaned data as a pickle.
# The file is opened in a `with` block so the handle is flushed and closed as soon as
# the dump finishes (the bare `open(...)` passed to `pickle.dump` was never closed).
with open('../data/hct-clean.pkl', 'wb') as pkl_file:
    pickle.dump(df_hct_clean, pkl_file)