In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from glob import glob

In [2]:
files = glob('multics/data/*.txt')
files

['multics/data/EPS1.txt',
 'multics/data/VS1.txt',
 'multics/data/PS2.txt',
 'multics/data/PS6.txt',
 'multics/data/CP.txt',
 'multics/data/SE.txt',
 'multics/data/TS1.txt',
 'multics/data/PS1.txt',
 'multics/data/PS4.txt',
 'multics/data/documentation.txt',
 'multics/data/profile.txt',
 'multics/data/TS4.txt',
 'multics/data/TS3.txt',
 'multics/data/CE.txt',
 'multics/data/description.txt',
 'multics/data/FS1.txt',
 'multics/data/PS5.txt',
 'multics/data/PS3.txt',
 'multics/data/TS2.txt',
 'multics/data/FS2.txt']

In [3]:
df_dict = dict()
for file in files:
    try:
        df_dict[file.split('.')[0].split('/')[-1]] = pd.read_csv(file, sep='\t', header=None)
    except:
        print(f'fail to load {file}.')

fail to load multics/data/documentation.txt.
fail to load multics/data/description.txt.


In [4]:
df_dict.keys()

dict_keys(['EPS1', 'VS1', 'PS2', 'PS6', 'CP', 'SE', 'TS1', 'PS1', 'PS4', 'profile', 'TS4', 'TS3', 'CE', 'FS1', 'PS5', 'PS3', 'TS2', 'FS2'])

In [5]:
profile = df_dict.pop('profile')
profile.columns = ['Cooler condition', 'Valve condition', 'Internal pump leakage', 'Hydraulic accumulator', 'stable flag']
profile = profile.assign(lot_index=lambda df: np.arange(df.shape[0]))

In [6]:
for key in df_dict.keys():
    df_dict[key].index.name = 'lot index'

In [7]:
profile.head()

Unnamed: 0,Cooler condition,Valve condition,Internal pump leakage,Hydraulic accumulator,stable flag,lot_index
0,3,100,0,130,1,0
1,3,100,0,130,1,1
2,3,100,0,130,1,2
3,3,100,0,130,1,3
4,3,100,0,130,1,4


In [8]:
df_melt_list = []
for key in df_dict.keys():
    df = df_dict[key].assign(lot_index=lambda x: x.index)
    df_melt = pd.melt(df, id_vars='lot_index', var_name='time_stamp').assign(var_name=key)
    df_melt_list.append(df_melt)
df_melt = pd.concat(df_melt_list)

In [9]:
del df_dict

In [10]:
df_all = pd.merge(profile, df_melt, right_on='lot_index', left_on='lot_index')

In [11]:
df_all.head()

Unnamed: 0,Cooler condition,Valve condition,Internal pump leakage,Hydraulic accumulator,stable flag,lot_index,time_stamp,value,var_name
0,3,100,0,130,1,0,0,2411.6,EPS1
1,3,100,0,130,1,0,1,2411.6,EPS1
2,3,100,0,130,1,0,2,2411.6,EPS1
3,3,100,0,130,1,0,3,2411.6,EPS1
4,3,100,0,130,1,0,4,2411.6,EPS1


In [12]:
del profile
del df_melt

In [13]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [15]:
df_all_comp, NAlist = reduce_mem_usage(df_all)

Memory usage of properties dataframe is : 7348.20556640625  MB
******************************
Column:  Cooler condition
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  Valve condition
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  Internal pump leakage
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  Hydraulic accumulator
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  stable flag
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  lot_index
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  value
dtype before:  float64
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory

In [16]:
pd.to_pickle(df_all_comp, 'multics/data/data.pkl')

In [17]:
NAlist

[]