# Read Multiple Data Files, Transform Them, and Write the Result to a Single File

In [None]:
# Uncomment the line below to install any required packages
# !pip install tqdm h5py natsort pympler hdf5storage

In [5]:
# Import necessary libraries
import gzip
import os
import pickle as pkl
import warnings
from collections import OrderedDict
from datetime import datetime
from glob import glob

import h5py
import numpy as np
import pandas as pd
import torch
from natsort import natsorted
from pympler import asizeof
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [6]:
# Root folder that holds the raw dataset folders and receives the merged .pkl files.
# Set the CAPSTONE_DATASETS_PATH environment variable to point somewhere else
# without editing the notebook; the original location remains the default.
datasetspath = os.environ.get("CAPSTONE_DATASETS_PATH", "/Users/saransathy/WalshDBA/Capstone")

In [None]:
# Convert the Raw_old.mat file to MAT 7.3 format
#import scipy.io
#import hdf5storage
#import os

# Rename the Folder16 file in Drive & convert to MAT 7.3 format for uniformity
# os.rename(f"{datasetspath}/Drive/Folder16/Raw.mat", f"{datasetspath}/Drive/Folder16/Raw_old.mat")

# Step 1: Load the .mat file using scipy.io
#data = scipy.io.loadmat(f"{datasetspath}/Drive/Folder16/Raw_old.mat")

# Step 2: Remove internal keys added by scipy
#data_clean = {k: v for k, v in data.items() if not k.startswith('__')}

# Step 3: Save as .mat v7.3 (HDF5) file
#hdf5storage.savemat(f"{datasetspath}/Drive/Folder16/Raw.mat", data_clean, format='7.3')

# Step 4: Cleanup the variables
#del data, data_clean

## Read & Load Audi eTron Data Files

In [10]:
# Function to extract data from the h5py file
def extract_group(f, path):
    """Recursively copy the HDF5 node at ``path`` into in-memory Python objects.

    Args:
        f: Open ``h5py.File`` (or any mapping indexable by slash-separated paths).
        path: Path of the node to read, e.g. ``'Raw'``.

    Returns:
        A dict mapping child names to their extracted contents when the node
        is an ``h5py.Group``; otherwise ``np.array`` of the node.
    """
    node = f[path]

    # Leaf node: materialise its contents as a NumPy array.
    if not isinstance(node, h5py.Group):
        return np.array(node)

    # Group: descend into every child, addressing it by its full path.
    children = {}
    for child_name in node.keys():
        children[child_name] = extract_group(f, f"{path}/{child_name}")
    return children
    
# Function to identify series breaks and return the corresponding epoch indices
def series_break(data, timeepoch):
    """Find where ``data`` restarts and pair each restart with an epoch index.

    A break is any position ``i`` with ``data[i - 1] > data[i]``. For every
    break, an epoch cursor moves forward (it never rewinds between breaks) to
    the first ``timeepoch`` entry that is strictly greater than
    ``data[i - 1]``. If the cursor runs off the end of ``timeepoch`` the break
    is not recorded, and neither are any later ones.

    Args:
        data: 1-D sequence of comparable numbers (the caller passes a time
            vector). May be empty.
        timeepoch: 1-D sequence of reference values to match breaks against.

    Returns:
        list[tuple[int, int]]: ``(data_index, epoch_index)`` pairs. The list
        always starts with ``(0, 0)`` and ends with the sentinel
        ``(len(data), len(timeepoch))``, so consecutive pairs delimit
        segments of ``data``.
    """
    n_samples = len(data)
    n_epochs = len(timeepoch)
    idxlist = [(0, 0)]

    # Empty input has no samples to compare; return just the two sentinels
    # instead of failing on data[0].
    if n_samples == 0:
        idxlist.append((0, n_epochs))
        return idxlist

    # Vectorised scan: positions whose predecessor is strictly larger.
    values = np.asarray(data)
    break_positions = np.flatnonzero(values[:-1] > values[1:]) + 1

    ti = 0  # epoch cursor, shared across all breaks
    for pos in break_positions:
        prev = values[pos - 1]
        while ti < n_epochs:
            if prev < timeepoch[ti]:
                idxlist.append((int(pos), ti))
                break
            ti += 1

    idxlist.append((n_samples, n_epochs))
    return idxlist
    
def merge_etron_data_files(path):
    """Merge every ``<path>/*/Raw.mat`` file into one pickled dict of DataFrames.

    For each file (naturally sorted), the ``Raw`` group is read and each of
    the signals ``Curr``, ``Volt``, ``SoC`` and ``Temp`` is split into
    segments with ``series_break``. The matching ``Epoch`` value is added to
    each segment's time vector before it is appended to the merged series.
    The result, ``{signal: DataFrame indexed by its Time<signal> column}``,
    is written to ``{datasetspath}/{path}.pkl``.

    Args:
        path: Sub-folder name under ``datasetspath`` (e.g. ``'Charge'``).

    Returns:
        None. The merged data is written to disk.

    Raises:
        FileNotFoundError: If no ``Raw.mat`` file matches. Continuing would
            overwrite ``<path>.pkl`` with empty frames.
    """
    signal_keys = ['Curr', 'Volt', 'SoC', 'Temp']

    # Process the data files
    data_files = natsorted(glob(f"{datasetspath}/{path}/*/Raw.mat"))
    print(f"Reading and Processing Data files from {datasetspath}/{path}/")
    if not data_files:
        raise FileNotFoundError(
            f"No Raw.mat files found under {datasetspath}/{path}/"
        )

    # One growing list of values and one of timestamps per signal.
    merged = {}
    for key in signal_keys:
        merged[f"Time{key}"] = []
        merged[key] = []

    for each_path in tqdm(data_files):
        # Load the .mat (HDF5) file and pull the whole 'Raw' group into memory
        with h5py.File(each_path, 'r') as file:
            data = extract_group(file, 'Raw')

        epoch = data['Epoch'][0]
        time_epoch = data['TimeEpoch'][0]

        for key in signal_keys:
            timekey = f"Time{key}"
            values = data[key][0]
            times = data[timekey][0]
            idxs = series_break(times, time_epoch)
            # Consecutive index pairs delimit one segment; the segment's time
            # values are shifted by the Epoch entry chosen for its start.
            # NOTE(review): presumably Epoch is Unix time in seconds - confirm.
            for (fdi, fti), (ldi, _) in zip(idxs, idxs[1:]):
                merged[key].extend(values[fdi:ldi])
                merged[timekey].extend(times[fdi:ldi] + epoch[fti])

        # Drop the per-file arrays before loading the next file.
        del data, epoch, time_epoch

    print(f"Merging and Writing data to {datasetspath}/{path}.pkl")
    dfdict = {}
    for key in signal_keys:
        timekey = f"Time{key}"
        df = pd.DataFrame({timekey: merged[timekey], key: merged[key]})
        # Time values are interpreted as seconds when converted to datetimes.
        df[timekey] = pd.to_datetime(df[timekey], unit='s')
        dfdict[key] = df.set_index(timekey)

    pd.to_pickle(dfdict, f"{datasetspath}/{path}.pkl")
    return

In [11]:
# Merge the Audi e-tron recordings; each call writes {datasetspath}/<subset>.pkl
for etron_subset in ('Charge', 'Drive'):
    merge_etron_data_files(etron_subset)

Reading and Processing Data files from /Users/saransathy/WalshDBA/Capstone/Charge/


100%|█████████████████████████████████████████████| 8/8 [00:38<00:00,  4.77s/it]


Merging and Writing data to /Users/saransathy/WalshDBA/Capstone/Charge.pkl
Reading and Processing Data files from /Users/saransathy/WalshDBA/Capstone/Drive/


100%|█████████████████████████████████████████████| 8/8 [00:14<00:00,  1.78s/it]


Merging and Writing data to /Users/saransathy/WalshDBA/Capstone/Drive.pkl


## Read and Load Multivehicle Dataset

In [7]:
# Function to read multivehicle data files
def merge_mvdata_files(path):
    """Merge every ``<path>/data/*.pkl`` sample into one pickled DataFrame.

    Each input file is loaded with ``torch.load`` and indexed as
    ``(timedata, metadata)``: ``timedata`` is sliced by column and
    ``metadata`` is looked up by key. Every row of ``timedata`` becomes one
    output row, and the file's metadata values are repeated on each of them.
    The merged frame is written to ``{datasetspath}/{path}.pkl``.

    Args:
        path: Dataset folder name under ``datasetspath``
            (e.g. ``'battery_dataset1'``).

    Returns:
        None. The merged data is written to disk.

    Raises:
        FileNotFoundError: If no ``.pkl`` file matches. Continuing would
            overwrite ``<path>.pkl`` with an empty frame.
    """
    data_files = natsorted(glob(f"{datasetspath}/{path}/data/*.pkl"))
    print(f"Reading and Processing Data files from {datasetspath}/{path}/")
    if not data_files:
        raise FileNotFoundError(
            f"No .pkl files found under {datasetspath}/{path}/data/"
        )

    # Metadata fields copied from each file, and the signal columns with the
    # position each one occupies in the per-file array.
    metakeys = ['label','mileage','capacity','car','charge_segment']
    ckeys = ['Curr','SoC','Temp','Volt','MaxVolt','MinVolt','MinTemp']
    ckeysmap = {"Volt":0, "Curr":1, "MaxVolt":2, "MinVolt":3, "MinTemp":4, "Temp":5, "SoC":6}

    datadict = OrderedDict((key,[]) for key in metakeys+ckeys)

    for each_path in tqdm(data_files):

        # SECURITY: weights_only=False lets torch.load unpickle arbitrary
        # objects, which can execute code. Only run this on trusted files.
        with open(each_path, 'rb') as file:
            this_pkl_file = torch.load(file, weights_only=False)

        timedata = this_pkl_file[0]
        metadata = this_pkl_file[1]

        # Repeat the metadata once per row actually present in this file, so
        # every column stays the same length whatever the sample size.
        n_rows = len(timedata)
        for key in metakeys:
            datadict[key].extend([metadata[key]] * n_rows)
        for key in ckeys:
            datadict[key].extend(timedata[:, ckeysmap[key]].tolist())

    # Write to pickle file
    print(f"Merging and Writing data to {datasetspath}/{path}.pkl")
    df = pd.DataFrame(datadict)
    df.to_pickle(f"{datasetspath}/{path}.pkl")
    return

In [8]:
# Merge each multi-vehicle battery dataset into its own {datasetspath}/<name>.pkl
for mv_dataset in ('battery_dataset1', 'battery_dataset2', 'battery_dataset3'):
    merge_mvdata_files(mv_dataset)

Reading and Processing Data files from /Users/saransathy/WalshDBA/Capstone/battery_dataset1/


100%|██████████| 629121/629121 [03:21<00:00, 3125.42it/s]


Merging and Writing data to /Users/saransathy/WalshDBA/Capstone/battery_dataset1.pkl
Reading and Processing Data files from /Users/saransathy/WalshDBA/Capstone/battery_dataset2/


100%|██████████| 472829/472829 [02:00<00:00, 3926.66it/s]


Merging and Writing data to /Users/saransathy/WalshDBA/Capstone/battery_dataset2.pkl
Reading and Processing Data files from /Users/saransathy/WalshDBA/Capstone/battery_dataset3/


100%|██████████| 176327/176327 [00:38<00:00, 4593.86it/s]


Merging and Writing data to /Users/saransathy/WalshDBA/Capstone/battery_dataset3.pkl
