In [1]:
import os
import pandas as pd

# Path to the processed_data folder
processed_data_folder = './processed_data(without preprocess)'

# group (sub-folder) name -> list of DataFrames, one per CSV file
grouped_dataframes = {}
# group (sub-folder) name -> list of CSV file names, index-aligned with grouped_dataframes
file_names = {}

# Walk the folder tree; every directory found becomes a group keyed by its base name.
# NOTE(review): os.walk recurses, so nested directories that share a base name
# would overwrite each other's entry -- confirm the folder layout is flat.
for root, dirs, _files in os.walk(processed_data_folder):
    for dir_name in dirs:  # do not shadow the builtin `dir`
        dir_path = os.path.join(root, dir_name)
        # os.listdir returns entries in arbitrary order; sort so the file order
        # (and everything derived from it) is reproducible across machines.
        names = sorted(f for f in os.listdir(dir_path) if f.endswith('.csv'))
        grouped_dataframes[dir_name] = [pd.read_csv(os.path.join(dir_path, name)) for name in names]
        file_names[dir_name] = names

In [2]:
# Display the CSV file names collected for each group
file_names

{'2017-05-12': ['processed_FastCharge_000003_CH39_structure.csv',
  'processed_FastCharge_000003_CH40_structure.csv',
  'processed_FastCharge_000004_CH1_structure.csv',
  'processed_FastCharge_000004_CH2_structure.csv',
  'processed_FastCharge_000004_CH3_structure.csv',
  'processed_FastCharge_000008_CH47_structure.csv',
  'processed_FastCharge_000008_CH48_structure.csv',
  'processed_FastCharge_000013_CH13_structure.csv',
  'processed_FastCharge_000013_CH14_structure.csv',
  'processed_FastCharge_000014_CH23_structure.csv',
  'processed_FastCharge_000014_CH24_structure.csv',
  'processed_FastCharge_000018_CH18_structure.csv',
  'processed_FastCharge_000019_CH29_structure.csv',
  'processed_FastCharge_000019_CH30_structure.csv',
  'processed_FastCharge_000023_CH37_structure.csv',
  'processed_FastCharge_000023_CH38_structure.csv',
  'processed_FastCharge_000025_CH7_structure.csv',
  'processed_FastCharge_000026_CH5_structure.csv',
  'processed_FastCharge_000026_CH6_structure.csv',
  'p

In [3]:
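# Model input: first 5 cycles + most recent 5 cycles (10 cycles in total)
#              VITC features (voltage, current, temperature, discharge capacity; 4 in total)
#              data points corresponding to 20% of a cycle (200 points; each cycle has 1000 data points)
# The number of generated samples is 9 fewer than the total number of cycles
# (e.g. 1~5 + (5~10, ..., 737~742), starting at cycle 5 and running through cycle 737).
# NOTE(review): the function below defaults to sample_frac=0.5 and future_cycles=10, so the
# figures above (20% / 200 points, "9 fewer") may be out of date -- confirm.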
import numpy as np

def preprocess_battery_data(battery_data, future_cycles=10, sample_frac=0.5, random_state=42):
    """Build sliding-window inputs from the first 5 cycles plus 5 consecutive recent cycles.

    For every window start ``s`` in ``range(5, total_cycles - 5 + 1 - future_cycles)``
    the sampled rows of cycles 0..4 and of cycles s..s+4 are stacked along the last axis.

    Parameters
    ----------
    battery_data : pandas.DataFrame
        Must contain ``cycles_interpolated_cycle_index`` and the four feature columns
        (voltage, current, temperature, discharge capacity).
    future_cycles : int
        Number of trailing cycles excluded from the window range.
    sample_frac : float
        Fraction of each cycle's rows kept by ``DataFrame.sample``.
    random_state : int
        Seed passed to ``DataFrame.sample``; the same seed is used for every cycle.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_windows, n_points, 4, 10)`` where ``n_points`` is the number of
        sampled rows per cycle. An empty array of shape ``(0,)`` is returned when no
        window is produced (including for an empty input frame).

    Notes
    -----
    Window generation stops at the first window containing a recent cycle with
    fewer than 200 sampled rows.
    NOTE(review): ``DataFrame.sample`` returns rows in the order they were drawn,
    not in original row order -- confirm the shuffled within-cycle order is intended.
    """
    cycle_col = 'cycles_interpolated_cycle_index'
    feature_cols = ['cycles_interpolated_voltage',
                    'cycles_interpolated_current',
                    'cycles_interpolated_temperature',
                    'cycles_interpolated_discharge_capacity']
    # NOTE(review): threshold is independent of sample_frac -- confirm 200 is still the intended minimum.
    min_points = 200

    # An empty frame has no maximum cycle index, so there is nothing to window.
    if battery_data.empty:
        return np.array([])

    total_cycles = int(battery_data[cycle_col].max()) + 1

    # Sample each cycle exactly once in a single pass over the frame. Sampling is
    # deterministic for a fixed random_state, so this yields the same rows as
    # re-filtering and re-sampling the cycle for every window that contains it.
    sampled_features = {
        cycle: rows.sample(frac=sample_frac, random_state=random_state)[feature_cols].values
        for cycle, rows in battery_data.groupby(cycle_col)
    }
    # Stand-in for a cycle index that has no rows at all (length 0 triggers the break below).
    no_rows = np.empty((0, len(feature_cols)))

    def cycle_features(cycle):
        return sampled_features.get(cycle, no_rows)

    initial_samples = [cycle_features(i) for i in range(5)]

    processed_data = []
    for cycle_start in range(5, total_cycles - 5 + 1 - future_cycles):
        recent_samples = [cycle_features(i) for i in range(cycle_start, cycle_start + 5)]

        if any(len(cycle) < min_points for cycle in recent_samples):
            break

        # (10, n_points, 4) -> (n_points, 4, 10)
        processed_data.append(np.array(initial_samples + recent_samples).transpose(1, 2, 0))

    return np.array(processed_data)

In [4]:
def preprocess_battery_data_for_second_CNN(battery_data, future_cycles=10, sample_frac=0.5, random_state=42):
    """Build sliding-window difference features (cycle 2 minus recent cycles) from discharge rows.

    Only rows whose ``cycles_interpolated_step_type`` equals ``'discharge'`` are used.
    For every window start ``s`` in ``range(5, total_cycles - 5 + 1 - future_cycles)``
    the sampled rows of cycle 2 are subtracted, row position by row position, from
    nothing but themselves for voltage (the recent cycle's voltage is set to 0 first,
    so the voltage channel carries cycle 2's voltage) and from each of cycles s..s+4
    for the other three features.

    Parameters
    ----------
    battery_data : pandas.DataFrame
        Must contain ``cycles_interpolated_step_type``, ``cycles_interpolated_cycle_index``
        and the four feature columns (voltage, current, temperature, discharge capacity).
    future_cycles : int
        Number of trailing cycles excluded from the window range.
    sample_frac : float
        Fraction of each cycle's rows kept by ``DataFrame.sample``.
    random_state : int
        Seed passed to ``DataFrame.sample``; the same seed is used for every cycle.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_windows, n_points, 4, 5)``. An empty array of shape ``(0,)`` is
        returned when no window is produced (including when there are no discharge rows).

    Notes
    -----
    Window generation stops at the first window containing a cycle with fewer than
    200 sampled rows. The subtraction is a pandas index-aligned operation after
    ``reset_index``; if cycle 2 and a recent cycle have different sampled lengths the
    non-overlapping rows become NaN.
    NOTE(review): ``DataFrame.sample`` returns rows in the order they were drawn,
    not in original row order -- confirm the shuffled within-cycle order is intended.
    """
    cycle_col = 'cycles_interpolated_cycle_index'
    voltage_col = 'cycles_interpolated_voltage'
    feature_cols = ['cycles_interpolated_voltage',
                    'cycles_interpolated_current',
                    'cycles_interpolated_temperature',
                    'cycles_interpolated_discharge_capacity']
    # NOTE(review): threshold is independent of sample_frac -- confirm 200 is still the intended minimum.
    min_points = 200

    discharge_data = battery_data[battery_data['cycles_interpolated_step_type'] == 'discharge']
    # Without discharge rows there is no maximum cycle index and nothing to window.
    if discharge_data.empty:
        return np.array([])

    total_cycles = int(discharge_data[cycle_col].max()) + 1

    # Reference cycle (index 2); loop-invariant, so it is built once.
    reference = (discharge_data[discharge_data[cycle_col] == 2]
                 .sample(frac=sample_frac, random_state=random_state)[feature_cols]
                 .reset_index(drop=True))

    # Sample each cycle exactly once in a single pass. Sampling is deterministic for a
    # fixed random_state, so this matches re-sampling the cycle for every window.
    sampled_cycles = {
        cycle: rows.sample(frac=sample_frac, random_state=random_state)[feature_cols].reset_index(drop=True)
        for cycle, rows in discharge_data.groupby(cycle_col)
    }
    # Stand-in for a cycle index that has no discharge rows (length 0 triggers the break below).
    empty_cycle = discharge_data.iloc[0:0][feature_cols].reset_index(drop=True)

    # The difference depends only on the cycle, not on the window, so compute it once per cycle.
    difference_cache = {}

    def difference_from_reference(cycle):
        if cycle not in difference_cache:
            recent = sampled_cycles.get(cycle, empty_cycle).copy()
            recent[voltage_col] = 0  # drop voltage from the difference
            difference_cache[cycle] = (reference - recent).values
        return difference_cache[cycle]

    processed_data = []
    for cycle_start in range(5, total_cycles - 5 + 1 - future_cycles):
        window = range(cycle_start, cycle_start + 5)

        if any(len(sampled_cycles.get(cycle, empty_cycle)) < min_points for cycle in window):
            break

        # (5, n_points, 4) -> (n_points, 4, 5)
        differences = [difference_from_reference(cycle) for cycle in window]
        processed_data.append(np.array(differences).transpose(1, 2, 0))

    return np.array(processed_data)

In [5]:
import numpy as np
import os
from tqdm import tqdm

# Names of the groups to process
selected_groups = ['2017-05-12', '2017-06-30', '2018-04-12']

# file name -> preprocessed array for the first CNN
grouped_processed_data_first_CNN = {}

# Build the first CNN's training inputs from every group except the last one
for group_name in selected_groups[:-1]:
    if group_name not in grouped_dataframes:
        print(f"Group {group_name} not found in the dataset")
        continue
    for file_name, battery_data in tqdm(zip(file_names[group_name], grouped_dataframes[group_name])):
        processed_dataset_first_CNN = preprocess_battery_data(battery_data)
        grouped_processed_data_first_CNN[file_name] = processed_dataset_first_CNN

# Make sure the output folder exists
save_folder = "processed_datasets(without preprocess)/with_filename(full, sampling)/train/first"
os.makedirs(save_folder, exist_ok=True)

# Write one .npy file per source CSV
for file_name, data in grouped_processed_data_first_CNN.items():
    target_path = os.path.join(save_folder, f"{file_name}.npy")
    np.save(target_path, data)

# Report the shape of every saved array
for file_name, data in grouped_processed_data_first_CNN.items():
    print(f"File: {file_name}, Data shape: {data.shape}")

50it [05:14,  6.29s/it]
43it [02:13,  3.11s/it]


File: processed_FastCharge_000003_CH39_structure.csv, Data shape: (724, 500, 4, 10)
File: processed_FastCharge_000003_CH40_structure.csv, Data shape: (685, 500, 4, 10)
File: processed_FastCharge_000004_CH1_structure.csv, Data shape: (645, 500, 4, 10)
File: processed_FastCharge_000004_CH2_structure.csv, Data shape: (964, 500, 4, 10)
File: processed_FastCharge_000004_CH3_structure.csv, Data shape: (1043, 500, 4, 10)
File: processed_FastCharge_000008_CH47_structure.csv, Data shape: (598, 500, 4, 10)
File: processed_FastCharge_000008_CH48_structure.csv, Data shape: (581, 500, 4, 10)
File: processed_FastCharge_000013_CH13_structure.csv, Data shape: (888, 500, 4, 10)
File: processed_FastCharge_000013_CH14_structure.csv, Data shape: (770, 500, 4, 10)
File: processed_FastCharge_000014_CH23_structure.csv, Data shape: (844, 500, 4, 10)
File: processed_FastCharge_000014_CH24_structure.csv, Data shape: (839, 500, 4, 10)
File: processed_FastCharge_000018_CH18_structure.csv, Data shape: (770, 500, 4

In [6]:
# file name -> preprocessed array for the second CNN
grouped_processed_data_second_CNN = {}

# Build the second CNN's training inputs from every group except the last one
for group_name in selected_groups[:-1]:
    if group_name not in grouped_dataframes:
        print(f"Group {group_name} not found in the dataset")
        continue
    for file_name, battery_data in tqdm(zip(file_names[group_name], grouped_dataframes[group_name])):
        processed_dataset_second_CNN = preprocess_battery_data_for_second_CNN(battery_data)
        grouped_processed_data_second_CNN[file_name] = processed_dataset_second_CNN

# Make sure the output folder exists
save_folder = "processed_datasets(without preprocess)/with_filename(full, sampling)/train/second"
os.makedirs(save_folder, exist_ok=True)

# Write one .npy file per source CSV
for file_name, data in grouped_processed_data_second_CNN.items():
    target_path = os.path.join(save_folder, f"{file_name}_second_CNN.npy")
    np.save(target_path, data)

# Report the shape of every saved array
for file_name, data in grouped_processed_data_second_CNN.items():
    print(f"File: {file_name}, Data shape: {data.shape}")

50it [05:45,  6.91s/it]
43it [02:26,  3.40s/it]


File: processed_FastCharge_000003_CH39_structure.csv, Data shape: (724, 500, 4, 5)
File: processed_FastCharge_000003_CH40_structure.csv, Data shape: (685, 500, 4, 5)
File: processed_FastCharge_000004_CH1_structure.csv, Data shape: (645, 500, 4, 5)
File: processed_FastCharge_000004_CH2_structure.csv, Data shape: (964, 500, 4, 5)
File: processed_FastCharge_000004_CH3_structure.csv, Data shape: (1043, 500, 4, 5)
File: processed_FastCharge_000008_CH47_structure.csv, Data shape: (598, 500, 4, 5)
File: processed_FastCharge_000008_CH48_structure.csv, Data shape: (581, 500, 4, 5)
File: processed_FastCharge_000013_CH13_structure.csv, Data shape: (888, 500, 4, 5)
File: processed_FastCharge_000013_CH14_structure.csv, Data shape: (770, 500, 4, 5)
File: processed_FastCharge_000014_CH23_structure.csv, Data shape: (844, 500, 4, 5)
File: processed_FastCharge_000014_CH24_structure.csv, Data shape: (839, 500, 4, 5)
File: processed_FastCharge_000018_CH18_structure.csv, Data shape: (770, 500, 4, 5)
File: 

In [7]:
import numpy as np
import os
from tqdm import tqdm

# Names of the groups to process
selected_groups = ['2017-05-12', '2017-06-30', '2018-04-12']

# file name -> preprocessed test array, one dictionary per CNN
grouped_processed_data_test_first_CNN = {}
grouped_processed_data_test_second_CNN = {}

# The last selected group serves as the test set
test_group_name = selected_groups[-1]

if test_group_name not in grouped_dataframes:
    print(f"Test group {test_group_name} not found in the dataset")
else:
    # Run both preprocessing functions on every battery file of the test group
    for file_name, battery_data in tqdm(zip(file_names[test_group_name], grouped_dataframes[test_group_name]), desc=f"Processing {test_group_name}"):
        processed_dataset_first_CNN = preprocess_battery_data(battery_data)
        processed_dataset_second_CNN = preprocess_battery_data_for_second_CNN(battery_data)
        grouped_processed_data_test_first_CNN[file_name] = processed_dataset_first_CNN
        grouped_processed_data_test_second_CNN[file_name] = processed_dataset_second_CNN

# Make sure both output folders exist
save_folder = "processed_datasets(without preprocess)/with_filename(full, sampling)/"
first_save_folder = save_folder+"test/first"
second_save_folder = save_folder+"test/second"
for folder in (first_save_folder, second_save_folder):
    os.makedirs(folder, exist_ok=True)

# Write one .npy file per source CSV and per CNN
for file_name, data in tqdm(grouped_processed_data_test_first_CNN.items(), desc="Saving first CNN test data"):
    target_path = os.path.join(first_save_folder, f"{file_name}_test_first_CNN.npy")
    np.save(target_path, data)
for file_name, data in tqdm(grouped_processed_data_test_second_CNN.items(), desc="Saving second CNN test data"):
    target_path = os.path.join(second_save_folder, f"{file_name}_test_second_CNN.npy")
    np.save(target_path, data)

# Report the shape of every saved array
for file_name, data in grouped_processed_data_test_first_CNN.items():
    print(f"File: {file_name}, Test data shape for first CNN: {data.shape}")
for file_name, data in grouped_processed_data_test_second_CNN.items():
    print(f"File: {file_name}, Test data shape for second CNN: {data.shape}")

Processing 2018-04-12: 46it [16:52, 22.02s/it]
Saving first CNN test data: 100%|██████████████████████████████████████████████████████| 46/46 [00:05<00:00,  8.40it/s]
Saving second CNN test data: 100%|█████████████████████████████████████████████████████| 46/46 [00:02<00:00, 16.36it/s]

File: processed_FastCharge_000001_CH16_structure.csv, Test data shape for first CNN: (649, 500, 4, 10)
File: processed_FastCharge_000001_CH30_structure.csv, Test data shape for first CNN: (754, 500, 4, 10)
File: processed_FastCharge_000001_CH38_structure.csv, Test data shape for first CNN: (523, 500, 4, 10)
File: processed_FastCharge_000002_CH10_structure.csv, Test data shape for first CNN: (991, 500, 4, 10)
File: processed_FastCharge_000002_CH18_structure.csv, Test data shape for first CNN: (810, 500, 4, 10)
File: processed_FastCharge_000002_CH2_structure.csv, Test data shape for first CNN: (795, 500, 4, 10)
File: processed_FastCharge_000002_CH34_structure.csv, Test data shape for first CNN: (807, 500, 4, 10)
File: processed_FastCharge_000002_CH42_structure.csv, Test data shape for first CNN: (1266, 500, 4, 10)
File: processed_FastCharge_000002_CH47_structure.csv, Test data shape for first CNN: (1917, 500, 4, 10)
File: processed_FastCharge_000002_CH7_structure.csv, Test data shape for




In [8]:
import numpy as np
import os

def create_and_save_target_data(grouped_dataframes, file_names, selected_groups, save_folder, look_ahead_cycles=19):
    """Extract look-ahead discharge-capacity targets per file and save them as .npy files.

    For each file, target ``k`` is the ``cycles_interpolated_discharge_capacity`` value
    of the first row of cycle ``k + look_ahead_cycles``, for
    ``k in range(total_cycles - look_ahead_cycles)``.

    Parameters
    ----------
    grouped_dataframes : dict
        Group name -> list of DataFrames.
    file_names : dict
        Group name -> list of file names, index-aligned with ``grouped_dataframes``.
    selected_groups : sequence of str
        Groups to process; a group missing from ``grouped_dataframes`` is reported
        with a printed message and skipped.
    save_folder : str
        Folder receiving one ``<file_name>_target.npy`` per file; created if missing.
    look_ahead_cycles : int
        Offset between the target's position and the cycle it is read from.

    Returns
    -------
    dict
        File name -> 1-D numpy array of targets (empty for an empty DataFrame).

    Raises
    ------
    IndexError
        If a required cycle index has no rows in a file.
    """
    cycle_col = 'cycles_interpolated_cycle_index'
    capacity_col = 'cycles_interpolated_discharge_capacity'

    # file name -> target array
    grouped_target_data = {}

    for group_name in selected_groups:
        if group_name not in grouped_dataframes:
            print(f"Group {group_name} not found in the dataset")
            continue

        for file_name, battery_data in zip(file_names[group_name], grouped_dataframes[group_name]):
            # An empty frame has no maximum cycle index, hence no targets.
            if battery_data.empty:
                grouped_target_data[file_name] = np.array([])
                continue

            total_cycles = int(battery_data[cycle_col].max()) + 1

            # First row of every cycle in one pass, instead of filtering the whole
            # frame once per cycle. keep='first' keeps the row itself, NaN included.
            first_rows = battery_data.drop_duplicates(subset=cycle_col, keep='first')
            first_capacity = dict(zip(first_rows[cycle_col], first_rows[capacity_col]))

            target_cycles = [cycle_index + look_ahead_cycles
                             for cycle_index in range(total_cycles - look_ahead_cycles)]
            missing = [cycle for cycle in target_cycles if cycle not in first_capacity]
            if missing:
                raise IndexError(
                    f"{file_name}: no rows found for cycle index {missing[0]} "
                    f"({len(missing)} required cycle(s) missing)"
                )

            grouped_target_data[file_name] = np.array([first_capacity[cycle] for cycle in target_cycles])

    # np.save does not create directories; make sure the destination exists.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    for file_name, data in grouped_target_data.items():
        np.save(os.path.join(save_folder, f"{file_name}_target.npy"), data)

    return grouped_target_data

In [9]:
# Destination folder for the training targets
save_folder = "processed_datasets(without preprocess)/with_filename(full, sampling)/target/train"
os.makedirs(save_folder, exist_ok=True)

# Build and save the targets for the training groups (first two selected groups)
train_groups = selected_groups[:2]
grouped_target_data_train = create_and_save_target_data(grouped_dataframes, file_names, train_groups, save_folder)

# Report the shape of every saved target array
for file_name, data in grouped_target_data_train.items():
    print(f"File: {file_name}, Target data shape: {data.shape}")

File: processed_FastCharge_000003_CH39_structure.csv, Target data shape: (724,)
File: processed_FastCharge_000003_CH40_structure.csv, Target data shape: (685,)
File: processed_FastCharge_000004_CH1_structure.csv, Target data shape: (645,)
File: processed_FastCharge_000004_CH2_structure.csv, Target data shape: (964,)
File: processed_FastCharge_000004_CH3_structure.csv, Target data shape: (1043,)
File: processed_FastCharge_000008_CH47_structure.csv, Target data shape: (598,)
File: processed_FastCharge_000008_CH48_structure.csv, Target data shape: (581,)
File: processed_FastCharge_000013_CH13_structure.csv, Target data shape: (888,)
File: processed_FastCharge_000013_CH14_structure.csv, Target data shape: (770,)
File: processed_FastCharge_000014_CH23_structure.csv, Target data shape: (844,)
File: processed_FastCharge_000014_CH24_structure.csv, Target data shape: (839,)
File: processed_FastCharge_000018_CH18_structure.csv, Target data shape: (770,)
File: processed_FastCharge_000019_CH29_str

In [10]:
# Destination folder for the test targets
save_folder = "processed_datasets(without preprocess)/with_filename(full, sampling)/target/test"
os.makedirs(save_folder, exist_ok=True)

# Build and save the targets for the test group (selected groups from index 2 onward).
# Stored under its own name so the training targets held in grouped_target_data_train
# are not overwritten by the test targets.
grouped_target_data_test = create_and_save_target_data(grouped_dataframes, file_names, selected_groups[2:], save_folder)

# Report the shape of every saved target array
for file_name, data in grouped_target_data_test.items():
    print(f"File: {file_name}, Target data shape: {data.shape}")

File: processed_FastCharge_000001_CH16_structure.csv, Target data shape: (649,)
File: processed_FastCharge_000001_CH30_structure.csv, Target data shape: (754,)
File: processed_FastCharge_000001_CH38_structure.csv, Target data shape: (523,)
File: processed_FastCharge_000002_CH10_structure.csv, Target data shape: (991,)
File: processed_FastCharge_000002_CH18_structure.csv, Target data shape: (810,)
File: processed_FastCharge_000002_CH2_structure.csv, Target data shape: (795,)
File: processed_FastCharge_000002_CH34_structure.csv, Target data shape: (807,)
File: processed_FastCharge_000002_CH42_structure.csv, Target data shape: (1266,)
File: processed_FastCharge_000002_CH47_structure.csv, Target data shape: (1917,)
File: processed_FastCharge_000002_CH7_structure.csv, Target data shape: (1028,)
File: processed_FastCharge_000006_CH11_structure.csv, Target data shape: (1045,)
File: processed_FastCharge_000006_CH19_structure.csv, Target data shape: (1021,)
File: processed_FastCharge_000006_CH2