In [3]:
import pandas as pd
import os

load_path = './Dataset/df_processed.parquet'
output_dir = './Dataset/Divide'

# Create the output folder (no-op when it already exists).
os.makedirs(output_dir, exist_ok=True)

print(f"Loading data from {load_path}...")

# Load only the columns needed downstream, excluding rows whose Trigger is 'others'.
# engine='pyarrow' is deliberate: pandas documents that `filters` is applied
# row-wise only with the pyarrow engine. With other engines (e.g. fastparquet)
# the filter merely skips whole row groups/files, so 'others' rows that share a
# row group with other triggers would still be loaded -- and because Trigger is
# not among the selected columns, they could not be removed afterwards.
df_processed = pd.read_parquet(
    load_path,
    engine='pyarrow',
    filters=[('Trigger', '!=', 'others')],
    columns=['day', 'minute', 'HashApp', 'HashFunction', 'invocations'],
)
print("Data loaded successfully.")

# Quick sanity check of what was loaded.
print(f"Data shape: {df_processed.shape}")
print(df_processed.head())

Loading data from ./Dataset/df_processed.parquet...
Data loaded successfully.
Data shape: (890704800, 5)
   day  minute                                            HashApp  \
0    8       1  000143d27de4ac74f2c7d579294aef2950317dfffdaae9...   
1    8       2  000143d27de4ac74f2c7d579294aef2950317dfffdaae9...   
2    8       3  000143d27de4ac74f2c7d579294aef2950317dfffdaae9...   
3    8       4  000143d27de4ac74f2c7d579294aef2950317dfffdaae9...   
4    8       5  000143d27de4ac74f2c7d579294aef2950317dfffdaae9...   

                                        HashFunction  invocations  
0  d9272e4b43e348badb6af384efcc10835704037e1cf1d2...            0  
1  d9272e4b43e348badb6af384efcc10835704037e1cf1d2...            0  
2  d9272e4b43e348badb6af384efcc10835704037e1cf1d2...            0  
3  d9272e4b43e348badb6af384efcc10835704037e1cf1d2...            0  
4  d9272e4b43e348badb6af384efcc10835704037e1cf1d2...            0  


In [4]:
# Split the loaded frame into 4 contiguous, row-ordered Parquet files.
num_parts = 4
total_rows = len(df_processed)
part_size = total_rows // num_parts
for i in range(num_parts):
    part_no = i + 1
    start_idx = part_size * i
    if i == num_parts - 1:
        # The final part absorbs any remainder rows left by the floor division.
        end_idx = total_rows
    else:
        end_idx = part_size * part_no
    df_part = df_processed.iloc[start_idx:end_idx]
    part_path = f"{output_dir}/df_part_{part_no}.parquet"

    # Write this slice as Parquet, without persisting the DataFrame index.
    df_part.to_parquet(part_path, engine='pyarrow', index=False)
    print(f"Saved part {part_no} to {part_path}, shape: {df_part.shape}")

Saved part 1 to ./Dataset/Divide/df_part_1.parquet, shape: (222676200, 5)
Saved part 2 to ./Dataset/Divide/df_part_2.parquet, shape: (222676200, 5)
Saved part 3 to ./Dataset/Divide/df_part_3.parquet, shape: (222676200, 5)
Saved part 4 to ./Dataset/Divide/df_part_4.parquet, shape: (222676200, 5)
