In [25]:
import os
import pandas as pd

dtype_mapping = {f"winner_card_{i}_id": "int32" for i in range(1, 9)}
dtype_mapping.update({f"loser_card_{i}_id": "int32" for i in range(1, 9)})
dtype_mapping.update({f"winner_card_{i}_level": "int8" for i in range(1, 9)})
dtype_mapping.update({f"loser_card_{i}_level": "int8" for i in range(1, 9)})
dtype_mapping.update({
    "winner_tower_card_id": "int32",
    "winner_tower_card_level": "int8",
    "loser_tower_card_id": "int32",
    "loser_tower_card_level": "int8",
})


# Get the list of CSV files in the directory
csv_files = [file for file in os.listdir('./clash_battles') if file.endswith('.csv')]

# Read and concatenate the CSV files
dfs = []
for file in csv_files:
    df = pd.read_csv(os.path.join('./clash_battles', file), dtype=dtype_mapping)
    dfs.append(df)
concatenated_df = pd.concat(dfs)
concatenated_df.drop_duplicates(inplace=True)
concatenated_df["winner"] = concatenated_df["winner"].str.strip('#').astype("string")
concatenated_df["loser"] = concatenated_df["loser"].str.strip('#').astype("string")
concatenated_df["battleTime"] = pd.to_datetime(concatenated_df["battleTime"], format="%Y%m%dT%H%M%S.%fZ")
concatenated_df["battleType"] = concatenated_df["battleType"].astype("category")
concatenated_df["battleMode"] = concatenated_df["battleMode"].astype("category")

# Save the concatenated DataFrame as a zstd compressed Parquet file
concatenated_df.to_parquet('full_clash_battles_zstd.parquet', compression="zstd")

In [26]:
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2299510 entries, 0 to 80320
Data columns (total 42 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   battleTime               datetime64[ns]
 1   battleType               category      
 2   battleMode               category      
 3   arena                    int64         
 4   winner                   string        
 5   loser                    string        
 6   winner_card_1_id         int32         
 7   winner_card_1_level      int8          
 8   winner_card_2_id         int32         
 9   winner_card_2_level      int8          
 10  winner_card_3_id         int32         
 11  winner_card_3_level      int8          
 12  winner_card_4_id         int32         
 13  winner_card_4_level      int8          
 14  winner_card_5_id         int32         
 15  winner_card_5_level      int8          
 16  winner_card_6_id         int32         
 17  winner_card_6_level      int8     