In [None]:
import pandas as pd
import numpy as np
import gc

from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
# Directory that holds the competition CSV files.
ROOT_DIR = Path("../input/amex-default-prediction")

# File names inside ROOT_DIR; combine as `ROOT_DIR / <NAME>` to get a full path.
TRAIN_CSV = Path("train_data.csv")
TRAIN_LABEL_CSV = Path("train_labels.csv")
TEST_CSV = Path("test_data.csv")

In [None]:
# Columns that convert_to_parquet casts to the pandas `category` dtype.
cat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Read only the header (nrows=0) to learn the column names. This avoids keeping
# an open chunked reader around just to peek at the first row.
header_columns = pd.read_csv(ROOT_DIR / TRAIN_CSV, nrows=0).columns

# Every column that is neither categorical, the customer ID, nor the S_2 date
# column is cast to float32 by convert_to_parquet. Building the list from the
# header keeps it in file order, so it is the same on every run.
non_continuous = set(cat) | {'customer_ID', 'S_2'}
cont_vars = [col for col in header_columns if col not in non_continuous]

In [None]:
def convert_to_parquet(input_path, output_path, chunksize = 15000):
    """Stream a CSV file into a single snappy-compressed Parquet file.

    The CSV is read ``chunksize`` rows at a time so the whole file never has to
    be held in memory. Each chunk is re-typed before it is written:

    * columns in the module-level ``cont_vars`` list are cast to ``float32``;
    * columns in the module-level ``cat`` list are cast to ``category``;
    * ``S_2`` is parsed with ``pd.to_datetime``.

    ``customer_ID`` is written unchanged.

    Parameters
    ----------
    input_path : str or path-like
        CSV file to read.
    output_path : str or path-like
        Parquet file to create.
    chunksize : int, default 15000
        Number of CSV rows read (and written) per chunk.

    Notes
    -----
    The Parquet writer is created from the schema of the first chunk. It is
    closed in a ``finally`` block so the file is finalized even when a later
    chunk raises.
    """
    pq_writer = None

    try:
        for idx, df_chunk in enumerate(pd.read_csv(input_path, chunksize=chunksize)):
            print(f"id: {idx} Chunk size {df_chunk.shape}")
            df_chunk[cont_vars] = df_chunk[cont_vars].astype('float32')

            # customer_ID stays as the original hash string. A possible
            # compaction to int64 (last 16 hex digits) is discussed here:
            # https://www.kaggle.com/competitions/amex-default-prediction/discussion/328054
            #df_chunk.customer_ID = df_chunk.customer_ID.apply(lambda x: int(x[-16:],16)).astype('int64')

            # Convert categorical columns to the pandas `category` dtype.
            df_chunk[cat] = df_chunk[cat].astype('category')

            df_chunk['S_2'] = pd.to_datetime(df_chunk['S_2'])

            table = pa.Table.from_pandas(df_chunk)
            if pq_writer is None:
                # Lazily create the writer: the schema is only known once the
                # first chunk has been converted.
                pq_writer = pq.ParquetWriter(output_path, table.schema, compression = 'snappy')

            pq_writer.write_table(table)

            # Drop the current chunk from memory before reading the next one.
            del df_chunk
            del table
            gc.collect()
    finally:
        # Closing the writer finalizes the Parquet file; it must run even if
        # the loop above failed part-way through.
        if pq_writer is not None:
            pq_writer.close()
        gc.collect()
    
    

In [None]:
# Training features: CSV -> ./train.parquet
convert_to_parquet(
    input_path=ROOT_DIR / TRAIN_CSV,
    output_path='./train.parquet',
)

In [None]:
# Test features: CSV -> ./test.parquet.
# The input must be TEST_CSV; reading the training CSV here would make
# test.parquet a copy of the training data.
convert_to_parquet(
    input_path=ROOT_DIR / TEST_CSV,
    output_path='./test.parquet',
)

In [None]:
# Training labels: read the CSV in one piece and store it as Parquet.
train_label_df = pd.read_csv(ROOT_DIR.joinpath(TRAIN_LABEL_CSV))

# customer_ID is kept as the original hash string. Optional int64 compaction
# (last 16 hex digits), currently disabled:
#train_label_df.customer_ID = train_label_df.customer_ID.apply(lambda x: int(x[-16:],16)).astype('int64')
train_label_df.to_parquet('./train_label.parquet')