In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow as pa
import pyarrow.parquet as pq
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Loading the first 10k rows of the train file to infer the dtypes of all columns

In [None]:
# Path of the raw training CSV, and the (relative) name of the Parquet file
# that the conversion step writes to.
train_path = '/kaggle/input/amex-default-prediction/train_data.csv'
parquet_file = 'train_red_mem_32.parquet'
# Read only the first 10,000 rows; this small sample is what the dtype
# mapping is derived from, so the full CSV is not loaded here.
train = pd.read_csv(train_path, nrows=10000)

In [None]:
def get_col_dtypes(data):
    """Map every column of ``data`` to a more compact dtype name.

    ``float64`` becomes ``float32``, ``int64`` becomes ``int32`` and
    ``object`` becomes ``category``. Any other dtype is kept, expressed as
    its string name.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    dict
        ``{column name: dtype name}`` with one entry per column.
    """
    # Replacement for each dtype that should be narrowed; anything not listed
    # falls through unchanged.
    narrower = {
        'float64': 'float32',
        'int64': 'int32',
        'object': 'category',
    }
    result = {}
    for column in data.columns:
        current = str(data[column].dtype)
        result[column] = narrower.get(current, current)
    return result

#### Casting all float64 columns to float32, int64 columns to int32, and object columns to category

In [None]:
# Build the column -> compact dtype mapping from the sampled rows in `train`.
col_dtypes = get_col_dtypes(train)

In [None]:
def write_parquet(path, save_loc, col_dtypes, chunk_size=10000):
    """Convert a CSV file to a snappy-compressed Parquet file, chunk by chunk.

    The CSV is read ``chunk_size`` rows at a time, so the whole file is never
    held in memory at once. The Arrow schema is inferred from the first chunk
    and reused for every later chunk, so all chunks are written with a single
    schema.

    Parameters
    ----------
    path : str
        CSV file to read.
    save_loc : str
        Destination of the Parquet file.
    col_dtypes : dict
        Column name -> dtype mapping forwarded to ``pd.read_csv(dtype=...)``.
    chunk_size : int, default 10000
        Number of CSV rows read, converted and written per chunk.

    Returns
    -------
    None
        If the reader yields no chunks, no writer is opened and no file is
        created.
    """
    schema = None
    writer = None
    try:
        for chunk in pd.read_csv(path, dtype=col_dtypes, chunksize=chunk_size):
            if writer is None:
                # First chunk: let Arrow infer the schema, then open the
                # writer with it. The converted table is reused so the first
                # chunk is not converted twice.
                table = pa.Table.from_pandas(chunk)
                schema = table.schema
                writer = pq.ParquetWriter(save_loc, schema, compression='snappy')
            else:
                # Later chunks are coerced to the first chunk's schema.
                # NOTE(review): category columns are built per chunk; this
                # assumes every chunk converts cleanly to the dictionary type
                # inferred from the first one - confirm for the target data.
                table = pa.Table.from_pandas(chunk, schema=schema)
            writer.write_table(table)
    finally:
        # Always close the writer, even if a chunk fails, so the file handle
        # is released and the Parquet file is finalized rather than left to
        # garbage collection.
        if writer is not None:
            writer.close()
    return None

#### Converting the CSV file to a Parquet file

In [None]:
# Time the full CSV -> Parquet conversion. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, so the result is
# not skewed by system clock adjustments the way a time.time() difference
# can be.
start = time.perf_counter()
write_parquet(train_path, parquet_file, col_dtypes)
end = time.perf_counter()
# Split the elapsed seconds into minutes and the remaining seconds.
minutes, seconds = divmod(end-start, 60)
print(f"Time taken for converting csv to parquet: {minutes} minutes and {seconds} seconds....")

In [None]:
# Load the Parquet file named by `parquet_file` back into a DataFrame.
train_new = pd.read_parquet(parquet_file)