In [None]:
import os
import gc
import pandas as pd
from humanize import naturalsize

In [None]:
data_dir = '/kaggle/input/amex-default-prediction'

# Print every entry of the competition input folder with a human-readable size.
for entry_name in os.listdir(data_dir):
    entry_bytes = os.path.getsize(os.path.join(data_dir, entry_name))
    print('{}: {}'.format(entry_name, naturalsize(entry_bytes)))

In [None]:
%%time

# Baseline read: only the first 1000 rows, with no dtype mapping supplied, so
# pandas chooses the column dtypes itself.
train_df = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', nrows=1000)

In [None]:
# Total bytes held by the sample frame (deep=True also counts object contents),
# shown in human-readable form as the cell output.
baseline_bytes = train_df.memory_usage(deep=True).sum()
naturalsize(baseline_bytes)

In [None]:
all_cols = train_df.columns.to_list()
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Columns stored as pandas categories, plus the set of columns that must not
# be read as half-precision floats ('S_2' is left out of the mapping entirely).
category_like = cat_cols + ['customer_ID']
not_float = set(category_like) | {'S_2'}

# Every remaining column is read as float16 ...
dtype = {}
for name in all_cols:
    if name not in not_float:
        dtype[name] = 'float16'

# ... and the categorical columns are added afterwards.
dtype.update(dict.fromkeys(category_like, 'category'))
    

In [None]:
%%time

train_df = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', nrows=1000, dtype=dtype)
train_df.S_2 = pd.to_datetime(train_df.S_2)

In [None]:
# Footprint of the typed sample frame (deep=True also counts object contents),
# shown in human-readable form as the cell output.
typed_bytes = train_df.memory_usage(deep=True).sum()
naturalsize(typed_bytes)

In [None]:
# Drop the reference to the sample frame and run a garbage-collection pass so
# its memory can be reclaimed; the value returned by gc.collect() is the cell
# output.
del train_df
gc.collect()

# Train data

In [None]:
def process(df):
    """Parse the ``S_2`` column of ``df`` into datetimes.

    The converted column is written back onto ``df`` itself, so the frame
    passed in is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``S_2`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``S_2`` converted.
    """
    parsed_dates = pd.to_datetime(df['S_2'])
    df['S_2'] = parsed_dates
    return df

In [None]:
%%time

FILE = '/kaggle/input/amex-default-prediction/train_data.csv'
CHUNKSIZE = 400000
train_df = pd.DataFrame([])

with pd.read_csv(FILE, chunksize=CHUNKSIZE, dtype=dtype) as reader:
    for i, chunk in enumerate(reader):
        print('processing chunk {}'.format(i + 1))
        train_df = train_df.append(process(chunk), ignore_index=True)

In [None]:
# Footprint of the full training frame (deep=True also counts object
# contents), shown in human-readable form as the cell output.
train_bytes = train_df.memory_usage(deep=True).sum()
naturalsize(train_bytes)

In [None]:
%%time

# Write the assembled training frame to 'train_data.pkl'; the path is
# relative, so the file lands in the current working directory.
train_df.to_pickle('train_data.pkl')

In [None]:
# Drop the reference to the training frame and run a garbage-collection pass
# so its memory can be reclaimed; the value returned by gc.collect() is the
# cell output.
del train_df
gc.collect()

# Test data

In [None]:
%%time

FILE = '/kaggle/input/amex-default-prediction/test_data.csv'
CHUNKSIZE = 400000
test_df = pd.DataFrame([])

with pd.read_csv(FILE, chunksize=CHUNKSIZE, dtype=dtype) as reader:
    for i, chunk in enumerate(reader):
        print('processing chunk {}'.format(i + 1))
        test_df = test_df.append(process(chunk), ignore_index=True)

In [None]:
# Footprint of the full test frame (deep=True also counts object contents),
# shown in human-readable form as the cell output.
test_bytes = test_df.memory_usage(deep=True).sum()
naturalsize(test_bytes)

In [None]:
%%time

# Write the assembled test frame to 'test_data.pkl'; the path is relative, so
# the file lands in the current working directory.
test_df.to_pickle('test_data.pkl')

In [None]:
# Drop the reference to the test frame and run a garbage-collection pass so
# its memory can be reclaimed; the value returned by gc.collect() is the cell
# output.
del test_df
gc.collect()