**By the end of this notebook, you should be able to read the train and test data in ~45 seconds**

The various steps followed in this kernel are given below:

    1. Read the csv files by explicitly specifying the required column datatype.
    2. The string `customer_ID` column takes a lot of memory, so it is replaced by an integer `new_customer_id` column to reduce memory usage.
    3. The processed data is saved in feather format.
    4. For quick future use, the data is uploaded as a private Kaggle dataset using the Kaggle API.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import gc

# Use fully-qualified option names. The short forms "max_rows" / "max_columns"
# are treated as regex patterns and, on pandas >= 1.4, also match
# "styler.render.max_rows" / "styler.render.max_columns", which makes
# pd.set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", None)

# Location of the competition's raw CSV files.
input_path = Path('/kaggle/input/amex-default-prediction/')

In [None]:
# Output directory for the processed feather files. Kept as a plain string
# (with trailing slash) because later cells build file paths by string
# concatenation and interpolate it into shell commands.
DATA_DIR = '../working/data/'

# Pure-Python equivalent of `mkdir -p`: creates missing parents and is a
# no-op when the directory already exists, so the cell is safe to re-run.
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Columns that later cells cast to `object` dtype. They are deliberately absent
# from `col_dtypes`, so read_csv infers their dtype.
cat_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# Every column that gets an explicit dtype when the CSVs are read, listed in
# the order the entries appear in the resulting dict.
_typed_cols = [
    'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43',
    'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6',
    'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53',
    'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56',
    'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11',
    'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13',
    'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4',
    'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79',
    'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17',
    'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18',
    'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20',
    'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94',
    'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103',
    'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108', 'D_109',
    'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119',
    'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42',
    'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138',
    'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145',
]

# The single column read as int8; every other typed column is float16.
_int8_cols = {'B_31'}

# Mapping passed as `dtype=` to pd.read_csv for both the train and test files.
col_dtypes = {
    name: np.dtype('int8') if name in _int8_cols else np.dtype('float16')
    for name in _typed_cols
}

# processing test data

In [None]:
%%time

test_submission = pd.read_csv(
    input_path / 'sample_submission.csv'
)
TOTAL_TEST_CUSTOMERS = len(test_submission)

In [None]:
# Build a lookup table from each distinct customer_ID string (in order of first
# appearance in the sample submission) to a 0-based integer new_customer_id.
unique_test_ids = test_submission['customer_ID'].unique().tolist()
test_customer_id_map = (
    pd.DataFrame({'customer_ID': unique_test_ids})
    .reset_index()
    .rename(columns={'index': 'new_customer_id'})
)

In [None]:
# Attach new_customer_id to the submission template (merge on the shared
# columns), persist it as feather, and preview the first rows.
test_submission = pd.merge(test_customer_id_map, test_submission)
submission_feather_path = DATA_DIR+'sample_submission.feather'
test_submission.to_feather(submission_feather_path)
test_submission.head()

In [None]:
# The submission template has been written to disk and is no longer needed;
# drop the reference and run a garbage-collection pass.
del test_submission
gc.collect()

In [None]:
%%time

test_tmp = pd.read_csv(
    input_path / 'test_data.csv',
    dtype=col_dtypes,
    chunksize=100000
)

test_data = pd.DataFrame()
for itr, row in enumerate(test_tmp):
    row = test_customer_id_map.merge(row)
    row.drop('customer_ID', axis=1, inplace=True)
    
    test_data = pd.concat((test_data, row), axis=0, sort=False, ignore_index=True)
    print(f"completed itr # {itr}")

del test_tmp
gc.collect()

print(test_data.shape)

In [None]:
# Cast each categorical column to `object` dtype, one column at a time, then
# preview the result.
for cat_col in cat_cols:
    test_data[cat_col] = test_data[cat_col].astype('object')
test_data.head()

In [None]:
# Persist the test artefacts as feather files under DATA_DIR:
# first the customer_ID -> new_customer_id lookup, then the processed rows.
test_id_map_path = DATA_DIR+'test_customer_id_map.feather'
test_data_path = DATA_DIR+"test_data.feather"

test_customer_id_map.to_feather(test_id_map_path)
test_data.to_feather(test_data_path)

In [None]:
# Everything test-related is on disk now; drop both frames and collect garbage
# before the train data is loaded.
del test_data
del test_customer_id_map
gc.collect()

# processing train data

In [None]:
# Load the training labels, reading `target` as int8.
train_labels = pd.read_csv(
    input_path / 'train_labels.csv',
    dtype={'target': np.dtype('int8')},
)

In [None]:
# Map each distinct train customer_ID (in order of first appearance in the
# labels file) to an integer id.
unique_train_ids = train_labels['customer_ID'].unique().tolist()
train_customer_id_map = (
    pd.DataFrame({'customer_ID': unique_train_ids})
    .reset_index()
    .rename(columns={'index': 'new_customer_id'})
)

# Shift the train ids past the test id range (0 .. TOTAL_TEST_CUSTOMERS - 1)
# so the two id ranges do not overlap.
train_customer_id_map['new_customer_id'] += TOTAL_TEST_CUSTOMERS

In [None]:
# Attach new_customer_id to the labels and discard the string customer_ID.
train_labels = (
    train_customer_id_map
    .merge(train_labels)
    .drop(columns='customer_ID')
)

In [None]:
%%time

train_tmp = pd.read_csv(
    input_path / 'train_data.csv',
    dtype=col_dtypes,
    chunksize=100000
)

train_data = pd.DataFrame()
for itr, row in enumerate(train_tmp):
    row = train_customer_id_map.merge(row)
    row.drop('customer_ID', axis=1, inplace=True)
    
    train_data = pd.concat((train_data, row), axis=0, sort=False, ignore_index=True)
    print(f"completed itr # {itr}")

del train_tmp
gc.collect()

print(train_data.shape)

In [None]:
# Cast each categorical column to `object` dtype, one column at a time, then
# preview the result.
for cat_col in cat_cols:
    train_data[cat_col] = train_data[cat_col].astype('object')
train_data.head()

In [None]:
# Persist the train artefacts as feather files under DATA_DIR: the processed
# rows, the customer_ID -> new_customer_id lookup, and the labels.
train_data_path = DATA_DIR+"train_data.feather"
train_id_map_path = DATA_DIR+'train_customer_id_map.feather'
train_labels_path = DATA_DIR+"train_labels.feather"

train_data.to_feather(train_data_path)
train_customer_id_map.to_feather(train_id_map_path)
train_labels.to_feather(train_labels_path)

In [None]:
# All train artefacts are on disk; drop the three frames and collect garbage.
del train_data
del train_customer_id_map
del train_labels
gc.collect()

### Saving the processed files as a private Kaggle dataset so they can be loaded quickly next time

In [None]:
# To use Kaggle's public API you must first authenticate with an API token.
# From the site header, click your user profile picture, then "Account" in the
# dropdown menu. This opens your account settings at
# https://www.kaggle.com/account. Scroll down to the section labelled "API" and
# click "Create New API Token"; this downloads a fresh kaggle.json to your machine.

# Open the downloaded kaggle.json and copy its values into USER_ID and
# USER_SECRET below. This cell then writes them to ~/.kaggle/kaggle.json.
# Do not share or commit the notebook with real values filled in.
USER_ID = 'xxxxx' # REPLACE WITH YOUR OWN USER NAME
USER_SECRET = 'xxxxxxxxxxxxxx' # REPLACE WITH YOUR OWN PRIVATE API TOKEN

import os, json, nbformat, pandas as pd

# expanduser('~') resolves the home directory even when $HOME is unset;
# expandvars('$HOME') would leave the literal text "$HOME" in the path.
KAGGLE_CONFIG_DIR = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(KAGGLE_CONFIG_DIR, exist_ok = True)

kaggle_json_path = os.path.join(KAGGLE_CONFIG_DIR, 'kaggle.json')

# Create the file with owner-only permissions from the start, so the token is
# never readable by other users between the write and a later chmod.
kaggle_json_fd = os.open(
    kaggle_json_path,
    os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
    0o600,
)
with os.fdopen(kaggle_json_fd, 'w') as f:
    json.dump({'username': USER_ID, 'key': USER_SECRET}, f)

# The mode passed to os.open only applies to newly created files; tighten the
# permissions explicitly in case kaggle.json already existed.
os.chmod(kaggle_json_path, 0o600)

### uncomment all the below cells and run them successfully to create your own private dataset

In [None]:
# !kaggle datasets init -p {DATA_DIR}

### Updating the metadata file to have a proper `title` and `id` for future reference

In [None]:
# with open('../working/data/dataset-metadata.json', 'r') as f:
#     metadata = json.load(f)
#     metadata['title'] = 'Amex Competition 2022 Dataset'
#     metadata['id'] = f'{USER_ID}/amex-comp-2022'

# with open('../working/data/dataset-metadata.json', 'w') as f:
#     json.dump(metadata, f)

In [None]:
# !kaggle datasets create -p {DATA_DIR}

NOTE: 
* We can always create multiple intermediate processed files and upload them to the same dataset. This will avoid the pain of unnecessary long runs. 
* We can directly start consuming the datasets on a newly launched kernel by adding the dataset on the top right corner of the kernel page.
* If this is followed effectively, we can successfully create a pipeline of connected notebooks.

### Add the above created data to the existing kernel and run the below cell. The cell will be executed in ~45 seconds!!!

In [None]:
# %%time

# train_data = pd.read_feather('../input/amex-comp-2022/train_data.feather')
# train_labels = pd.read_feather('../input/amex-comp-2022/train_labels.feather')
# test_data = pd.read_feather('../input/amex-comp-2022/test_data.feather')

# print(train_data.shape, test_data.shape)

##############################
## Output
##############################
## (5531451, 190) (11263762, 190)
## CPU times: user 26.3 s, sys: 14.6 s, total: 40.9 s
## Wall time: 45.9 s