In [None]:
# This notebook explains how to convert a CSV file to the Feather file format and demonstrates dtype-downcasting approaches.

In [None]:
# List of imports

import pandas as pd 
import numpy as np
import tqdm
import gc

import os

In [None]:
# Directory that holds the input files. Keep the trailing slash: file names
# may be appended directly to this string when building paths.
PATH = "../input/amex-default-prediction/"

In [None]:
# Print the size of every entry in the input directory in MB
# (bytes / 1024**2, rounded to 3 decimals).
for entry in os.listdir(PATH):
    size_bytes = os.path.getsize(os.path.join(PATH, entry))
    print('{} has size {} mb'.format(entry, round(size_bytes / (1024 * 1024), 3)))

In [None]:
# The same listing through the shell: `ls -lh` prints sizes in human-readable units.
# IPython substitutes {PATH} with the value of the Python variable before running the command.
!ls -lh {PATH}

In [None]:
# Peek at the data: nrows=5 makes pandas parse only the first five rows,
# so the full file is not loaded here.
df_temp = pd.read_csv(os.path.join(PATH, 'train_data.csv'), nrows=5)
df_temp.head()

In [None]:
# Summary of the dtypes pandas inferred for the 5-row sample, before any downcasting.
df_temp.info()

In [None]:
# Downcast the 5-row sample column by column to see which smaller dtypes pandas picks.
# select_dtypes returns sub-frames; only their column labels are used below.
float_cols = df_temp.select_dtypes(include=['float'])
int_cols = df_temp.select_dtypes(include=['int'])
cat_cols = df_temp.select_dtypes(include=['object'])

for col in float_cols.columns:
    # downcast='float' picks the smallest float dtype that can hold the values
    # (pandas goes no lower than float32 here).
    df_temp[col] = pd.to_numeric(df_temp[col], downcast='float')

for col in int_cols.columns:
    # downcast='integer' picks the smallest signed integer dtype that fits the values.
    df_temp[col] = pd.to_numeric(df_temp[col], downcast='integer')

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df_temp.info()

In [None]:
# Explicit dtypes (pandas) to be used while importing the full CSV.
# Numeric columns are read as float16 (B_31 is read as int64).
# D_63 and D_64 are read as category instead of object.
# NOTE(review): float16 carries roughly 3 significant decimal digits and tops out at 65504 —
# confirm that this precision/range is acceptable for every column.

dtypes = {
    'customer_ID': "object",
     'S_2': "object", # Date stored as a string; it can be parsed on import by naming this column in read_csv's parse_dates
     'P_2': 'float16',
     'D_39': 'float16',
     'B_1': 'float16',
     'B_2': 'float16',
     'R_1': 'float16',
     'S_3': 'float16',
     'D_41': 'float16',
     'B_3': 'float16',
     'D_42': 'float16',
     'D_43': 'float16',
     'D_44': 'float16',
     'B_4': 'float16',
     'D_45': 'float16',
     'B_5': 'float16',
     'R_2': 'float16',
     'D_46': 'float16',
     'D_47': 'float16',
     'D_48': 'float16',
     'D_49': 'float16',
     'B_6': 'float16',
     'B_7': 'float16',
     'B_8': 'float16',
     'D_50': 'float16',
     'D_51': 'float16',
     'B_9': 'float16',
     'R_3': 'float16',
     'D_52': 'float16',
     'P_3': 'float16',
     'B_10': 'float16',
     'D_53': 'float16',
     'S_5': 'float16',
     'B_11': 'float16',
     'S_6': 'float16',
     'D_54': 'float16',
     'R_4': 'float16',
     'S_7': 'float16',
     'B_12': 'float16',
     'S_8': 'float16',
     'D_55': 'float16',
     'D_56': 'float16',
     'B_13': 'float16',
     'R_5': 'float16',
     'D_58': 'float16',
     'S_9': 'float16',
     'B_14': 'float16',
     'D_59': 'float16',
     'D_60': 'float16',
     'D_61': 'float16',
     'B_15': 'float16',
     'S_11': 'float16',
     'D_62': 'float16',
     'D_63': 'category', # Define as category datatype
     'D_64': 'category',  # Define as category datatype
     'D_65': 'float16',
     'B_16': 'float16',
     'B_17': 'float16',
     'B_18': 'float16',
     'B_19': 'float16',
     'D_66': 'float16',
     'B_20': 'float16',
     'D_68': 'float16',
     'S_12': 'float16',
     'R_6': 'float16',
     'S_13': 'float16',
     'B_21': 'float16',
     'D_69': 'float16',
     'B_22': 'float16',
     'D_70': 'float16',
     'D_71': 'float16',
     'D_72': 'float16',
     'S_15': 'float16',
     'B_23': 'float16',
     'D_73': 'float16',
     'P_4': 'float16',
     'D_74': 'float16',
     'D_75': 'float16',
     'D_76': 'float16',
     'B_24': 'float16',
     'R_7': 'float16',
     'D_77': 'float16',
     'B_25': 'float16',
     'B_26': 'float16',
     'D_78': 'float16',
     'D_79': 'float16',
     'R_8': 'float16',
     'R_9': 'float16',
     'S_16': 'float16',
     'D_80': 'float16',
     'R_10': 'float16',
     'R_11': 'float16',
     'B_27': 'float16',
     'D_81': 'float16',
     'D_82': 'float16',
     'S_17': 'float16',
     'R_12': 'float16',
     'B_28': 'float16',
     'R_13': 'float16',
     'D_83': 'float16',
     'R_14': 'float16',
     'R_15': 'float16',
     'D_84': 'float16',
     'R_16': 'float16',
     'B_29': 'float16',
     'B_30': 'float16',
     'S_18': 'float16',
     'D_86': 'float16',
     'D_87': 'float16',
     'R_17': 'float16',
     'R_18': 'float16',
     'D_88': 'float16',
     'B_31': 'int64',  # the only integer column in this mapping
     'S_19': 'float16',
     'R_19': 'float16',
     'B_32': 'float16',
     'S_20': 'float16',
     'R_20': 'float16',
     'R_21': 'float16',
     'B_33': 'float16',
     'D_89': 'float16',
     'R_22': 'float16',
     'R_23': 'float16',
     'D_91': 'float16',
     'D_92': 'float16',
     'D_93': 'float16',
     'D_94': 'float16',
     'R_24': 'float16',
     'R_25': 'float16',
     'D_96': 'float16',
     'S_22': 'float16',
     'S_23': 'float16',
     'S_24': 'float16',
     'S_25': 'float16',
     'S_26': 'float16',
     'D_102': 'float16',
     'D_103': 'float16',
     'D_104': 'float16',
     'D_105': 'float16',
     'D_106': 'float16',
     'D_107': 'float16',
     'B_36': 'float16',
     'B_37': 'float16',
     'R_26': 'float16',
     'R_27': 'float16',
     'B_38': 'float16',
     'D_108': 'float16',
     'D_109': 'float16',
     'D_110': 'float16',
     'D_111': 'float16',
     'B_39': 'float16',
     'D_112': 'float16',
     'B_40': 'float16',
     'S_27': 'float16',
     'D_113': 'float16',
     'D_114': 'float16',
     'D_115': 'float16',
     'D_116': 'float16',
     'D_117': 'float16',
     'D_118': 'float16',
     'D_119': 'float16',
     'D_120': 'float16',
     'D_121': 'float16',
     'D_122': 'float16',
     'D_123': 'float16',
     'D_124': 'float16',
     'D_125': 'float16',
     'D_126': 'float16',
     'D_127': 'float16',
     'D_128': 'float16',
     'D_129': 'float16',
     'B_41': 'float16',
     'B_42': 'float16',
     'D_130': 'float16',
     'D_131': 'float16',
     'D_132': 'float16',
     'D_133': 'float16',
     'R_28': 'float16',
     'D_134': 'float16',
     'D_135': 'float16',
     'D_136': 'float16',
     'D_137': 'float16',
     'D_138': 'float16',
     'D_139': 'float16',
     'D_140': 'float16',
     'D_141': 'float16',
     'D_142': 'float16',
     'D_143': 'float16',
     'D_144': 'float16',
     'D_145': 'float16'
}

# Column names in mapping order; passed to read_csv as usecols so only these columns are loaded.
col_names = list(dtypes.keys())

In [None]:
# CSV files (located under PATH) to convert; list more names here to combine several files.
amex_file = ['train_data.csv']

In [None]:
# Read every listed CSV with the explicit dtypes. The frames are collected in a list
# so that several files can be concatenated afterwards.
df_list = []

for file_name in tqdm.tqdm(amex_file):
    df = pd.read_csv(
        os.path.join(PATH, file_name),
        # parse_dates=True only asks pandas to try parsing the index, which left S_2
        # as plain strings; naming the column is what converts it to datetime64.
        parse_dates=['S_2'],
        usecols=col_names,  # load only the columns that have an explicit dtype
        dtype=dtypes,
    )
    df_list.append(df)

In [None]:
# Combine the per-file frames into a single DataFrame.
amex = pd.concat(df_list)

# Release the per-file frames. The loop variable `df` still references the last
# loaded frame, so deleting only the list would keep that data alive in memory.
del df_list, df
gc.collect()

In [None]:
# Write the combined frame to a Feather file in the working directory so other kernels can load it.
# reset_index(drop=True) gives the frame a fresh default index before it is written.
amex.reset_index(drop=True).to_feather('amex.feather')

In [None]:
# List the working directory with human-readable sizes to check the size of the written amex.feather.
!ls -lh 