# Convert amex data to feather format, staying under 16GB memory ceiling

This notebook shows how the conversion can be done so that it runs successfully on a default Kaggle instance (which has a maximum of 16 GB of memory).

This is done by reading the csv file in chunks, converting each chunk to more memory-efficient data types, writing each chunk to its own feather file, and later loading all of the chunks into a single data frame and saving that frame as one feather file.

# Imports

In [None]:
import pandas as pd
import numpy as np
import time
import datetime
import gc

# Define data frame conversion
This routine sets all floating point values to half precision (float16), encodes categorical columns as category type, converts the date format to an integer (measuring the number of days since 1/1/2000), and converts the customer_ID to a 64-bit integer (parsed from the hexadecimal characters at position 49 onward of the ID string). I separately verified that no two customer IDs in the data convert to the same 64-bit integer.

In [None]:
def customer_ID_hash(customer_ID):
    """Map each customer ID string to a 64-bit integer.

    Only the characters from position 49 onward of each string are used;
    they are parsed as a hexadecimal number.

    Parameters
    ----------
    customer_ID : iterable of str
        The ID strings to convert.

    Returns
    -------
    numpy.ndarray
        int64 array with one entry per input string, in input order.
    """
    parsed = []
    for id_string in customer_ID:
        hex_tail = id_string[49:]
        parsed.append(int(hex_tail, 16))
    return np.int64(parsed)
def convert_data_frame(df):
    """Return a copy of ``df`` re-encoded with more compact dtypes.

    Conversions applied (each only when the column is present):

    * every float64 column -> float16
    * the listed categorical columns -> pandas ``category``
    * ``B_31`` -> uint8
    * ``S_2`` (date) -> int16 count of whole days since 2000-01-01
    * ``customer_ID`` -> int64 via ``customer_ID_hash``

    All other columns keep their dtype. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and row order as ``df``.
    """
    # Start from "float64 -> float16, everything else unchanged" ...
    conv_d = {
        col: (np.float16 if df[col].dtype == np.float64 else df[col].dtype)
        for col in df.columns
    }

    # ... then override the columns that need a specific encoding.
    categories = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    for col in categories:
        if col in df.columns:
            conv_d[col] = 'category'
    # B_31 is binary, represent as uint8
    if 'B_31' in df.columns:
        conv_d['B_31'] = np.uint8

    dfr = df.astype(conv_d)

    if 'S_2' in df.columns:
        # Use the timedelta accessor rather than reinterpreting the raw
        # values as integers: the result does not depend on the platform's
        # `int` width or on the datetime resolution, and a missing date
        # raises on the int16 cast instead of being silently encoded.
        days_since_2000 = (pd.to_datetime(df['S_2']) - pd.Timestamp(2000, 1, 1)).dt.days
        dfr['S_2'] = days_since_2000.astype('int16')
    if 'customer_ID' in df.columns:
        dfr['customer_ID'] = customer_ID_hash(df['customer_ID'])
    return dfr


# Convert to feather chunks

This code loads a csv file (using chunking), converts each chunk to a feather file and writes it to disk.

In [None]:
# chunk
def convert_and_write_feather_chunks(csvfile,stub='tmp',chunksize=1000000):
    """Stream a CSV file in chunks and write each converted chunk to feather.

    Each chunk is passed through ``convert_data_frame`` and saved as
    ``<stub>NNNNN.feather``, where NNNNN is the zero-padded chunk number
    starting at 0.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    stub : str, default 'tmp'
        Filename prefix for the chunk files.
    chunksize : int, default 1000000
        Number of CSV rows read per chunk.

    Returns
    -------
    int
        Number of the last chunk written, or -1 if the reader produced no
        chunks.
    """
    maxchunk = -1
    with pd.read_csv(csvfile, chunksize=chunksize) as reader:
        for n, chunk in enumerate(reader):
            # Move the row index into an 'index' column so every chunk is
            # written with a fresh default index (presumably because
            # to_feather needs one - confirm against the pandas docs).
            # Done in place on purpose, to avoid holding a second copy of
            # the chunk in memory.
            chunk.reset_index(inplace=True)
            chunk_c = convert_data_frame(chunk)
            fname = '%s%05d.feather' % (stub, n)
            chunk_c.to_feather(fname)
            print('writing chunk', n, fname)
            # Release both frames before the next chunk is read.
            del chunk, chunk_c
            gc.collect()
            maxchunk = n
    return maxchunk

# Load feather chunks
This code loads a series of feather files, and concatenates them into a single data frame

In [None]:
def load_feather_chunks(maxchunk,stub='tmp'):
    """Load chunk files ``<stub>00000.feather`` .. ``<stub>NNNNN.feather``
    and stack them into one data frame.

    Parameters
    ----------
    maxchunk : int
        Number of the last chunk to load (chunks 0..maxchunk are read).
    stub : str, default 'tmp'
        Filename prefix of the chunk files.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in chunk order, with a fresh 0..N-1 index
        and without the per-chunk 'index' column if one is present.

    Raises
    ------
    ValueError
        From ``pd.concat`` when there are no chunks to load
        (``maxchunk < 0``).
    """
    gc.collect()
    chunk_c_list = []
    for n in range(maxchunk + 1):
        fname = '%s%05d.feather' % (stub, n)
        print('loading chunk', n, fname)
        chunk_c_list.append(pd.read_feather(fname))
    # ignore_index gives the combined frame a fresh index directly, so no
    # reset_index pass (and no throwaway 'level_0' column) is needed.
    odf = pd.concat(chunk_c_list, ignore_index=True)
    # Drop the references to the individual chunks before continuing.
    del chunk_c_list
    gc.collect()
    # 'index' is the saved per-chunk row index; tolerate chunk files that
    # were written without it.
    return odf.drop(columns=['index'], errors='ignore')


# Apply code to test_data, train_data
Now, call these routines on the test_data and train_data csv files

In [None]:
# Train set, pass 1: stream train_data.csv through the converter, writing one
# temporary feather file per chunk, then print the wall-clock seconds taken.
# `maxchunk` is kept for the cell that merges the chunks.
tstart = time.time()
maxchunk = convert_and_write_feather_chunks(
    '../input/amex-default-prediction/train_data.csv',
    stub='tmp_train_data',
)
gc.collect()
elapsed_time = time.time() - tstart
print(elapsed_time)

In [None]:
# Train set, pass 2: merge the temporary chunk files into one frame, save it
# as train_data.feather, then print the wall-clock seconds taken.
tstart = time.time()
odf = load_feather_chunks(maxchunk, stub='tmp_train_data')
odf.to_feather('train_data.feather')
# Rebind the name so the large frame is no longer referenced before collecting.
odf = ''
gc.collect()
elapsed_time = time.time() - tstart
print(elapsed_time)

In [None]:
# Test set, pass 1: stream test_data.csv through the converter, writing one
# temporary feather file per chunk. `maxchunk` is kept for the merge cell.
tstart=time.time()
maxchunk=convert_and_write_feather_chunks('../input/amex-default-prediction/test_data.csv',stub='tmp_test_data')
gc.collect()
# Compute this cell's own duration; without this line the print below would
# show whatever value `elapsed_time` held from an earlier cell.
elapsed_time=time.time()-tstart
print(elapsed_time)

In [None]:
# Test set, pass 2: merge the temporary chunk files into one frame, save it
# as test_data.feather, then print the wall-clock seconds taken.
tstart = time.time()
odf = load_feather_chunks(maxchunk, stub='tmp_test_data')
odf.to_feather('test_data.feather')
# Rebind the name so the large frame is no longer referenced before collecting.
odf = ''
gc.collect()
elapsed_time = time.time() - tstart
print(elapsed_time)

# Convert train_labels
This file is small enough that no chunking is needed, so just load the entire file, convert the data frame, and save it as a feather file.

In [None]:
# The labels file is read in one call, converted with the same routine as the
# data files, and written straight to train_labels.feather.
df_train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
df_train_labels_c = convert_data_frame(df_train_labels)
df_train_labels_c.to_feather('train_labels.feather')
# Drop both frames and reclaim their memory.
del df_train_labels, df_train_labels_c
gc.collect()

# Clean up
Delete temporary chunked feather files

In [None]:
!rm tmp_test_data*
!rm tmp_train_data*