In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import lightgbm as lgb
from tqdm.notebook import tqdm

In [None]:
def _narrowest_dtype(values, candidates, limits):
    """Return the first dtype in ``candidates`` whose range holds all of ``values``.

    Parameters
    ----------
    values : pd.Series
        Numeric column to inspect.
    candidates : sequence of numpy dtypes
        Ordered from narrowest to widest; the last entry is the fallback and
        is returned without a range check.
    limits : callable
        ``np.iinfo`` for integer candidates, ``np.finfo`` for float candidates.
    """
    low, high = values.min(), values.max()
    # Empty or all-NaN columns have no extremes to overflow, so the narrowest
    # candidate is safe.
    if pd.isna(low) or pd.isna(high):
        return candidates[0]
    for candidate in candidates[:-1]:
        bounds = limits(candidate)
        # Inclusive comparison: a value equal to the dtype's min/max still fits.
        if bounds.min <= low and high <= bounds.max:
            return candidate
    return candidates[-1]


def reduce_memory_usage(df, chunk=None):
    """Downcast the columns of ``df`` in place to save memory and return it.

    * ``object`` columns become ``category``.
    * Columns whose dtype name starts with ``int`` become the narrowest of
      int8/int16/int32/int64 that can hold the column's min and max.
    * Every other column is treated as floating point and becomes the
      narrowest of float16/float32/float64 whose finite range covers the
      column's min and max (columns containing +/-inf therefore stay float64).

    The range check prevents the silent integer wrap-around and float overflow
    to ``inf`` that an unconditional int8/float16 cast produces. Note that
    float16 keeps only about three significant decimal digits, so in-range
    values can still be rounded.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. It is modified in place.
    chunk : unused
        Kept so existing call sites remain valid.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Initial Memory chunk: {:.3f}".format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        type_ = str(df[col].dtype)

        if type_ == "object":
            df[col] = df[col].astype("category")
        elif type_[:3] == "int":
            df[col] = df[col].astype(_narrowest_dtype(df[col], int_types, np.iinfo))
        else:
            df[col] = df[col].astype(_narrowest_dtype(df[col], float_types, np.finfo))

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Final Memory chunk: {:.3f}".format(end_mem))
    print("Reduced by: {:.2f}".format((start_mem - end_mem) / start_mem))
    return df

In [None]:
# Stream the training CSV in fixed-size chunks and downcast each chunk before
# keeping it, so the frame is never held in memory at its default dtypes.
chunksize = 150_000  # rows per chunk (an int, which is what read_csv expects)
train_chunks = []
with pd.read_csv('../input/amex-default-prediction/train_data.csv', chunksize=chunksize) as reader:
    for counter, chunk in enumerate(tqdm(reader)):
        print(counter, flush=True)
        train_chunks.append(reduce_memory_usage(chunk))

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# every previously loaded row on each iteration.
# NOTE(review): category columns whose categories differ between chunks are
# expected to come out of concat as object dtype - confirm with train_df.info().
train_df = pd.concat(train_chunks) if train_chunks else pd.DataFrame()
del train_chunks

In [None]:
# Force a garbage-collection pass, then print the column dtypes and memory
# footprint of the concatenated training frame.
gc.collect()
train_df.info()

In [None]:
# Read the training labels CSV and pass it through reduce_memory_usage to
# shrink its column dtypes.
labels=pd.read_csv('../input/amex-default-prediction/train_labels.csv')
labels=reduce_memory_usage(labels)

In [None]:
# Join the labels onto the feature rows by customer_ID. pd.merge defaults to an
# inner join, so rows whose customer_ID is missing from either frame are dropped.
train_df=pd.merge(train_df, labels, on='customer_ID')

In [None]:
# train_df was rebound to the merged frame above; collect so the pre-merge
# frame's memory can be reclaimed.
gc.collect()

In [None]:
# Create the output directory for the pickled frames. exist_ok=True keeps this
# cell re-runnable: without it a second run raises FileExistsError.
os.makedirs('./Amex date pickled', exist_ok=True)

In [None]:
# Persist the merged training frame as a pickle in the output directory.
train_df.to_pickle('./Amex date pickled/train.pkl')

In [None]:
# Drop the training frame and labels and collect garbage before the test set
# is loaded below.
del train_df
del labels
gc.collect()

In [None]:
# Stream the test CSV in fixed-size chunks and downcast each chunk before
# keeping it, so the frame is never held in memory at its default dtypes.
chunksize = 150_000  # rows per chunk (an int, which is what read_csv expects)
test_chunks = []
with pd.read_csv('../input/amex-default-prediction/test_data.csv', chunksize=chunksize) as reader:
    for counter, chunk in enumerate(tqdm(reader)):
        print(counter, flush=True)
        test_chunks.append(reduce_memory_usage(chunk))

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# every previously loaded row on each iteration.
# NOTE(review): category columns whose categories differ between chunks are
# expected to come out of concat as object dtype - confirm with test_df.info().
test_df = pd.concat(test_chunks) if test_chunks else pd.DataFrame()
del test_chunks

In [None]:
# Force a garbage-collection pass after the chunked test load.
gc.collect()

In [None]:
# Print the column dtypes and memory footprint of the concatenated test frame.
test_df.info()

In [None]:
# Persist the test frame as a pickle; the target directory must already exist.
test_df.to_pickle('./Amex date pickled/test.pkl')