In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# AMEX Competition - Reduce data size by optimal dtypes

We will use the Parquet version of the dataset created by @odins0n for some data exploration. Parquet is faster to read, more compact than CSV, and preserves the dtype of each column across reads and writes.

In [None]:
import warnings

import numpy as np
import pandas as pd

# Suppress every warning for the rest of the session to keep cell output compact.
warnings.filterwarnings(action="ignore")

# Locations of the Parquet copies of the competition train/test data.
TRAIN_FILE, TEST_FILE = (
    "/kaggle/input/amex-parquet/train_data.parquet",
    "/kaggle/input/amex-parquet/test_data.parquet",
)

# Total train data size is ~4GB

In [None]:
# Read the whole training set into memory, then print its per-column dtypes
# and the memory footprint reported by pandas.
train_df = pd.read_parquet(path=TRAIN_FILE)
train_df.info()

In [None]:
# Columns that are cast to the pandas 'category' dtype.
categorical_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
    'target',
]
# Columns that are parsed as timestamps.
date_cols = ['S_2']
# Every remaining column except the customer identifier is treated as numeric.
numeric_cols = set(train_df.columns).difference(date_cols, categorical_cols, ['customer_ID'])

# Create a data frame with numerical data types and their ranges

In [None]:
# Candidate NumPy dtypes a numeric column may be downcast to.
candidate_types = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64,
                   np.float16, np.float32, np.float64]
# NumPy only defines np.float128 on some platforms; referencing it
# unconditionally raises AttributeError on the others.
if hasattr(np, 'float128'):
    candidate_types.append(np.float128)

# One record per candidate dtype: its name, its family, the limits it can
# represent and its size in bytes.
type_records = []
# Limits wider than float64 become +/-inf once stored as Python floats; that
# is the intended representation here, so overflow reports are silenced.
with np.errstate(over='ignore'):
    for np_type in candidate_types:
        is_integer = np.issubdtype(np_type, np.integer)
        limits = np.iinfo(np_type) if is_integer else np.finfo(np_type)
        type_records.append({
            'class_type': np_type.__name__,
            'class_subtype': 'integer' if is_integer else 'float',
            'min_value': float(limits.min),
            'max_value': float(limits.max),
            'itemsize': np.dtype(np_type).itemsize,
        })

# `num_types` ends up as a list of [name, subtype] pairs, in case other cells
# read it in that form.
num_types = [[rec['class_type'], rec['class_subtype']] for rec in type_records]

types_df = pd.DataFrame(type_records,
                        columns=['class_type', 'class_subtype', 'min_value', 'max_value'])

# Width of the representable interval. float64 (and anything wider) overflows
# to inf here, so `range` alone cannot order those dtypes.
with np.errstate(over='ignore'):
    types_df['range'] = types_df['max_value'] - types_df['min_value']
# Size in bytes, used only to break ties between dtypes with equal `range`.
types_df['itemsize'] = [rec['itemsize'] for rec in type_records]

# Narrowest dtypes first; ties on `range` resolve to the smaller dtype so that
# a first-minimum lookup on `range` never prefers a wider type.
types_df = types_df.sort_values(by=['range', 'itemsize'])


In [None]:
# Target dtype for each column, keyed by column name.
schema = {}

for col in numeric_cols:
    # Series.min()/max() skip NaN by default, so the bounds describe the
    # observed values only.
    col_min = train_df[col].min()
    col_max = train_df[col].max()
    col_subtype = 'integer' if np.issubdtype(train_df[col].dtype, np.integer) else 'float'

    # Dtypes of the same family whose limits enclose the observed values.
    candidates = types_df[(types_df['class_subtype'] == col_subtype)
                          & (types_df['min_value'] <= col_min)
                          & (types_df['max_value'] >= col_max)]
    if candidates.empty:
        # Nothing matched, e.g. the column is entirely NaN so both bounds are
        # NaN and every comparison is False. idxmin() would raise on the empty
        # selection, so the column is left out and keeps its current dtype.
        continue
    # Narrowest enclosing dtype.
    schema[col] = candidates.loc[candidates['range'].idxmin(), 'class_type']

# DataFrame.astype raises KeyError for mapping keys that are not columns, so
# only columns actually present in the frame are added below.
present_cols = set(train_df.columns)
for col in categorical_cols:
    if col in present_cols:
        schema[col] = 'category'
for col in date_cols:
    if col in present_cols:
        schema[col] = 'datetime64[ns]'

# Convert dataframe to optimal dtypes

In [None]:
# Cast each column to the dtype recorded for it in `schema`, then print the
# updated dtypes and memory footprint.
train_df = train_df.astype(dtype=schema)
train_df.info()

# Train dataset size reduced to 2GB

Saving this dataframe to parquet fails as parquet files do not support 'float16'.
See: https://issues.apache.org/jira/browse/PARQUET-1647

Using the float16 dtype with pandas isn't recommended, but we set it here so the data can be loaded easily; the affected columns can be cast to a wider type (e.g. float32) later, when manipulating the dataset.

Github Issue for float16 with Pandas: https://github.com/pandas-dev/pandas/issues/9220