In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.feather as feather


# Directory (relative to the notebook) that holds the input files.
DATA_DIR = "../amex-default-prediction"

# Parquet inputs for the two splits.
TRAIN_FILE = DATA_DIR + "/train_data.parquet"
TEST_FILE = DATA_DIR + "/test_data.parquet"

In [2]:
# Read the whole training parquet file into memory, then print a compact
# summary (row count, dtype counts, memory usage).
train_df = pd.read_parquet(path=TRAIN_FILE)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 191 entries, customer_ID to target
dtypes: float32(185), int64(2), object(4)
memory usage: 4.1+ GB


In [3]:
# Columns that will be stored with the pandas 'category' dtype
# ('target' is included in this group).
categorical_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
    'target',
]
# Columns that will be parsed as timestamps.
date_cols = ['S_2']
# Everything else, apart from the customer identifier, is treated as numeric.
numeric_cols = set(train_df.columns).difference(
    date_cols, categorical_cols, ['customer_ID']
)

In [4]:
# Lookup table of candidate NumPy dtypes and the value range each one can
# represent. The schema-building cell filters this table per column and takes
# the fitting dtype with the smallest range.
#
# np.float128 is intentionally not a candidate: it does not exist on NumPy
# builds without an extended-precision long double (e.g. Windows), where merely
# naming it raises AttributeError. For this data it was never selectable
# anyway, because every float column is loaded as float32 and float32 therefore
# always fits with a smaller range.
candidate_dtypes = [np.int8, np.int16, np.int32, np.int64,
                    np.uint8, np.uint16, np.uint32, np.uint64,
                    np.float32, np.float64]

# [dtype name, 'integer' | 'float'] pairs, one per candidate.
num_types = [[np_type.__name__,
              'integer' if np.issubdtype(np_type, np.integer) else 'float']
             for np_type in candidate_dtypes]
types_df = pd.DataFrame(data=num_types, columns=['class_type', 'class_subtype'])

# np.iinfo describes integer dtypes, np.finfo floating-point ones. The limits
# are stored as plain Python floats so both columns are float64 and can be
# compared directly against a column's min/max.
dtype_limits = [np.iinfo(type_name) if subtype == 'integer' else np.finfo(type_name)
                for type_name, subtype in num_types]
types_df['min_value'] = [float(limits.min) for limits in dtype_limits]
types_df['max_value'] = [float(limits.max) for limits in dtype_limits]

# Width of the representable interval; for float64 the subtraction overflows
# to inf, which simply sorts that dtype last.
types_df['range'] = types_df['max_value'] - types_df['min_value']
types_df = types_df.sort_values(by='range')

In [5]:
# Build the {column name: dtype} mapping that is later passed to
# DataFrame.astype: the narrowest fitting NumPy dtype for each numeric column,
# 'category' for the categorical columns and datetime64[ns] for the dates.
schema = {}

for col in numeric_cols:
    column = train_df[col]
    # min()/max() skip NaN; they are NaN themselves only when the whole
    # column is missing.
    col_min, col_max = column.min(), column.max()
    col_subtype = 'integer' if np.issubdtype(column.dtype, np.integer) else 'float'

    # Candidates of the same kind (integer/float) whose limits enclose the
    # observed values.
    candidates = types_df[(types_df['class_subtype'] == col_subtype)
                          & (types_df['min_value'] <= col_min)
                          & (types_df['max_value'] >= col_max)]
    if candidates.empty:
        # Nothing fits when min/max are NaN (all-missing column) or infinite;
        # idxmin() on an empty selection would raise ValueError, so keep the
        # dtype the column already has.
        schema[col] = column.dtype.name
        continue

    # Among the fitting dtypes, take the one with the smallest range.
    schema[col] = candidates.loc[candidates['range'].idxmin(), 'class_type']

schema.update({col: 'category' for col in categorical_cols})
schema.update({col: 'datetime64[ns]' for col in date_cols})

In [6]:
# Cast every column to the dtype chosen in `schema`, then list each column
# together with its resulting dtype.
train_df = train_df.astype(dtype=schema)
train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Data columns (total 191 columns):
 #    Column       Dtype         
---   ------       -----         
 0    customer_ID  object        
 1    S_2          datetime64[ns]
 2    P_2          float32       
 3    D_39         float32       
 4    B_1          float32       
 5    B_2          float32       
 6    R_1          float32       
 7    S_3          float32       
 8    D_41         float32       
 9    B_3          float32       
 10   D_42         float32       
 11   D_43         float32       
 12   D_44         float32       
 13   B_4          float32       
 14   D_45         float32       
 15   B_5          float32       
 16   R_2          float32       
 17   D_46         float32       
 18   D_47         float32       
 19   D_48         float32       
 20   D_49         float32       
 21   B_6          float32       
 22   B_7          float32       
 23   B_8          float32       
 2

In [7]:
# Write train_df to a separate parquet file alongside the original inputs.
train_df.to_parquet(path='../amex-default-prediction/train_data_tiny.parquet')

In [3]:
# NOTE: this reassigns TEST_FILE, replacing the parquet path given to it in
# the first cell; the test split is read from the CSV file here.
TEST_FILE = "../amex-default-prediction/test_data.csv"

# Load the CSV with pandas' default dtype inference and print a summary.
test_df = pd.read_csv(filepath_or_buffer=TEST_FILE)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 16.1+ GB


In [4]:
# Columns that will be stored with the pandas 'category' dtype.
categorical_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
# Columns that will be parsed as timestamps.
date_cols = ['S_2']
# Everything else, apart from the customer identifier, is treated as numeric.
numeric_cols = set(test_df.columns).difference(
    date_cols, categorical_cols, ['customer_ID']
)

# Lookup table of candidate NumPy dtypes and the value range each one can
# represent. The schema-building cell filters this table per column and takes
# the fitting dtype with the smallest range.
#
# np.float16 is deliberately not a candidate. That selection only compares
# value ranges, so any float column whose values lie within +/-65504 would be
# narrowed to half precision (11 significant bits) and silently rounded. The
# result would also no longer match the float32 columns produced for the
# training split earlier in this notebook.
candidate_dtypes = [np.int8, np.int16, np.int32, np.int64,
                    np.uint8, np.uint16, np.uint32, np.uint64,
                    np.float32, np.float64]

# [dtype name, 'integer' | 'float'] pairs, one per candidate.
num_types = [[np_type.__name__,
              'integer' if np.issubdtype(np_type, np.integer) else 'float']
             for np_type in candidate_dtypes]
types_df = pd.DataFrame(data=num_types, columns=['class_type', 'class_subtype'])

# np.iinfo describes integer dtypes, np.finfo floating-point ones. The limits
# are stored as plain Python floats so both columns are float64 and can be
# compared directly against a column's min/max.
dtype_limits = [np.iinfo(type_name) if subtype == 'integer' else np.finfo(type_name)
                for type_name, subtype in num_types]
types_df['min_value'] = [float(limits.min) for limits in dtype_limits]
types_df['max_value'] = [float(limits.max) for limits in dtype_limits]

# Width of the representable interval; for float64 the subtraction overflows
# to inf, which simply sorts that dtype last.
types_df['range'] = types_df['max_value'] - types_df['min_value']
types_df = types_df.sort_values(by='range')

In [8]:
# Build the {column name: dtype} mapping that is later passed to
# DataFrame.astype: the narrowest fitting NumPy dtype for each numeric column,
# 'category' for the categorical columns and datetime64[ns] for the dates.
schema = {}

for col in numeric_cols:
    column = test_df[col]
    # min()/max() skip NaN; they are NaN themselves only when the whole
    # column is missing.
    col_min, col_max = column.min(), column.max()
    col_subtype = 'integer' if np.issubdtype(column.dtype, np.integer) else 'float'

    # Candidates of the same kind (integer/float) whose limits enclose the
    # observed values.
    candidates = types_df[(types_df['class_subtype'] == col_subtype)
                          & (types_df['min_value'] <= col_min)
                          & (types_df['max_value'] >= col_max)]
    if candidates.empty:
        # Nothing fits when min/max are NaN (all-missing column) or infinite;
        # idxmin() on an empty selection would raise ValueError, so keep the
        # dtype the column already has.
        schema[col] = column.dtype.name
        continue

    # Among the fitting dtypes, take the one with the smallest range.
    schema[col] = candidates.loc[candidates['range'].idxmin(), 'class_type']

schema.update({col: 'category' for col in categorical_cols})
schema.update({col: 'datetime64[ns]' for col in date_cols})

In [10]:
# Cast every column to the dtype chosen in `schema`, then list each column
# together with its resulting dtype.
test_df = test_df.astype(dtype=schema)
test_df.info(verbose=True)

NameError: name 'test_df' is not defined

In [None]:
# Write test_df out in Feather format alongside the original inputs.
test_df.to_feather(path='../amex-default-prediction/test_data.feather')