In [None]:
import os
import sys
import time
import warnings
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import missingno as msno

import gc

# Silence every Python warning so cell output stays compact.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter if results look off.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Pandas display limits: print up to 200 rows and never truncate columns.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

In [None]:
# Locations of the input CSV files (relative to the notebook's working directory).
TRAIN_DATA_PATH, TEST_DATA_PATH, TRAIN_LABELS_PATH = (
    "../input/amex-default-prediction/train_data.csv",
    "../input/amex-default-prediction/test_data.csv",
    "../input/amex-default-prediction/train_labels.csv",
)

In [None]:
# Load the training labels, then report the table shape and how often each
# value of the 'target' column occurs.
train_labels_df = pd.read_csv(TRAIN_LABELS_PATH)
target_counts = train_labels_df['target'].value_counts()
print(train_labels_df.shape)
print(target_counts)

In [None]:
# Read only the first 10 rows of the training CSV to learn its column names
# without loading the whole file.
train_df_spl = pd.read_csv(TRAIN_DATA_PATH, nrows=10)
all_cols = list(train_df_spl.columns)
print(all_cols)

In [None]:
# Split the columns into groups that are loaded one group at a time.
# The first column (all_cols[0]) is skipped; the remaining ones are grouped by
# the letter(s) contained in their name.
feature_cols = all_cols[1:]

# Columns containing 'D' are numerous, so they are cut into three slices of 32.
d_cols = [name for name in feature_cols if 'D' in name]
cols01 = d_cols[:32]
cols02 = d_cols[32:64]
cols03 = d_cols[64:96]

cols04 = [name for name in feature_cols if 'B' in name]
cols05 = [name for name in feature_cols if 'R' in name]
cols06 = [name for name in feature_cols if 'S' in name or 'P' in name]

# Report the size and members of every group.
for group_label, group_cols in (
    ("cols01", cols01),
    ("cols02", cols02),
    ("cols03", cols03),
    ("cols04", cols04),
    ("cols05", cols05),
    ("cols06", cols06),
):
    print(f"{group_label}: {len(group_cols)}")
    print(group_cols)

In [None]:
%%time
# Count missing values per column in the train and test CSVs, one column
# group at a time, and save the combined summary table.

chunksize = 1000000  # number of rows read from a CSV per chunk
usecol_list = [cols01, cols02, cols03, cols04, cols05, cols06]


def _count_missing(csv_path, columns, chunksize):
    """Stream selected columns of a CSV and tally missing values per column.

    The file is read chunk by chunk and only the running totals are kept, so
    at most one chunk is held in memory at a time.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to scan.
    columns : list of str
        Column names to read (passed to ``usecols``).
    chunksize : int
        Number of rows per chunk.

    Returns
    -------
    na_counts : pandas.Series
        Number of null values per column, indexed by column name.
    n_rows : int
        Total number of rows read.
    """
    na_counts = None
    n_rows = 0
    # The context manager closes the file handle even if a chunk fails to parse.
    with pd.read_csv(csv_path, chunksize=chunksize, usecols=columns) as reader:
        for chunk in reader:
            chunk_na = chunk.isnull().sum()
            na_counts = chunk_na if na_counts is None else na_counts + chunk_na
            n_rows += len(chunk)
            # Progress report: (rows read so far, number of columns).
            print((n_rows, chunk.shape[1]))
    if na_counts is None:
        # The reader produced no chunk at all: nothing was missing.
        na_counts = pd.Series(0, index=list(columns))
    return na_counts, n_rows


na_tabs = []
for col_ in usecol_list:
    # customer_ID is read together with every group, so it shows up once per
    # group in the summary; the cells below remove it by label.
    wanted_cols = ["customer_ID"] + col_

    train_na, train_rows = _count_missing(TRAIN_DATA_PATH, wanted_cols, chunksize)
    train_nan_tab = train_na.to_frame('train_NA_count')
    train_nap_tab = (train_na / train_rows * 100).to_frame('train_NA_pct')

    test_na, test_rows = _count_missing(TEST_DATA_PATH, wanted_cols, chunksize)
    test_nan_tab = test_na.to_frame('test_NA_count')
    test_nap_tab = (test_na / test_rows * 100).to_frame('test_NA_pct')

    # Align train and test on the column-name index: counts first, then
    # percentages, then both side by side.
    nan_tab = pd.merge(train_nan_tab, test_nan_tab,
                       left_index=True, right_index=True, how='outer')
    nap_tab = pd.merge(train_nap_tab, test_nap_tab,
                       left_index=True, right_index=True, how='outer')
    na_tab = pd.merge(nan_tab, nap_tab,
                      left_index=True, right_index=True, how='outer')

    na_tabs.append(na_tab)

# Stack the per-group tables once instead of growing a frame inside the loop.
na_summary_tab = pd.concat(na_tabs)

# save na summary (train/test)
na_summary_tab.to_csv('na_summary_table.csv', index=True)

In [None]:
# Show the NA summary for the feature columns only (the 'customer_ID' rows are removed).
na_summary_tab.drop(labels='customer_ID', axis='index')

In [None]:
# Columns that have no missing value in either train or test.
# The query runs before the drop, so the 'customer_ID' row is only present in
# the filtered result when customer_ID itself has no missing values;
# errors='ignore' keeps the drop from raising KeyError when it is absent.
(
    na_summary_tab
    .query('train_NA_count == 0 and test_NA_count ==0')
    .drop(index='customer_ID', errors='ignore')
)