In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Additional libraries for this notebook.
# NOTE(review): `dd` is not referenced in the visible cells, and `tf` is only used
# for the version print below — confirm both imports are still needed.
import dask.dataframe as dd
import gc  # used in later cells for explicit gc.collect() calls
import tensorflow as tf
print(tf.__version__)  # show the installed TensorFlow version in the cell output

In [None]:
# Display settings: show up to 500 columns/rows and render floats with 5 decimals.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(option_name, option_value)

In [None]:
# Read only the first 10 rows of the train/test feature files; the label file is read in full.
test_df = pd.read_csv('../input/amex-default-prediction/test_data.csv', nrows=10)
train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
train_df = pd.read_csv('../input/amex-default-prediction/train_data.csv', nrows=10)

In [None]:
# Preview the first rows of the 10-row training sample.
train_df.head()

In [None]:
# Table of column name -> dtype as inferred by pandas from the 10-row sample.
# NOTE(review): dtypes_df is not referenced again in the visible cells.
dtypes_df = train_df.dtypes.to_frame().reset_index()

# Convert `float64` columns to `float16`, and categorical columns to `int8` or `str`, to decrease RAM usage

In [None]:
# Column name -> dtype mapping, filled in by the next cells and later passed to pd.read_csv(dtype=...).
dtype_dict = {}

In [None]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Categorical columns that pandas inferred as float64 in the sample are registered
# as "int8"; every other categorical column is registered to be read as str.
# NOTE(review): the following cell reassigns every float64 column (these included)
# to "float16", so the "int8" entries written here do not survive into the final
# dtype_dict — confirm which dtype is actually intended for these columns.
dtype_dict.update(
    (col, "int8" if train_df[col].dtype == "float64" else str)
    for col in cat_cols
)
    

In [None]:
# Register every column inferred as float64 in the 10-row sample to be loaded as float16.
# This replaces any dtype already stored in dtype_dict for such a column.
dtype_dict.update(
    {col: "float16" for col in train_df.columns if train_df[col].dtype == "float64"}
)
    
        

# Load whole data with predefined data types

In [None]:
# Re-read the training file without a row limit, applying the dtypes collected in dtype_dict.
# This rebinds train_df, replacing the 10-row sample loaded earlier.
train_df = pd.read_csv('../input/amex-default-prediction/train_data.csv', dtype=dtype_dict)

In [None]:
# Run a full garbage-collection pass; the return value is shown as the cell output.
gc.collect()

In [None]:
# Parse the S_2 column into pandas datetimes (overwrites the column in place).
train_df['S_2'] = pd.to_datetime(train_df['S_2'])

In [None]:
# Run a full garbage-collection pass after the datetime conversion.
gc.collect()

In [None]:
# Numeric feature columns: every column except the customer identifier, the S_2
# date column and the categorical columns.
# `customer_ID` is the groupby key used for the per-customer aggregation below, so
# it is an identifier rather than a feature and must not be fed to mean/std/min/max.
non_numeric_cols = {"customer_ID", "S_2", *cat_cols}
num_cols = [col for col in train_df.columns if col not in non_numeric_cols]

# Let's look at the target variable distribution

In [None]:
# Cast the label columns in one step: IDs as str, target as int8 to decrease ram usage.
train_labels = train_labels.astype({'customer_ID': str, 'target': "int8"})

In [None]:
# Number of distinct customer IDs in the labels table.
train_labels['customer_ID'].nunique()

In [None]:
# Run a full garbage-collection pass; the return value is shown as the cell output.
gc.collect()

In [None]:
# Bar chart of how many rows fall into each target class.
train_labels['target'].value_counts().plot.bar()

# Unique Customers

In [None]:
# Number of distinct customer IDs in the training features, for comparison with
# the distinct-ID count of the labels table shown earlier.
train_df.customer_ID.nunique() # same as labels df

# Let's aggregate the numerical columns for each customer

In [None]:
# Per-customer mean, std, min and max of every column listed in num_cols.
# The result is indexed by customer_ID and has two-level (column, statistic) column labels.
num_cols_grp_df = train_df.groupby('customer_ID')[num_cols].agg(['mean', 'std', 'min', 'max'])

In [None]:
# Collapse the two-level (column, statistic) labels into flat tuples, move
# customer_ID out of the index into a regular column, then rename every
# aggregate to "<column>_<statistic>".
num_cols_grp_df.columns = num_cols_grp_df.columns.to_flat_index()
num_cols_grp_df = num_cols_grp_df.reset_index()
new_num_cols = ['customer_ID'] + ["_".join(name_parts) for name_parts in num_cols_grp_df.columns[1:]]
num_cols_grp_df.columns = new_num_cols
gc.collect()

In [None]:
# Make sure the customer_ID column of the numeric aggregates is stored as str.
num_cols_grp_df['customer_ID'] = num_cols_grp_df['customer_ID'].astype(str)

In [None]:
# Preview the first rows of the per-customer numeric aggregates.
num_cols_grp_df.head()

In [None]:
# Per-customer aggregates of the categorical columns: number of non-null values
# (count), last value within the group (last) and number of distinct values (nunique).
cat_cols_grp_df = train_df.groupby('customer_ID')[cat_cols].agg(['count', 'last', 'nunique'])

In [None]:
# Collapse the two-level (column, statistic) labels into flat tuples, move
# customer_ID out of the index into a regular column, then rename every
# aggregate to "<column>_<statistic>".
cat_cols_grp_df.columns = cat_cols_grp_df.columns.to_flat_index()
cat_cols_grp_df = cat_cols_grp_df.reset_index()
new_cat_cols = ['customer_ID'] + ["_".join(name_parts) for name_parts in cat_cols_grp_df.columns[1:]]
cat_cols_grp_df.columns = new_cat_cols
gc.collect()

In [None]:
# Preview the first rows of the per-customer categorical aggregates.
cat_cols_grp_df.head()

In [None]:
# Normalise the dtypes of the aggregated categorical frame:
#   * customer_ID is stored as str;
#   * int64 columns are downcast to the smallest signed integer dtype that can
#     hold their values (int8 whenever everything fits). A blind astype("int8")
#     would silently wrap around for any value above 127.
for col in cat_cols_grp_df.columns:
    if col == "customer_ID":
        cat_cols_grp_df[col] = cat_cols_grp_df[col].astype(str)
    elif cat_cols_grp_df[col].dtype == "int64":
        cat_cols_grp_df[col] = pd.to_numeric(cat_cols_grp_df[col], downcast="integer")

In [None]:
# Compare the (rows, columns) shapes of the two aggregate frames and the labels table.
num_cols_grp_df.shape, cat_cols_grp_df.shape, train_labels.shape

In [None]:
# Put all three frames in customer_ID order and give each a fresh 0..n-1 index.
# sort_values keeps the original index labels, so without reset_index any
# index-aligned operation (such as pd.concat(axis=1)) would still pair rows by
# their old labels rather than by their sorted position.
num_cols_grp_df = num_cols_grp_df.sort_values(by='customer_ID').reset_index(drop=True)
cat_cols_grp_df = cat_cols_grp_df.sort_values(by='customer_ID').reset_index(drop=True)
train_labels = train_labels.sort_values(by='customer_ID').reset_index(drop=True)

In [None]:
# Combine categorical aggregates, numeric aggregates and the target into one frame.
# The frames are joined on customer_ID rather than concatenated side by side:
# pd.concat(axis=1) aligns rows on index labels, not on the customer key, so it can
# silently pair a customer's features with another customer's target.
# validate='one_to_one' raises if customer_ID is duplicated in any of the frames;
# how='left' keeps exactly the rows of cat_cols_grp_df.
# Column order is unchanged: categorical columns (customer_ID first), numeric
# columns, then target.
final_df = (
    cat_cols_grp_df
    .merge(num_cols_grp_df, on='customer_ID', how='left', validate='one_to_one')
    .merge(train_labels, on='customer_ID', how='left', validate='one_to_one')
)

In [None]:
# Count how many columns of the aggregated numeric frame contain more than 450000 missing values.
(num_cols_grp_df.isna().sum(axis=0).sort_values(ascending=False) > 450000).sum()

* There are 29 columns that have more than 450,000 null values

In [None]:
# Save the combined frame as a gzip-compressed pickle in the current working directory.
final_df.to_pickle("./train_agg.pkl", compression="gzip")