In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Please refer to [this](https://www.kaggle.com/code/sravanneeli/convert-train-and-test-multiple-parquet-files) notebook for creating train and test parquet files

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
 
# Create (or reuse) a Spark session named 'amex-test' that runs locally
# and gives the driver 15 GB of memory.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('amex-test')
    .getOrCreate()
)

In [None]:
# Load every training parquet file matching the glob into one Spark DataFrame.
train_parquet_glob = '../input/amex-default-prediction-parquet-files/train*.pqt'
train_df = spark.read.parquet(train_parquet_glob)

In [None]:
# Print the column names and Spark data types of the loaded training data.
train_df.printSchema()

In [None]:
from pyspark.sql.types import StringType, StructType
import pyspark.sql.functions as func 

In [None]:
# Replace customer_ID with a copy of itself cast to Spark's string type.
customer_id_as_str = func.col("customer_ID").cast(StringType())
train_df = train_df.withColumn("customer_ID", customer_id_as_str)

In [None]:
# Report the size of the training data. count() runs a Spark job over all rows,
# so it is evaluated once and stored before formatting the message.
n_rows = train_df.count()
n_cols = len(train_df.columns)
print(f"Total number of rows: {n_rows} and Total number of cols: {n_cols}")

In [None]:
# Columns that the rest of the notebook treats as categorical features.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [None]:
# Every column after the first two that is not listed in cat_cols is treated as numeric.
num_cols = [name for name in train_df.columns[2:] if name not in cat_cols]


## Group By for each Individual Customer ID

## The usage of pyspark
* I noticed that other notebooks use feather data that somebody else created, but there is no clear source showing how it was produced.
* So I first created parquet files from the CSV files myself, using the notebook linked above.
* Once we have the parquet files, we can load the `train` and `test` data with pyspark, which performs the aggregations in parallel with very low RAM consumption.
* Each pyspark dataframe is ordered by `customer_ID` and then converted to a pandas DataFrame; because they are all sorted the same way, they can simply be concatenated horizontally.

In [None]:
%%time
count_exprs = {col: 'count' for col in cat_cols}
cat_count_df = train_df.groupby('customer_ID').agg(count_exprs).orderBy('customer_ID', ascending=True).toPandas()

In [None]:
# Store every column except the first (customer_ID) as int8 to save memory.
count_dtypes = {name: 'int8' for name in cat_count_df.columns[1:]}
cat_count_df = cat_count_df.astype(count_dtypes)

In [None]:
# Display (rows, columns) of the per-customer count frame.
cat_count_df.shape

In [None]:
# Run a garbage-collection pass before the next aggregation.
gc.collect()

In [None]:
%%time
last_exprs = {col: 'last' for col in cat_cols}
cat_last_df = train_df.groupby('customer_ID').agg(last_exprs).orderBy('customer_ID', ascending=True).toPandas().drop('customer_ID', axis=1)

In [None]:
# Shrink every float32 column to float16; columns of any other dtype are left untouched.
float16_targets = {
    name: 'float16'
    for name, dtype in cat_last_df.dtypes.items()
    if dtype == "float32"
}
cat_last_df = cat_last_df.astype(float16_targets)

In [None]:
# Display (rows, columns) of the per-customer "last value" frame.
cat_last_df.shape

In [None]:
# Run a garbage-collection pass before the next aggregation.
gc.collect()

In [None]:
def grp_unique_count(train_df):
    """Count the distinct values of each categorical column per customer.

    One Spark aggregation is run for every name in the module-level ``cat_cols``.
    Each result is sorted by ``customer_ID`` and converted to pandas, and the
    ``customer_ID`` column is dropped, so the per-column frames can be joined
    side by side.

    Args:
        train_df: Spark DataFrame holding ``customer_ID`` and the ``cat_cols`` columns.

    Returns:
        pandas DataFrame with one ``nunique(<col>)`` column per categorical
        column, cast to int8.
    """
    def distinct_counts(col):
        # Single-column distinct count, named to mirror pandas' nunique.
        expr = func.expr(f'count(distinct {col})').alias(f'nunique({col})')
        counted = train_df.groupby('customer_ID').agg(expr)
        ordered = counted.orderBy('customer_ID', ascending=True)
        return ordered.toPandas().drop('customer_ID', axis=1)

    per_col_frames = [distinct_counts(col) for col in cat_cols]
    result = pd.concat(per_col_frames, axis=1).astype('int8')
    gc.collect()
    return result

In [None]:
%%time
# Distinct-value counts of every categorical column per customer.
cat_nc_df = grp_unique_count(train_df)

In [None]:
# Join the three categorical feature frames column-wise. They are aligned purely
# by row position, so make sure they all have the same number of rows first;
# otherwise pd.concat would silently pad the shorter frames with NaN.
cat_frames = [cat_count_df, cat_last_df, cat_nc_df]
assert len({len(frame) for frame in cat_frames}) == 1, "categorical feature frames differ in row count"
cat_cols_df = pd.concat(cat_frames, axis=1)

In [None]:
# Save the combined categorical features as a gzip-compressed pickle so they can be reloaded later.
cat_cols_df.to_pickle('cat_cols_df.pkl', compression='gzip')

In [None]:
# Drop the references to all categorical frames; the combined one is already saved to disk.
del cat_count_df, cat_last_df, cat_nc_df, cat_cols_df

In [None]:
# Run a garbage-collection pass after deleting the categorical frames.
gc.collect()

In [None]:
def grp_num_cols(train_df):
    """Aggregate every numeric column per customer with mean, std, min and max.

    The module-level ``num_cols`` list is processed in chunks of 20 columns.
    Each chunk is aggregated in Spark, sorted by ``customer_ID``, converted to
    pandas and stripped of ``customer_ID``, so the chunks can be joined side
    by side.

    Args:
        train_df: Spark DataFrame holding ``customer_ID`` and the ``num_cols`` columns.

    Returns:
        pandas DataFrame of float16 values with the mean, std, min and max
        aggregates placed next to each other, in that order.
    """
    chunk_size = 20

    def agg_num(agg_func):
        """Apply one Spark aggregate (by name) to all numeric columns, chunk by chunk."""
        chunk_frames = []
        for start in range(0, len(num_cols), chunk_size):
            chunk = num_cols[start:start + chunk_size]
            exprs = dict.fromkeys(chunk, agg_func)
            grouped = train_df.groupBy('customer_ID').agg(exprs)
            ordered = grouped.orderBy('customer_ID', ascending=True)
            chunk_frames.append(ordered.toPandas().drop('customer_ID', axis=1))
        combined = pd.concat(chunk_frames, axis=1).astype('float16')
        gc.collect()
        return combined

    # Aggregates are computed one after another, in this order.
    stat_frames = [agg_num(stat) for stat in ("mean", "std", "min", "max")]
    return pd.concat(stat_frames, axis=1)

In [None]:
%%time
# Mean / std / min / max of every numeric column per customer.
num_cols_df = grp_num_cols(train_df)

In [None]:
# Save the numeric aggregates as a gzip-compressed pickle so they can be reloaded later.
num_cols_df.to_pickle('num_cols_df.pkl', compression='gzip')

In [None]:
# Drop the in-memory numeric aggregates; they are reloaded from the pickle when needed.
del num_cols_df

In [None]:
# Reload both saved feature tables (written earlier in this notebook) and
# place them side by side to form the final training frame.
cat_cols_df = pd.read_pickle('./cat_cols_df.pkl', compression='gzip')
num_cols_df = pd.read_pickle('./num_cols_df.pkl', compression='gzip')
feature_frames = [cat_cols_df, num_cols_df]
final_train_df = pd.concat(feature_frames, axis=1)

In [None]:
# The two partial frames are no longer needed once final_train_df exists.
del cat_cols_df, num_cols_df
gc.collect()

In [None]:
# Write the full aggregated training features as a gzip-compressed pickle.
final_train_df.to_pickle('train_agg.pkl', compression='gzip')