# Feature analysis: S_2
In this notebook I simply take a closer look at the monthly statements for each customer, i.e. feature S_2. I'm interested in:
* Converting to a number (useful for feature engineering).
* Ignoring the 'day'.
* Offsetting for test data to normalize with train data.
* Comparing different use cases and associated default rate.
    * Long-term customer: all 13 statements.
    * Short-term customer: fewer than 13 statements, all statements are consecutive.
    * Gap customer: fewer than 13 statements, statements are not consecutive.
* Comparing the count of these use cases between the train data and the two sets of test data.


I built this notebook on @cdeotte's notebook from [here][1], which in turn is built upon:
* Smaller dataset: @raddar Kaggle dataset from [here][2] with discussion [here][3]. 
* feature engineering: suggested by @huseyincot in his notebooks [here][4] and [here][5].
* GPU: Our feature engineering is performed using [RAPIDS][6] on the GPU to create new features quickly.

[1]: https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
[2]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
[3]: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
[4]: https://www.kaggle.com/code/huseyincot/amex-catboost-0-793
[5]: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
[6]: https://rapids.ai/

# Load Libraries

In [None]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version',cudf.__version__)

# Process and Feature Engineer Train Data
We will load @raddar Kaggle dataset from [here][1] with discussion [here][2]. Then we will engineer S_2, inspired by @huseyincot in his notebooks [here][3] and [here][4]. We will use [RAPIDS][5] and the GPU to create new features quickly.

[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
[2]: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
[3]: https://www.kaggle.com/code/huseyincot/amex-catboost-0-793
[4]: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
[5]: https://rapids.ai/

In [None]:
NAN_VALUE = -127 # will fit in int8

def read_file(path = '', usecols = None):
    """Load a parquet file on the GPU and compact its key columns.

    path: location of the parquet file.
    usecols: optional list of column names to load; None loads every column.

    Returns the cudf DataFrame with customer_ID as int64, S_2 as an int8
    month index and missing values replaced by NAN_VALUE.
    """
    # columns=None MAKES cudf READ EVERY COLUMN, SO ONE CALL COVERS BOTH CASES
    df = cudf.read_parquet(path, columns=usecols)
    # KEEP ONLY THE LAST 16 HEX DIGITS OF THE CUSTOMER ID, AS AN INTEGER
    hex_tail = df['customer_ID'].str[-16:]
    df['customer_ID'] = hex_tail.str.hex_to_int().astype('int64')
    # TURN THE 'YYYY-MM...' STRING INTO A MONTH COUNT, IGNORING THE DAY.
    # THE OFFSET 24207 == 2017*12 + 3 MAKES 2017-03 MAP TO 0.
    years = cudf.to_numeric(df.S_2.str[:4])
    months = cudf.to_numeric(df.S_2.str[5:7])
    df.S_2 = (years * 12 + months - 24207).astype('int8')
    # FILL NAN
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

# LOAD EVERY COLUMN OF THE TRAIN PARQUET (usecols IS LEFT AT ITS DEFAULT)
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train_base = read_file(path = TRAIN_PATH)

In [None]:
def process_and_feature_engineer(df):
    """Collapse the statement rows into one row of S_2 statistics per customer.

    df: frame with one row per statement, holding 'customer_ID' and 'S_2'.

    Returns a frame indexed by customer_ID with the columns S_2_min, S_2_max,
    S_2_last and S_2_count.
    """
    # INSPIRED BY
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    per_customer = df.groupby("customer_ID")[["S_2"]]
    stats = per_customer.agg(['min', 'max', 'last', 'count'])
    # FLATTEN THE (COLUMN, STATISTIC) PAIRS INTO NAMES SUCH AS 'S_2_min'
    stats.columns = [f'{column}_{statistic}' for column, statistic in stats.columns]
    print('shape after engineering', stats.shape )

    return stats

# ONE ROW PER TRAIN CUSTOMER WITH THE S_2 MIN / MAX / LAST / COUNT
df = process_and_feature_engineer(train_base)

# PREVIEW THE FIRST ROWS
df.head()

In [None]:
# ADD TARGETS
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
def add_targets(train):
    """Attach the training labels to a per-customer frame.

    train: frame indexed by the int64 customer_ID.

    Returns train left-joined on its index with the columns read from
    train_labels.csv, so every row of train is kept.
    """
    labels = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
    # ENCODE THE ID AS THE INTEGER VALUE OF ITS LAST 16 HEX DIGITS
    customer_key = labels['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    labels['customer_ID'] = customer_key
    labels = labels.set_index('customer_ID')
    return train.merge(labels, left_index=True, right_index=True, how='left')

# JOIN THE LABELS ONTO THE PER-CUSTOMER STATISTICS
df = add_targets(df)
# CONVERT THE cudf FRAME TO PANDAS FOR THE ANALYSIS BELOW
df = df.to_pandas()
# ORDER BY THE INDEX, THEN TURN THE INDEX BACK INTO A REGULAR COLUMN
df = df.sort_index()
df = df.reset_index()

# PREVIEW THE FIRST ROWS
df.head()

In [None]:
# COMPARE DEFAULT RATES OF LONG-TERM, SHORT-TERM AND GAP CUSTOMERS (TRAIN)
def _defaults(frame):
    """Return the sum of the target column of frame (number of defaults).

    Uses the vectorised Series.sum instead of the builtin sum, which walks
    the Series element by element. skipna=False keeps the builtin's NaN
    propagation, so a customer left without a label by the left merge is
    not silently dropped from the count.
    """
    return frame.target.sum(skipna=False)


def _rate(defaults, customers):
    """Return defaults / customers, or NaN when the bucket is empty."""
    return defaults / customers if customers else float('nan')


TOTAL = len(df)
TOTAL_Y = _defaults(df)

# SANITY CHECK: EVERY TRAIN CUSTOMER HAS ITS FINAL STATEMENT IN MONTH 12
assert (df['S_2_max'] == 12).all()
assert (df['S_2_last'] == 12).all()

# LONG-TERM: ALL 13 STATEMENTS
long_customers = df[df['S_2_count'] == 13]
longterm = len(long_customers)
longterm_y = _defaults(long_customers)

# SHORT-TERM: FIRST STATEMENT AFTER MONTH 0 AND NO MISSING MONTH UP TO THE LAST
test = df[df['S_2_min'] > 0]
short = test[test['S_2_max'] - test['S_2_min'] == test['S_2_count'] - 1]
shortterm = len(short)
shortterm_y = _defaults(short)

# GAP: THE COVERED SPAN IS LONGER THAN THE NUMBER OF STATEMENTS
gap = df[df['S_2_max'] - df['S_2_min'] > df['S_2_count'] - 1]
gapCount = len(gap)
gap_y = _defaults(gap)

print(longterm, shortterm, gapCount)
# THE THREE GROUPS MUST PARTITION BOTH THE CUSTOMERS AND THE DEFAULTS
assert(longterm+gapCount+shortterm == TOTAL)
assert(longterm_y+gap_y+shortterm_y == TOTAL_Y)

# COUNT AND DEFAULT RATE PER NUMBER OF STATEMENTS
for i in range(1, 14):
    test1 = short[short['S_2_count'] == i]
    test2 = gap[gap['S_2_count'] == i]
    if i == 1:
        # A SINGLE STATEMENT CANNOT CONTAIN A GAP, SO ONLY "S" IS REPORTED
        print(i, "S", len(test1))
        print(i, "S", _rate(_defaults(test1), len(test1)))
    elif i < 13:
        print(i, "S", len(test1), "G", len(test2))
        print(i, "S", _rate(_defaults(test1), len(test1)),
              "G", _rate(_defaults(test2), len(test2)))
    else:
        print(i, "L", longterm)
        print(i, "L", _rate(longterm_y, longterm))

# OVERALL SUMMARY
print(TOTAL_Y, longterm_y, shortterm_y, gap_y)
print("Rate: (all):", _rate(TOTAL_Y, TOTAL))
print("Long term:  ", _rate(longterm_y, longterm))
print("Short term: ", _rate(shortterm_y, shortterm))
print("Gap customer", _rate(gap_y, gapCount))


# Process and Feature Engineer Test Data
We will load @raddar Kaggle dataset from [here][1] with discussion [here][2]. Then we will engineer features suggested by @huseyincot in his notebooks [here][3] and [here][4]. We will use [RAPIDS][5] and the GPU to create new features quickly.

[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
[2]: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
[3]: https://www.kaggle.com/code/huseyincot/amex-catboost-0-793
[4]: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
[5]: https://rapids.ai/

In [None]:
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """Split customers into NUM_PARTS consecutive slices and count each slice's rows.

    customers: sliceable sequence of customer ids.
    test: frame with a 'customer_ID' column.
    NUM_PARTS: number of slices; the last slice also takes the remainder.
    verbose: label used in the progress messages; '' silences them.

    Returns (rows, chunk): the row count of test per slice, and the number of
    customers in every slice except possibly the last.
    """
    chunk = len(customers) // NUM_PARTS
    talkative = verbose != ''
    if talkative:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    for part in range(NUM_PARTS):
        start = part * chunk
        # THE FINAL PART RUNS TO THE END SO LEFTOVER CUSTOMERS ARE INCLUDED
        stop = None if part == NUM_PARTS - 1 else start + chunk
        in_part = test.customer_ID.isin(customers[start:stop])
        rows.append(test.loc[in_part].shape[0])

    if talkative:
        print( rows )
    return rows, chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
NUM_PARTS = 4
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
# ONLY THE CUSTOMER ID AND THE STATEMENT MONTH ARE LOADED HERE
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# UNIQUE CUSTOMER IDS AS A FLAT ARRAY, ORDERED BY THE FRAME'S ROW INDEX
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# rows: STATEMENT ROWS PER PART, num_cust: CUSTOMERS PER PART (EXCEPT THE LAST)
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

# Load Test

In [None]:
def loadTestData():
    """Load the test parquet and reduce it to per-customer S_2 statistics.

    Reads every column of TEST_PATH in one go (it is not split into parts)
    and returns the result of process_and_feature_engineer on it.
    """
    # READ TEST DATA
    print(f'\nReading test data...')
    raw = read_file(path = TEST_PATH)
    print(f'=> Test has shape', raw.shape )

    # PROCESS AND FEATURE ENGINEER TEST DATA
    return process_and_feature_engineer(raw)

# NOTE: THIS REBINDS df, REPLACING THE TRAIN STATISTICS WITH THE TEST ONES
df = loadTestData()

In [None]:
def printStats(df, last=12):
    """Print how many customers are long-term, short-term or gap customers.

    df: per-customer frame with S_2_min, S_2_max, S_2_last and S_2_count.
    last: month index every customer in df must have as S_2_max and S_2_last.

    Prints the three group sizes, then one line per statement count (1..13).
    Raises AssertionError when a customer does not end on `last` or when the
    three groups do not add up to len(df).
    """
    n_customers = len(df)

    # EVERY CUSTOMER MUST END ON THE EXPECTED MONTH
    for column in ('S_2_max', 'S_2_last'):
        assert len(df[df[column] == last]) == n_customers

    span = df['S_2_max'] - df['S_2_min']
    expected_span = df['S_2_count'] - 1

    # LONG-TERM: ALL 13 STATEMENTS
    longterm = len(df[df['S_2_count'] == 13])

    # SHORT-TERM: STARTED AFTER THE FIRST MONTH OF THE 13-MONTH WINDOW, NO HOLES
    short = df[(df['S_2_min'] > last-12) & (span == expected_span)]
    shortterm = len(short)

    # GAP: THE COVERED SPAN IS LONGER THAN THE NUMBER OF STATEMENTS
    gap = df[span > expected_span]
    gapCount = len(gap)

    print(longterm, shortterm, gapCount)
    assert longterm + gapCount + shortterm == n_customers

    for n_statements in range(1, 14):
        if n_statements == 13:
            print(n_statements, "L", longterm)
            continue
        n_short = len(short[short['S_2_count'] == n_statements])
        if n_statements == 1:
            print(n_statements, "S", n_short)
        else:
            n_gap = len(gap[gap['S_2_count'] == n_statements])
            print(n_statements, "S", n_short, "G", n_gap)

# EVERY TEST CUSTOMER MUST END ON MONTH 25 OR ON MONTH 31.
# WITH THE OFFSET USED IN read_file (24207 == 2017*12 + 3) THESE MONTH
# INDICES CORRESPOND TO 2019-04 AND 2019-10.
GRAND_TOTAL = len(df)
test = df[df['S_2_max'] == 25]
TOTAL1 = len(test)
test = df[df['S_2_max'] == 31]
TOTAL2 = len(test)
assert(TOTAL1+TOTAL2 == GRAND_TOTAL)
# REPORT EACH GROUP SEPARATELY, USING ITS OWN FINAL MONTH AS THE REFERENCE
printStats(df[df['S_2_max'] == 25], last=25)
printStats(df[df['S_2_max'] == 31], last=31)
