In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install kaggle_utils_py

In [None]:
import pandas as pd
import numpy as np
import os
from pprint import pprint
# from pyspark.sql import SparkSession, types
from pandasql import sqldf

# # kaggle utils
import kaggle_utils_py as kaggle_utils

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Turn warnings off: the "ignore" filter below has no category or module
# restriction, so it suppresses every warning raised after this cell runs
# (including deprecation warnings from the libraries imported above).
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook display preferences: never truncate the column list when a frame is rendered.
pd.options.display.max_columns = None

**Read Data**

In [None]:
%%time
# data load
# Feather copies of the train/test feature tables.
train, test = [
    pd.read_feather(feather_path)
    for feather_path in (
        '../input/amex-default-prediction-feather/train.feather',
        '../input/amex-default-prediction-feather/test.feather',
    )
]
# Labels and the sample submission come from the original competition CSVs.
train_labels, sub = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/amex-default-prediction/train_labels.csv",
        '../input/amex-default-prediction/sample_submission.csv',
    )
]

In [None]:
# Report (rows, columns) for each of the loaded tables.
for label, frame in (
    ("shape of the data --->", train),
    ("shape of the data label --->", train_labels),
    ("shape of the test data --->", test),
):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training features.
train.head()

In [None]:
# Preview the first rows of the label table.
train_labels.head()

In [None]:
# Preview the first rows of the test features.
test.head()

In [None]:
# variable counts
# Split the training columns into the five feature families by name prefix.
d_feats, s_feats, p_feats, b_feats, r_feats = (
    [name for name in train.columns if name.startswith(prefix)]
    for prefix in ('D_', 'S_', 'P_', 'B_', 'R_')
)
feature_groups = (
    ('Delinquency', d_feats),
    ('Spend', s_feats),
    ('Payment', p_feats),
    ('Balance', b_feats),
    ('Risk', r_feats),
)
for group_name, cols in feature_groups:
    print(f'Number of {group_name} variables: {len(cols)}')
print(f'Total variable counts: {sum(len(cols) for _, cols in feature_groups)}')

Data Analysis - Payment

In [None]:
def process_and_feature_engineer(df, col):
    """Summarise one column per customer as min / max / last / count.

    INSPIRED BY
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Parameters
    ----------
    df : DataFrame
        Frame containing a ``customer_ID`` column and the column ``col``.
    col : str
        Name of the column to aggregate.

    Returns
    -------
    DataFrame
        Indexed by ``customer_ID`` with four columns named
        ``<col>_min``, ``<col>_max``, ``<col>_last`` and ``<col>_count``.
        The resulting shape is also printed.
    """
    stats = ['min', 'max', 'last', 'count']
    per_customer = df.groupby("customer_ID")[[col]].agg(stats)
    # The aggregation yields two-level (column, statistic) labels; flatten them.
    per_customer.columns = [f'{name}_{stat}' for name, stat in per_customer.columns]
    print('shape after engineering', per_customer.shape)

    return per_customer

In [None]:
# !pip install cuda
# !pip install cudf


In [None]:
import cudf # for GPU Lib
def add_targets(train, labels_path='../input/amex-default-prediction/train_labels.csv'):
    """Left-join the label table onto a frame indexed by the integer customer key.

    Parameters
    ----------
    train : DataFrame
        Frame whose index is the customer key. The labels are keyed below by
        the last 16 characters of ``customer_ID`` converted from hex to int64,
        so ``train`` is expected to use the same encoding (assumption based on
        how ``read_file`` builds the key; verify for other callers).
    labels_path : str, optional
        CSV file with a ``customer_ID`` column plus the label column(s).
        Defaults to the competition's ``train_labels.csv``.

    Returns
    -------
    DataFrame
        ``train`` with the label columns appended. It is a left join, so every
        row of ``train`` is kept whether or not a label was found for it.
    """
    targets = cudf.read_csv(labels_path)
    # Re-encode the string ID so it matches the integer index of `train`.
    targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    targets = targets.set_index('customer_ID')
    return train.merge(targets, left_index=True, right_index=True, how='left')

In [None]:
NAN_VALUE = -127 # will fit in int8

def read_file(path = '', usecols = None):
    """Load a parquet file with cuDF and compact the customer and date columns.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to load; ``None`` loads every column. ``customer_ID`` and
        ``S_2`` must be among them because both are transformed below.

    Returns
    -------
    DataFrame
        The loaded frame with
        * ``customer_ID`` replaced by its last 16 characters converted from
          hex to int64,
        * ``S_2`` replaced by ``year * 12 + month - 24207`` stored as int8
          (24207 == 2017 * 12 + 3, so March 2017 maps to 0),
        * every remaining null replaced by ``NAN_VALUE``.
    """
    # LOAD DATAFRAME (columns=None reads all columns)
    df = cudf.read_parquet(path, columns=usecols)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    year = cudf.to_numeric(df['S_2'].str[:4])
    month = cudf.to_numeric(df['S_2'].str[5:7])
    df['S_2'] = year.mul(12).add(month).sub(24207).astype('int8')
    # FILL NAN
    # Diagnostic only: report null counts for the payment columns that were
    # actually loaded, so a narrower `usecols` does not raise a KeyError here.
    nan_counts = [df[c].isnull().sum(axis = 0) for c in ('P_2', 'P_3', 'P_4') if c in df.columns]
    print("NAN Count:", *nan_counts)
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

In [None]:
# Per-customer aggregates (joined with the target) for each payment feature.
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train_base = read_file(path = TRAIN_PATH)
for col in ("P_2", "P_3", "P_4"):
    # Aggregate on the GPU, attach labels, then move to pandas for display.
    df = (
        add_targets(process_and_feature_engineer(train_base, col))
        .to_pandas()
        .sort_index()
        .reset_index()
    )
    display(df)

In [None]:
# Too many 0s in P_4, is there any Correlation between P_4 Count and target?
# Keep only the statements where P_4 is non-zero, then aggregate per customer.
# NOTE(review): read_file replaces nulls with NAN_VALUE, which also passes the
# `!= 0` filter, so originally-missing P_4 rows are counted too - confirm this
# is intended.
col = "P_4"
df = (
    add_targets(process_and_feature_engineer(train_base[train_base[col] != 0], col))
    .to_pandas()
    .sort_index()
    .reset_index()
)
display(df)

In [None]:
# Validate the guess with a one-way ANOVA of P_4_count across the target classes.
from scipy.stats import f_oneway

# H0: the mean P_4_count is the same for every target value
# (i.e. target and P_4_count are NOT related).
# One list of P_4_count values per target class.
dfLists = df.groupby('target')['P_4_count'].apply(list)

# H0 is kept only when the p-value is > 0.05.
AnovaResults = f_oneway(*dfLists)
p_value = AnovaResults[1]
print('P-Value for Anova is: ', p_value)
if p_value < 0.05:
    print("reject H0, they have a Correlation")

In [None]:
# Scatter of the per-customer non-zero P_4 count against the target.
import seaborn as sns
sns.scatterplot(data=df, x='P_4_count', y='target')
# There is a relationship, but it is not a very strong one.

In [None]:
# Look at one customer's statement dates and payment features.
one_customer_mask = train["customer_ID"] == "0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a"
train.loc[one_customer_mask, ["S_2","P_2","P_3","P_4"]]

Open questions for the meeting
* Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.
-> What should we do about this note?