In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found beneath it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install pyspark

In [None]:
import pandas as pd
import numpy as np
import os
from pprint import pprint
# from pyspark.sql import SparkSession, types
from pandasql import sqldf

# kaggle utils
import kaggle_utils_py as kaggle_utils

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Silence every warning category for the rest of the session so cell output stays compact.
# NOTE(review): this blanket filter also hides library deprecation warnings (pandas, seaborn, ...);
# consider narrowing it to specific categories or modules.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook display preference: never truncate columns when rendering a wide DataFrame.
pd.options.display.max_columns = None

**Read Data**

In [None]:
%%time
# data load
# The feature frames are read from feather files; the labels and the sample submission from CSV.
train, test = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../input/amex-default-prediction-feather/train.feather',
        '../input/amex-default-prediction-feather/test.feather',
    )
)
train_labels, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/amex-default-prediction/train_labels.csv",
        '../input/amex-default-prediction/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for each of the three loaded frames.
for label, frame in (
    ("shape of the data --->", train),
    ("shape of the data label --->", train_labels),
    ("shape of the test data --->", test),
):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the label table read from train_labels.csv.
train_labels.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Variable counts: bucket the training columns by the prefix of their name
# (D_, S_, P_, B_, R_) and report how many columns fall into each bucket.
d_feats, s_feats, p_feats, b_feats, r_feats = (
    [name for name in train.columns if name.startswith(prefix)]
    for prefix in ('D_', 'S_', 'P_', 'B_', 'R_')
)
family_sizes = {
    'Delinquency': len(d_feats),
    'Spend': len(s_feats),
    'Payment': len(p_feats),
    'Balance': len(b_feats),
    'Risk': len(r_feats),
}
for family, size in family_sizes.items():
    print(f'Number of {family} variables: {size}')
print(f'Total variable counts: {sum(family_sizes.values())}')

**Data Analysis - Customer**

In [None]:
# Customer info: grouping by customer_ID yields one entry per customer, so the length of
# the grouped result is the number of distinct customers in that frame.
unique_counts = [
    len(frame.groupby("customer_ID")['customer_ID'].count())
    for frame in (train, train_labels, test)
]
unique_customer_count, unique_customer_label_count, unique_customer_count_test = unique_counts
for source, n_unique in zip(("training", "training label", "test"), unique_counts):
    print(f"unique customer data in {source} data -->", n_unique)

In [None]:
# Rows per customer: a Series indexed by customer_ID whose values are the number of
# training rows recorded for that customer.
each_customer = train.groupby(by="customer_ID").size()
print(each_customer, 'Distinct count for customer data', sep='\n')
# Cell output: the distinct row counts (profile lengths) that occur across customers.
each_customer.unique()

In [None]:
# Inspect one customer: a boolean mask selects every training row carrying this customer_ID.
sample_customer_id = "0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a"
train.loc[train["customer_ID"].eq(sample_customer_id)]

In [None]:
# Number of rows recorded per customer in train and in test, kept as plain NumPy arrays
# (`y`, `y_test`) for the histograms drawn from them.
rows_per_train_customer = train.groupby("customer_ID")['customer_ID'].count()
rows_per_test_customer = test.groupby("customer_ID")['customer_ID'].count()
y, y_test = rows_per_train_customer.values, rows_per_test_customer.values
print(y, y_test)

In [None]:
def _profile_length_histogram(counts, title):
    """Build a horizontal histogram of per-customer row counts.

    Parameters
    ----------
    counts : array-like of int
        Number of rows recorded for each customer.
    title : str
        Title shown above the figure.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure; the caller decides when to show it.
    """
    figure = go.Figure()
    figure.add_trace(go.Histogram(
        y = counts,
        ybins = dict(size = 0.5),
        marker_color = '#9900cc'))
    figure.update_layout(
        template = "plotly_dark",
        title = title,
        yaxis_title = "Number of months",
        # Same bar gap on every figure so the train and test plots compare like-for-like
        # (the test figure previously had no bargap set).
        bargap = 0.2
    )
    return figure


fig = _profile_length_histogram(y, "Customer profile count -- training data")
fig.show()

fig = _profile_length_histogram(y_test, "Customer profile count -- test data")
fig.show()

# Author's observation: the distribution of profile length is similar between train and test data.

In [None]:
# Relate profile length to the target: build one row per customer holding its row count,
# then left-join the label table on customer_ID.
count = train.groupby("customer_ID")['customer_ID'].count()
profile_lengths = pd.DataFrame({"customer_ID": count.index, "count": count.values})
customer_count_target_df = pd.merge(profile_lengths, train_labels, how='left', on='customer_ID')
customer_count_target_df

In [None]:
# Horizontal count plot: one bar group per profile length (`count`), split by `target`.
sns.countplot(data = customer_count_target_df,y='count',hue='target', orient='h')
# Author's observation: profile length and target don't seem to be strongly correlated —
# each profile length has about 30-50% of customers with target 1.


**Data Analysis - Feature**

In [None]:
# Display the training frame as the cell output.
train

In [None]:
# Reduce train to one row per customer — the last row listed for each customer_ID — and
# attach the labels with a left join on customer_ID.
# NOTE(review): "last row" is only the most recent record if train is ordered by date
# within each customer — confirm against the source data.
train2 = (
    train
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)
data = pd.merge(train2, train_labels, how='left', on='customer_ID')

In [None]:
# Display the merged frame (last row per customer joined with its label).
data

In [None]:
# Type for each column/feature.
# Common_data_analysis comes from the kaggle_utils_py module imported as kaggle_utils; its four
# return values are unpacked here and reused by later cells.
# NOTE(review): presumably these are the full column list, the categorical column names, the
# numerical column names and a per-column missing-value summary; the meaning of the 5.0
# highlight threshold is not visible from this notebook — confirm in kaggle_utils.
columns, categorical_col, numerical_col,missing_value_df = kaggle_utils.Common_data_analysis(train, missing_value_highlight_threshold=5.0, display_df = False,only_show_missing=False)

In [None]:
# By the dataset definition, the discrete (categorical) feature columns are:
# ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# 'target' is listed as well so the label is never treated as a numerical feature.
descrete_cols=['B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
          'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'target']

# Numerical columns still to be checked: the helper's numerical list minus the discrete columns.
numerical_col = [c for c in numerical_col if c not in descrete_cols]

target_col = 'target'

# All categorical columns are collected in categorical_col.
# Build a fresh, order-preserving, de-duplicated list instead of extending in place:
# `extend` appended the discrete columns again on every re-run of this cell and duplicated
# any name that was already present in categorical_col.
categorical_col = list(dict.fromkeys([*categorical_col, *descrete_cols]))

In [None]:
# Null value analysis.
# missing_value_df is the fourth value returned by kaggle_utils.Common_data_analysis;
# print its shape, then preview its first rows as the cell output.
print("shape of missing value df", missing_value_df.shape)
missing_value_df.head()

**Distribution Analysis for features**

In [None]:
def plot_hist(data, columns, nrow, ncol, figsize, hue_value=None):
    """Draw a grid of filled KDE plots, one panel per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding every column named in ``columns`` (and ``hue_value`` when given).
    columns : sequence of str
        Columns to plot, placed row by row. Columns beyond ``nrow * ncol`` are not drawn.
    nrow, ncol : int
        Shape of the subplot grid.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    hue_value : str, optional
        Column used to split each density. The fixed two-colour palette and the
        ``[1, 0]`` hue order assume that column takes the values 1 and 0.

    Returns
    -------
    None
        The figure is left to the notebook's inline backend to render.
    """
    # Set the style before the axes exist so it also applies to this figure.
    sns.set_style('dark')
    # squeeze=False keeps `axes` two-dimensional even for a single row or a single column,
    # so one code path handles every grid shape.
    _, axes = plt.subplots(nrow, ncol, figsize=figsize, squeeze=False)

    # Hue-specific options are only passed along when a hue column was requested.
    hue_options = {}
    if hue_value is not None:
        hue_options = dict(hue=hue_value, palette=['#9900cc', '#99ff99'], hue_order=[1, 0])

    # axes.flat walks the grid row by row, matching the order of `columns`.
    for position, panel in enumerate(axes.flat):
        if position >= len(columns):
            # More panels than columns: mark the spare panel instead of leaving it blank.
            panel.text(0.5, 0.5, "no data")
            continue
        column = columns[position]
        sns.kdeplot(data=data, x=column, ax=panel, fill=True, legend=True, **hue_options)
        # Only the first panel of each row carries the shared y-axis label.
        panel.set(xlabel=column, ylabel=("Density" if position % ncol == 0 else ''))

In [None]:
# Risk variables: drop any R_ column that is on the discrete list, then draw the
# distribution of each remaining one on an 8 x 4 grid, split by the target label.
_discrete_names = set(descrete_cols)
r_feats = [name for name in r_feats if name not in _discrete_names]
plot_hist(data=data, columns=r_feats, nrow=8, ncol=4, figsize=(50, 50), hue_value=target_col)
# Author's observations:
#### None of the features appears to follow a normal distribution.
#### Parametric models are therefore a poor fit -- prefer non-parametric models.

In [None]:
# Correlation of every numeric column with the target.
# Restrict to numeric columns first: `data` also carries string columns such as customer_ID,
# which cannot be correlated and make corrwith fail on pandas versions that no longer
# drop non-numeric columns by default.
numeric_data = data.select_dtypes(include='number')
corr = numeric_data.corrwith(numeric_data[target_col], axis=0)
# Format as whole percentages. The previous str(round(v, 2) * 100) produced labels such as
# '56.99999999999999%' because of floating-point rounding.
val = [f'{v:.0%}' for v in corr.values]

fig = go.Figure()
fig.add_trace(go.Bar(y=corr.index, x= corr.values,
                     orientation='h',
                     marker_color = '#9900cc',
                     text = val,
                     textposition = 'outside',
                     textfont_color = '#ffff80'))
fig.update_layout(template = 'plotly_dark',
                  title = "Correlation with Target",
                  width = 800,
                  height = 3000)
# NOTE(review): the x-range is wider than [-1, 1], presumably to leave room for the outside labels.
# Kept as the last expression so the figure is rendered as the cell output.
fig.update_xaxes(range=[-2,2])

# Author's findings from a previous run:
# negative correlation top 5: P_2 -67%, B_2 -56%, B_18 -55%, B_33 -52%, D_62 -37%
# positive correlation top 5: B_9 54%, D_55 54%, D_44 53%, D_61 53%, B_3 51%

**Target 0/1 distribution**

In [None]:
# Plot the target distribution (0 = paid, 1 = default).
count = data[target_col].value_counts()
print(count)
n_customers = data.shape[0]
# Look the classes up by label: value_counts() orders by frequency, not by class value.
paid_count, default_count = count.loc[0], count.loc[1]
paid_share = paid_count / n_customers
default_share = default_count / n_customers
print("percentage of not default --- >", paid_share)
print("percentage of default --->", default_share)
fig = go.Figure()
fig.add_trace(go.Bar(x= ['Paid', "Default"],
                     # Heights follow the same class order as the x labels and the text,
                     # whichever class happens to be more frequent.
                     y=[paid_count, default_count],
                     marker_color = ['#9900cc','#ffff80'],
                     # Whole-percent labels; str(round(p, 2) * 100) could yield
                     # floating-point artefacts such as '28.999999999999996%'.
                     text = [f'{paid_share:.0%}', f'{default_share:.0%}']))
# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(template = 'plotly_dark',
                  title = "target value distribution",
                  width = 500,
                  height = 500)

**Appendix**

In [None]:
# # Spark Session
# # In order to use pyspark we need to create or get spark instance.
# spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# # Data Path: # need to check if we can use others generated parquet file below
# train_data_path = '../input/amex-default-prediction/train_data.csv'
# train_labels_path = '../input/amex-default-prediction/train_labels.csv'
# test_data_path = '../input/amex-default-prediction/test_data.csv'
# submission_sample_path = '../input/amex-default-prediction/sample_submission.csv'

In [None]:
# # Load Data
# df_train = spark.read.option("header", "true").csv(train_data_path)
# df_train_label = spark.read.option("header", "true").csv(train_labels_path)
# df_test = spark.read.option("header", "true").csv(test_data_path)

In [None]:
# df_train_label.show()
# df_train.show()

In [None]:
# newdf = spark.read.format("csv").option("header", "true").load(train_data_path)

In [None]:
# display(newdf)

In [None]:
# prettydf = newdf.toPandas()

In [None]:
# # Data Path: # need to check if we can use others generated parquet file below
# train_data_path = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
# train_labels_path = '../input/amex-default-prediction/train_labels.csv'
# test_data_path = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
# submission_sample_path = '../input/amex-default-prediction/sample_submission.csv'

# # Load Data: 
# train_data = pd.read_parquet(train_data_path)
# train_labels = pd.read_csv(train_labels_path)
# test_data = pd.read_parquet(test_data_path)
# submission = pd.read_csv(submission_sample_path)

# print(train_data.shape, train_labels.shape)
# print(test_data.shape, submission.shape)

In [None]:
# 5 features: 
# D_* = Delinquency variables - Bojun
# S_* = Spend variables - Cecilia 
# P_* = Payment variables - Yinuo
# B_* = Balance variables - Hanjing
# R_* = Risk variables - Dora

In [None]:
# Appendix:
# Join Zoom Meeting
# https://mit.zoom.us/j/3705116583