In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
# Models
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the training statements from the parquet file.
train = pd.read_parquet("../input/amex-parquet/train_data.parquet")

In [None]:
# Show the column names of the training frame.
train.columns

In [None]:
# Summary statistics of the training frame.
train.describe()

# Missing Column Analysis

In [None]:
# Build a two-column table (column name, null count) restricted to the
# columns that contain at least one missing value.
missing_df = (
    train.isna()
    .sum()
    .reset_index()
    .rename(columns={"index": "columns", 0: "missing_value"})
    .query("missing_value>0")
)

In [None]:
# Row count and dtypes of the missing-value table (one row per column with nulls).
missing_df.info()

In [None]:
# Distribution of the per-column null counts.
print(missing_df.describe())

# Report the columns with the most and the fewest nulls as whole rows, so the
# column name and its count belong together. DataFrame.max()/min() reduce every
# column independently, which would pair the alphabetically last/first column
# name with the largest/smallest count even when they come from different rows.
if missing_df.empty:
    print("No columns with missing values")
else:
    print(missing_df.loc[missing_df["missing_value"].idxmax()])
    print(missing_df.loc[missing_df["missing_value"].idxmin()])

In [None]:
# Horizontal bar chart: one bar per column that has nulls, bar length = null count.
fig = px.bar(y=missing_df["columns"],x=missing_df["missing_value"])
# Centre the title and label both axes; no legend is shown.
fig.update_layout(showlegend=False, 
                  title_text="Column Wise Null Value Distribution", 
                  title_x=0.5,
                  xaxis_title="Missing Value Count",
                  yaxis_title="Column Name")
fig.show()


# Observations:

* We have 122 NULL value columns in our train dataset. Almost 63% of columns in our dataset have NULL values.

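* The smallest null count in any column is 1, and the largest is 5527586.

* Note: B_13 and S_9 were originally reported here as the columns holding these counts. They are only the alphabetically first and last names among the columns with nulls, because `DataFrame.min()` / `DataFrame.max()` reduce each column independently. They do not necessarily identify the columns that hold the extreme counts.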





# Different feature types

Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables

S_* = Spend variables

P_* = Payment variables

B_* = Balance variables

R_* = Risk variables

In [None]:
# Split the column names into the documented feature families by prefix.
column_names = train.columns
d_feats = column_names[column_names.str.startswith('D_')].tolist()  # delinquency
s_feats = column_names[column_names.str.startswith('S_')].tolist()  # spend
p_feats = column_names[column_names.str.startswith('P_')].tolist()  # payment
b_feats = column_names[column_names.str.startswith('B_')].tolist()  # balance
r_feats = column_names[column_names.str.startswith('R_')].tolist()  # risk

In [None]:
# Count how many columns there are of each dtype.
dtypes = (
    train.dtypes
    .reset_index()
    .rename(columns={"index": "Column_name", 0: "dtype_name"})
    .groupby(by=["dtype_name"])
    .size()
    .reset_index(name="counts")
)
dtypes

We have 4 object columns, 185 float and 2 integer columns 

# Target Analysis

In [None]:
# Number of rows (statements) per target value, shown as a bar chart coloured by target.
target_ana = train.groupby(by=["target"]).size().reset_index(name="counts")
fig = px.bar(data_frame=target_ana,x="target",y="counts",color = 'target')
fig.show()

# Observations

Our training dataset has unequal target distribution.

There are fewer defaulters than non-defaulters, which makes sense in a real-world scenario.

Using stratified k fold could be a strategy we employ while training the model.

# Card statements per user

In [None]:
# First count the statements each customer has, then count how many customers
# share each statement count; the pie chart shows that second distribution.
target_cust = (
    train.groupby(by=["customer_ID"])
    .size()
    .reset_index(name="counts")
    .groupby(by=["counts"])
    .size()
    .reset_index(name="number_per_count")
)
fig = px.pie(
    target_cust,
    names="counts",
    values="number_per_count",
    title="NUmber of statements per customer id",
)
fig.show()

# Observations

Most customers (almost 84%) have 13 card statements, i.e. just over a year of history.

But for some customers we have statements for only a single month.

In that case we would need to have a strategy that takes this into consideration while modelling.

# Visualising categorical object features

In [None]:
# Show only the object-dtype columns of the training frame.
train.select_dtypes(include=['object'])

In [None]:
# Number of rows per distinct S_2 value (despite the name, the target column is not used here).
target_by_date = train.groupby(by=["S_2"]).size().reset_index(name="counts")

In [None]:
# Line chart of the per-S_2 row counts computed above.
px.line(target_by_date,x="S_2",y="counts",title="Number of statements generated by date")

# Observations

On analysing the pattern from months March to May 2017, I could see that in a week most statements are generated on a  Saturday and there is a considerable drop in the number of statements generated on Sundays, after which throughout the week the number of statements show an upward trend maxing on Saturday.

In [None]:
# Number of rows per D_63 category.
target_by_d63 = train.groupby(by=["D_63"]).size().reset_index(name="counts")

In [None]:
# Display the D_63 category counts.
target_by_d63

In [None]:
# Bar chart of the D_63 category counts, one colour per category.
px.bar(target_by_d63,x="D_63",y="counts",color="D_63",title="Distribution of D_63")

# Observations:

In a credit context, delinquency means being overdue on a payment; these variables seem to depict some sort of negligence by the credit card holder.

CO is the category with the most number of counts.

In [None]:
# Number of rows per D_64 category.
target_by_d64 = train.groupby(by=["D_64"]).size().reset_index(name="counts")

In [None]:
# Bar chart of the D_64 category counts, one colour per category.
px.bar(target_by_d64,x="D_64",y="counts",color="D_64",title="Distribution of D_64")

# Visualising integer features

In [None]:
# Show only the columns selected by the 'int' dtype specifier.
train.select_dtypes(include=['int'])

In [None]:
# Number of rows per distinct B_31 value.
target_by_b31 = train.groupby(by=["B_31"]).size().reset_index(name="counts")

In [None]:
# Bar chart of the B_31 value counts, coloured by value.
px.bar(target_by_b31,x="B_31",y="counts",color="B_31",title="Distribution of B_31")

# Correlation of variables with the target feature

In [None]:
# Estimate pairwise correlations on a reproducible 10% sample of the rows.
# Only numeric columns are kept: object columns cannot be correlated, and
# DataFrame.corr() raises on them in pandas >= 2.0 instead of silently
# dropping them as older versions did.
corr = (
    train.sample(frac=0.1, random_state=42)
    .select_dtypes(include="number")
    .corr()
)

# Hide the upper triangle (including the diagonal) because the matrix is symmetric.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(11, 9))
sns.heatmap(corr, mask=mask)

**Some features are highly correlated.**

In [None]:
#Code reference: https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on

# Code reference: https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
#
# Walk the strict upper triangle of the correlation matrix column by column and
# record the later column of every pair whose absolute correlation reaches the
# threshold. A column is appended once per partner, so drop_cols may hold duplicates.
iters = range(len(corr.columns) - 1)
drop_cols = []
threshold = 0.9
print("Highly correlated features with their correlation values")

abs_corr = np.abs(corr.values)
col_labels = corr.columns.values
row_labels = corr.index.values

for i in iters:
    col_pos = i + 1
    for row_pos in range(col_pos):
        strength = abs_corr[row_pos, col_pos]
        # NaN correlations compare False here and are skipped.
        if strength >= threshold:
            print(col_labels[col_pos], "|", row_labels[row_pos], "|", round(strength, 2))
            drop_cols.append(col_labels[col_pos])


**Let's collect the highly correlated features as candidates for removal (the actual drop is currently commented out in the cell below).**

In [None]:
# Drop one of each pair of correlated columns
# Drop one of each pair of correlated columns
# De-duplicate the candidates collected by the correlation scan.
drops = set(drop_cols)
# NOTE(review): the drop itself is disabled, so `train` keeps every column and
# `drops` is not used anywhere else in the visible notebook.
#train = train.drop(columns=drops)


In [None]:
# Column dtypes and memory usage of the training frame.
train.info()

# Modelling

**Reference:https://www.kaggle.com/code/munumbutt/simple-lgbm-starter**

In [None]:
%%time
# Keep the last statement month per customer
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327094
train =  (train
            .groupby('customer_ID')
            .tail(1)
            .set_index('customer_ID', drop=True)
            .sort_index()
            .drop(['S_2'], axis='columns'))

In [None]:
# (rows, columns) after keeping one statement per customer.
train.shape

In [None]:
# Columns treated as categorical; every remaining column except the label is numeric.
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
total_cols = list(train.columns)

non_numeric = set(cat_cols) | {"target"}
num_cols = [name for name in total_cols if name not in non_numeric]

In [None]:
# Feature matrix (categorical columns first, then numeric) and label vector.
# An explicit copy is taken because the categorical columns of X are overwritten
# with their encoded values later; assigning into a column subset of `train`
# triggers SettingWithCopyWarning and leaves it unclear whether `train` is
# modified as well.
X = train[cat_cols+num_cols].copy()
y = train["target"]



In [None]:
%%time
# Replace each categorical column with ordinal codes. Categories not seen
# during fit are encoded as NaN at transform time (unknown_value=np.nan).
# The fitted encoder is reused on the test set further down.
enc = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan)
X[cat_cols] = enc.fit_transform(X[cat_cols])

In [None]:
# Hold out 25% of the customers, stratified on the label so both parts keep the
# same default rate. The seed is fixed so that the split, and therefore the
# early-stopping point and the final predictions, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [None]:
# LightGBM classifier with a very large tree budget; the fit call below passes
# an early-stopping callback, so n_estimators acts as an upper bound.
clf = LGBMClassifier(
    n_estimators=50000,
    random_state=72,
    extra_trees=True
)

In [None]:
%%time
# Train on the training split, monitoring the held-out split. Training stops
# once the validation metric has not improved for 50 rounds; log_evaluation(0)
# is passed to silence per-iteration logging.
# NOTE(review): the held-out split is used for early stopping, so it is not an
# untouched test set.
clf.fit(
    X_train, y_train, 
    eval_set=[(X_test,y_test)],
    callbacks=[early_stopping(50), log_evaluation(0)]
)

In [None]:
# Release the training frames before the test data is loaded, then force a
# garbage-collection pass. After this cell, `train`, `X`, `y` and the split
# variables no longer exist, so earlier cells that use them cannot be re-run
# without reloading.
import gc
del train, X, y, X_test, X_train, y_train, y_test
_ = gc.collect()

In [None]:
#del test

In [None]:
# Load the test statements from the feather file.
# NOTE(review): this comes from a different dataset than the training parquet;
# confirm that column dtypes and categories match what the encoder was fitted on.
test = pd.read_feather("../input/amexfeather/test_data.ftr")

In [None]:
# Apply the same reduction as for the training data: keep the last row of each
# customer, index by customer_ID and drop the statement date.
test = (
    test.groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
    .sort_index()
    .drop(columns=['S_2'])
)

# Encode the categorical columns with the encoder fitted on the training data.
test[cat_cols] = enc.transform(test[cat_cols])

# Probability of the positive class, using the same column order as in training.
feature_order = cat_cols + num_cols
test["prediction"] = clf.predict_proba(test[feature_order])[:, 1]
test.head()

In [None]:
# Copy the predictions into a separate one-column frame.
# NOTE(review): `pred` is not used by the cell that writes the submission,
# which reads test["prediction"] directly.
pred = pd.DataFrame()
#pred["customer_ID"] = test["customer_ID"]
pred["prediction"] = test["prediction"]

In [None]:
# Write the submission file; the index (customer_ID) is written as the first column.
test["prediction"].to_csv("submission.csv", index=True)

# Work in Progress