In [1]:
# Create the .kaggle folder in your root directory
! mkdir ~/.kaggle

# Write kaggle API credentials to kaggle.json
! echo '{"username":"sujaysingh","key":"42a9ae7e76a172e439a1f7b5cd51e527"}' > ~/.kaggle/kaggle.json

# Set permissions
! chmod 600 ~/.kaggle/kaggle.json

# Install the kaggle library
! pip install kaggle



In [2]:
# Download dataset
! kaggle datasets download munumbutt/amexfeather

Dataset URL: https://www.kaggle.com/datasets/munumbutt/amexfeather
License(s): CC0-1.0
Downloading amexfeather.zip to /content
100% 12.6G/12.7G [01:40<00:00, 146MB/s]
100% 12.7G/12.7G [01:40<00:00, 135MB/s]


In [3]:
# Check files in zip
! unzip -l amexfeather.zip

Archive:  amexfeather.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
3550100394  2022-05-31 20:09   test_data.ftr
6946760602  2022-05-31 20:16   test_data_f32.ftr
1725867466  2022-05-31 20:29   train_data.ftr
3372385378  2022-05-31 20:32   train_data_f32.ftr
---------                     -------
15595113840                     4 files


In [4]:
# Unzip dataset from zip file
! unzip amexfeather.zip train_data.ftr

Archive:  amexfeather.zip
  inflating: train_data.ftr          


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
import gc
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import gc
import cudf # Replace pandas with cudf
# import cupy as cp # Replace numpy with cupy
# from cuml.preprocessing import OneHotEncoder # Replace scikit-learn with cuML

In [8]:
def dummy_variables_categorical(cat_features, cat_names):
    enc = OneHotEncoder(sparse_output=False)
    encoded_df = cudf.DataFrame(enc.fit_transform(cat_features), columns=enc.get_feature_names_out(cat_names), index=cat_features.index)
    return encoded_df

In [9]:

# read in data and set index to customer ID
df = pd.read_feather('train_data.ftr').sample(frac=0.25, random_state=42)
df = df.set_index('customer_ID')

# get X and y; drop dates from X
X = df.drop(['S_2', 'target'], axis=1)
y = df['target']

# delete original dataframe from memory
del df
gc.collect()

# encode categorical features
# One-Hot Encode Categorical Variables
categorical_cols = ['B_30', 'B_38', 'D_63', 'D_64', 'D_68', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']
categorical_cols = [col for col in categorical_cols if col in X.columns]
X[categorical_cols] = X[categorical_cols].fillna(X[categorical_cols].mode())
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [10]:
 # simple impute numerical columns with mean()
numeric_cols = X.select_dtypes(include=['number']).columns
X[numeric_cols] = X[numeric_cols].astype('float32')#converting to float32 before calculating the mean
X[numeric_cols]=X[numeric_cols].fillna(X[numeric_cols].mean())

In [11]:
unique_customer_ids = list(X.index.unique())

In [12]:
train_ids, test_ids = train_test_split(list(unique_customer_ids), test_size=0.2, random_state=42)

In [13]:
X_train = X.drop(["D_66"],axis=1).loc[X.index.isin(train_ids)]
X_test = X.drop(["D_66"],axis=1).loc[X.index.isin(test_ids)]
y_train = y.loc[y.index.isin(train_ids)]
y_test = y.loc[y.index.isin(test_ids)]

In [14]:
del X
gc.collect()
del y
gc.collect()

0

In [None]:
# Train a Logistic Regression model
lr_model = LogisticRegression(random_state=42, class_weight=None, max_iter=1000)
lr_model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import average_precision_score,precision_recall_curve,precision_score,precision_recall_fscore_support,accuracy_score,f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import os
import re
import joblib

def PERFORMANCE(predictions,actuals,cut_off=0.5):
	if issubclass(predictions.dtype.type, np.floating):
		predictions=(predictions>=cut_off).astype('int')
	performance=[accuracy_score(actuals,predictions),*precision_recall_fscore_support(actuals,predictions,pos_label=1,average="binary")[:-1]]
	return performance

def all_performance_metrics(performance_input, model_name,csv_path=None,cut=0.5):
	y,pred,data_,=performance_input
	performance=PERFORMANCE(pred,y,cut_off=cut)
	#
	perf=[
	[model_name,data_]+performance,
	]
	columns=['model_name','data','accuracy','precision','recall','f1_score']
	perf_df=pd.DataFrame(perf,columns=columns)
	if csv_path is not None:
		perf_df.to_csv(csv_path, mode='a', header=not os.path.isfile(csv_path),index=False)
	else:
		return perf_df
def union_all_perf(y_train,y_train_pred, y_test, y_test_pred,model_name):
    traindf = all_performance_metrics((y_train,y_train_pred,"train"),model_name)
    testdf = all_performance_metrics((y_test,y_test_pred,"test"),model_name)
    return pd.concat([traindf,testdf],ignore_index=True)

In [None]:

y_train_pred_lg = lr_model.predict(X_train)
y_test_pred_lg = lr_model.predict(X_test)

In [None]:
union_all_perf(y_train,y_train_pred_lg,y_test,y_test_pred_lg,"LogisticRegression")

Unnamed: 0,model_name,data,accuracy,precision,recall,f1_score
0,LogisticRegression,train,0.871933,0.759281,0.708195,0.732849
1,LogisticRegression,test,0.870198,0.757909,0.706005,0.731037


In [None]:
# X.to_csv("amex_X.csv")
# y.to_csv("amex_y.csv")

In [None]:
# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)#, class_weight='balanced'
rf_model.fit(X_train, y_train)


In [None]:
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

In [None]:
union_all_perf(y_train,rf_train_pred,y_test,rf_test_pred,"RandomForest")

Unnamed: 0,model_name,data,accuracy,precision,recall,f1_score
0,RandomForest,train,0.99999,1.0,0.99996,0.99998
1,RandomForest,test,0.870307,0.74747,0.726311,0.736739


In [None]:
# XGBoost Model
xgb_model = XGBClassifier(random_state=42,max_depth=5,min_child_weight=5,n_estimators=500)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print("XGBoost Report:\n", classification_report(y_test, xgb_preds))


XGBoost Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92    206396
           1       0.75      0.75      0.75     68746

    accuracy                           0.88    275142
   macro avg       0.84      0.83      0.83    275142
weighted avg       0.88      0.88      0.88    275142



In [None]:
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

In [None]:
union_all_perf(y_train,xgb_train_pred,y_test,xgb_test_pred,"XGBoost")

Unnamed: 0,model_name,data,accuracy,precision,recall,f1_score
0,XGBoost,train,0.903809,0.809988,0.799809,0.804866
1,XGBoost,test,0.87589,0.754757,0.745512,0.750106


In [None]:
# LightGBM Model
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)
print("LightGBM Report:\n", classification_report(y_test, lgbm_preds))

[LightGBM] [Info] Number of positive: 274752, number of negative: 832969
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.499780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43988
[LightGBM] [Info] Number of data points in the train set: 1107721, number of used features: 210
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248034 -> initscore=-1.109128
[LightGBM] [Info] Start training from score -1.109128
LightGBM Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92    206396
           1       0.75      0.75      0.75     68746

    accuracy                           0.88    275142
   macro avg       0.83      0.83      0.83    275142
weighted avg       0.88      0.88      0.88    275142



In [None]:
lgbm_train_pred = lgbm_model.predict(X_train)
lgbm_test_pred = lgbm_model.predict(X_test)

In [None]:
union_all_perf(y_train,lgbm_train_pred,y_test,lgbm_test_pred,"LGBM Model")

Unnamed: 0,model_name,data,accuracy,precision,recall,f1_score
0,LGBM Model,train,0.87946,0.756237,0.758513,0.757373
1,LGBM Model,test,0.875806,0.751078,0.752247,0.751662


In [None]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators=[('lr', lr_model), ('xgb', xgb_model), ('lgbm', lgbm_model)], voting='soft')
vc.fit(X_train, y_train)

NameError: name 'lr_model' is not defined

In [None]:
vc_train_pred = vc.predict(X_train)
vc_test_pred = vc.predict(X_test)

NameError: name 'vc' is not defined

In [None]:
union_all_perf(y_train,vc_train_pred,y_test,vc_test_pred,"Voting Classifier Model")

Unnamed: 0,model_name,data,accuracy,precision,recall,f1_score
0,Voting Classifier Model,train,0.888329,0.780801,0.764358,0.772492
1,Voting Classifier Model,test,0.877314,0.76059,0.742778,0.751579


In [17]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'max_depth': [ -1,  15],
    'learning_rate': [ 0.1, 0.3]
}
lgbm_model = LGBMClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    scoring='f1',  # Use F1-score for imbalanced data
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [None]:
best_model = grid_search.best_estimator_
best_lgbm_train_preds = best_model.predict(X_train)
best_lgbm_test_preds = best_model.predict(X_test)

In [None]:
union_all_perf(y_train,best_lgbm_train_preds,y_test,best_lgbm_test_preds,"GridSearchCV LGB Model")