In [1]:
%conda install pandas numpy seaborn matplotlib xgboost

Channels:
 - conda-forge
 - nvidia
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 24.7.1
    latest version: 24.11.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, PowerTransformer
from datetime import datetime
from os.path import exists

In [3]:
class FeatureProcessor:
    @staticmethod
    def calculate_distance(df):
        a = np.sin((np.radians(df['merch_lat']) - np.radians(df['lat']))/2)**2 + np.cos(np.radians(df['lat']))*np.cos(np.radians(df['merch_lat']))*np.sin((np.radians(df['merch_long']) - np.radians(df['long']))/2)**2
        distance = 6371 * (2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)))
        df['distance_to_merchant'] = distance
        return df

    @staticmethod
    def add_time_features(df):
        df['trans_datetime'] = pd.to_datetime(df['trans_date'] + ' ' + df['trans_time'])
        df['hour'] = df['trans_datetime'].dt.hour
        df['day_of_week'] = df['trans_datetime'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)
        df['is_night'] = df['hour'].between(23,6).astype(int)
        return df

    @staticmethod
    def compute_amt_per_capita(df):
        df['amt_per_capita'] = df['amt'] / (df['city_pop']+1)
        return df

    @staticmethod
    def compute_age(df):
        df['age'] = (pd.to_datetime('today') - pd.to_datetime(df['dob'])).dt.days // 365
        return df

    @staticmethod
    def categorize_age(df):
        df['age_group'] = pd.cut(df['age'], bins=[0,20,40,60,80,100], labels=['0-20','21-40','41-60','61-80','81-100'])
        return df

    @staticmethod
    def user_avg_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        user_avg = df_train_75.groupby('cc_num')['amt'].mean().reset_index()
        user_avg.columns = ['cc_num', 'avg_transaction_amt']
        df_train = df_train.merge(user_avg, on='cc_num', how='left')
        df_test = df_test.merge(user_avg, on='cc_num', how='left')
        overall_avg = df_train_75['amt'].mean()
        df_train['avg_transaction_amt'].fillna(overall_avg, inplace=True)
        df_test['avg_transaction_amt'].fillna(overall_avg, inplace=True)
        return df_train, df_test

    @staticmethod
    def user_std_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        user_std = df_train_75.groupby('cc_num')['amt'].std().reset_index()
        user_std.columns = ['cc_num','std_dev_transaction_amt']
        df_train = df_train.merge(user_std, on='cc_num', how='left')
        df_test = df_test.merge(user_std, on='cc_num', how='left')
        overall_std = df_train_75['amt'].std()
        df_train['std_dev_transaction_amt'].fillna(overall_std, inplace=True)
        df_test['std_dev_transaction_amt'].fillna(overall_std, inplace=True)
        return df_train, df_test

    @staticmethod
    def merchant_avg_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        merch_avg = df_train_75.groupby('merchant')['amt'].mean().reset_index()
        merch_avg.columns = ['merchant','avg_amount_per_merchant']
        df_train = df_train.merge(merch_avg, on='merchant', how='left')
        df_test = df_test.merge(merch_avg, on='merchant', how='left')
        overall_avg = df_train_75['amt'].mean()
        df_train['avg_amount_per_merchant'].fillna(overall_avg, inplace=True)
        df_test['avg_amount_per_merchant'].fillna(overall_avg, inplace=True)
        return df_train, df_test

    @staticmethod
    def merchant_std_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        merch_std = df_train_75.groupby('merchant')['amt'].std().reset_index()
        merch_std.columns = ['merchant','std_dev_amount_per_merchant']
        df_train = df_train.merge(merch_std, on='merchant', how='left')
        df_test = df_test.merge(merch_std, on='merchant', how='left')
        overall_std = df_train_75['amt'].std()
        df_train['std_dev_amount_per_merchant'].fillna(overall_std, inplace=True)
        df_test['std_dev_amount_per_merchant'].fillna(overall_std, inplace=True)
        return df_train, df_test

    @staticmethod
    def category_avg_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        cat_avg = df_train_75.groupby('category')['amt'].mean().reset_index()
        cat_avg.columns = ['category','avg_amount_per_category']
        df_train = df_train.merge(cat_avg, on='category', how='left')
        df_test = df_test.merge(cat_avg, on='category', how='left')
        overall_avg = df_train_75['amt'].mean()
        df_train['avg_amount_per_category'].fillna(overall_avg, inplace=True)
        df_test['avg_amount_per_category'].fillna(overall_avg, inplace=True)
        return df_train, df_test

    @staticmethod
    def category_std_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        cat_std = df_train_75.groupby('category')['amt'].std().reset_index()
        cat_std.columns = ['category','std_dev_amount_per_category']
        df_train = df_train.merge(cat_std, on='category', how='left')
        df_test = df_test.merge(cat_std, on='category', how='left')
        overall_std = df_train_75['amt'].std()
        df_train['std_dev_amount_per_category'].fillna(overall_std, inplace=True)
        df_test['std_dev_amount_per_category'].fillna(overall_std, inplace=True)
        return df_train, df_test

    @staticmethod
    def job_avg_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        job_avg = df_train_75.groupby('job')['amt'].mean().reset_index()
        job_avg.columns = ['job','avg_amount_per_job']
        df_train = df_train.merge(job_avg, on='job', how='left')
        df_test = df_test.merge(job_avg, on='job', how='left')
        overall_avg = df_train_75['amt'].mean()
        df_train['avg_amount_per_job'].fillna(overall_avg, inplace=True)
        df_test['avg_amount_per_job'].fillna(overall_avg, inplace=True)
        return df_train, df_test

    @staticmethod
    def job_std_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        job_std = df_train_75.groupby('job')['amt'].std().reset_index()
        job_std.columns = ['job','std_dev_amount_per_job']
        df_train = df_train.merge(job_std, on='job', how='left')
        df_test = df_test.merge(job_std, on='job', how='left')
        overall_std = df_train_75['amt'].std()
        df_train['std_dev_amount_per_job'].fillna(overall_std, inplace=True)
        df_test['std_dev_amount_per_job'].fillna(overall_std, inplace=True)
        return df_train, df_test

    @staticmethod
    def user_median_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        user_median = df_train_75.groupby('cc_num')['amt'].median().reset_index()
        user_median.columns = ['cc_num','user_median_transaction_amt']
        df_train = df_train.merge(user_median, on='cc_num', how='left')
        df_test = df_test.merge(user_median, on='cc_num', how='left')
        overall_med = df_train_75['amt'].median()
        df_train['user_median_transaction_amt'].fillna(overall_med, inplace=True)
        df_test['user_median_transaction_amt'].fillna(overall_med, inplace=True)
        return df_train, df_test

    @staticmethod
    def merchant_median_amt(df_train, df_test):
        df_train_75 = df_train.sample(frac=0.75, random_state=0)
        merch_median = df_train_75.groupby('merchant')['amt'].median().reset_index()
        merch_median.columns = ['merchant','median_amount_per_merchant']
        df_train = df_train.merge(merch_median, on='merchant', how='left')
        df_test = df_test.merge(merch_median, on='merchant', how='left')
        overall_med = df_train_75['amt'].median()
        df_train['median_amount_per_merchant'].fillna(overall_med, inplace=True)
        df_test['median_amount_per_merchant'].fillna(overall_med, inplace=True)
        return df_train, df_test

    @classmethod
    def apply_all_features(cls, train, test):
        train = cls.calculate_distance(train)
        test = cls.calculate_distance(test)

        train = cls.add_time_features(train)
        test = cls.add_time_features(test)

        train = cls.compute_amt_per_capita(train)
        test = cls.compute_amt_per_capita(test)

        train = cls.compute_age(train)
        test = cls.compute_age(test)

        train = cls.categorize_age(train)
        test = cls.categorize_age(test)

        train, test = cls.user_avg_amt(train, test)
        train, test = cls.user_std_amt(train, test)
        train, test = cls.merchant_avg_amt(train, test)
        train, test = cls.merchant_std_amt(train, test)
        train, test = cls.category_avg_amt(train, test)
        train, test = cls.category_std_amt(train, test)
        train, test = cls.job_avg_amt(train, test)
        train, test = cls.job_std_amt(train, test)
        train, test = cls.user_median_amt(train, test)
        train, test = cls.merchant_median_amt(train, test)

        return train, test

print("-")

-


In [4]:
########################################
# Load Data
########################################

train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

print("train.csv shape:", train_set.shape)
print("test.csv shape:", test_set.shape)

train_set, test_set = FeatureProcessor.apply_all_features(train_set, test_set)

unnecessary_cols = ['trans_date','trans_time','trans_datetime','first','last','street','city','zip','dob','cc_num']
train_set.drop(unnecessary_cols, axis=1, inplace=True, errors='ignore')
test_set.drop(unnecessary_cols, axis=1, inplace=True, errors='ignore')

print("-")

train.csv shape: (370703, 24)
test.csv shape: (92676, 23)
-


In [5]:
cat_cols = ['trans_num','category','gender','state','job','merchant','age_group']
for col in cat_cols:
    train_set[col] = train_set[col].astype(str)
    test_set[col] = test_set[col].astype(str)
    le = LabelEncoder()
    combined_values = list(train_set[col]) + list(test_set[col])
    le.fit(combined_values)
    train_set[col] = le.transform(train_set[col])
    test_set[col] = le.transform(test_set[col])

print("-")

-


In [6]:
X_full = train_set.drop(['id','is_fraud'], axis=1)
Y_full = train_set['is_fraud']

X_train, X_val, Y_train, Y_val = train_test_split(X_full, Y_full, test_size=0.25, random_state=0)

print("-")

-


In [7]:
xgb_model = XGBClassifier(n_estimators=200, max_depth=10, random_state=0, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, Y_train)
Y_pred_xgb = xgb_model.predict(X_val)
score = f1_score(Y_val, Y_pred_xgb)

print(f"score: {score}")

score: 0.9770529110563246


In [8]:
X_test_final = test_set.drop(['id'], axis=1)
test_predictions = xgb_model.predict(X_test_final)

submission = pd.DataFrame({'id': test_set['id'], 'is_fraud': test_predictions})
submission.to_csv('final.csv', index=False)

print("<>")

<>
