# 1. Libraries

In [3]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import joblib

# 2. Data Processing

In [4]:
#importing data
df_transactional = pd.read_csv('transactional-sample.csv')

#verifying data
df_transactional.head()

Unnamed: 0,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
0,21320398,29744,97051,434505******9116,2019-12-01T23:16:32.812632,374.56,285475.0,False
1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True
2,21320400,47759,14777,425850******7024,2019-12-01T22:22:43.021495,760.36,,False
3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,,True
4,21320402,54075,64367,650487******6116,2019-12-01T21:30:53.347051,55.36,860232.0,False


In [5]:
#verifying types
df_transactional.dtypes

Unnamed: 0,0
transaction_id,int64
merchant_id,int64
user_id,int64
card_number,object
transaction_date,object
transaction_amount,float64
device_id,float64
has_cbk,bool


In [6]:
#verifying if has chargeback
df_transactional.has_cbk.value_counts()

Unnamed: 0_level_0,count
has_cbk,Unnamed: 1_level_1
False,2808
True,391


In [7]:
#creating a data that users have chargeback
df_has_cbk = df_transactional[df_transactional['has_cbk'] == True]

#verifying data
df_has_cbk

Unnamed: 0,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True
3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,,True
7,21320405,56107,81152,650516******9201,2019-12-01T21:24:05.608374,188.68,486.0,True
8,21320406,56107,81152,650516******9201,2019-12-01T21:13:21.529999,352.77,486.0,True
9,21320407,56107,81152,650516******9201,2019-12-01T21:04:55.066909,345.68,486.0,True
...,...,...,...,...,...,...,...,...
3157,21323555,41354,19820,606282******6581,2019-11-02T18:04:22.088172,4028.55,,True
3165,21323563,41354,19820,606282******6581,2019-11-02T16:33:21.333131,4031.00,,True
3178,21323576,16266,96495,522840******2045,2019-11-01T22:10:49.517024,762.18,,True
3180,21323578,16266,96495,552305******4782,2019-11-01T20:54:04.014855,2936.66,,True


In [8]:
#creating a new data that users not have chargeback
df_no_cbk = df_transactional[df_transactional['has_cbk'] == False]

#creating a new data with same length that positive chargeback
df_no_cbk = df_no_cbk.sample(n=len(df_has_cbk))

#verifying data
df_no_cbk

Unnamed: 0,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
2947,21323345,49205,20892,550209******6838,2019-11-08T20:17:48.913076,666.57,,False
3170,21323568,93881,19559,544890******8868,2019-11-02T14:07:48.969239,713.54,,False
998,21321396,13772,50407,409603******2679,2019-11-28T13:52:27.043800,261.37,312720.0,False
870,21321268,26765,21048,540593******8676,2019-11-28T19:21:29.046160,448.81,266715.0,False
2898,21323296,57903,39273,515590******9873,2019-11-09T17:49:45.787340,381.01,,False
...,...,...,...,...,...,...,...,...
2407,21322805,49205,57935,550209******5255,2019-11-18T18:14:59.178890,235.03,438285.0,False
2430,21322828,39528,43730,422061******8940,2019-11-18T16:50:14.448892,166.62,598119.0,False
2212,21322610,42711,30806,550209******8666,2019-11-21T03:05:14.306610,10.46,373850.0,False
349,21320747,25496,11383,511623******5542,2019-11-30T14:19:59.930299,569.31,620419.0,False


In [9]:
#concating datas
df_new_data_transactional = pd.concat([df_has_cbk, df_no_cbk], axis=0)

#reseting the index
df_new_data_transactional.reset_index(inplace=True)

#verifying data
df_new_data_transactional

Unnamed: 0,index,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
0,1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True
1,3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,,True
2,7,21320405,56107,81152,650516******9201,2019-12-01T21:24:05.608374,188.68,486.0,True
3,8,21320406,56107,81152,650516******9201,2019-12-01T21:13:21.529999,352.77,486.0,True
4,9,21320407,56107,81152,650516******9201,2019-12-01T21:04:55.066909,345.68,486.0,True
...,...,...,...,...,...,...,...,...,...
777,2407,21322805,49205,57935,550209******5255,2019-11-18T18:14:59.178890,235.03,438285.0,False
778,2430,21322828,39528,43730,422061******8940,2019-11-18T16:50:14.448892,166.62,598119.0,False
779,2212,21322610,42711,30806,550209******8666,2019-11-21T03:05:14.306610,10.46,373850.0,False
780,349,21320747,25496,11383,511623******5542,2019-11-30T14:19:59.930299,569.31,620419.0,False


In [10]:
#creating a validating data with chargeback
df_val_has_cbk = df_new_data_transactional.head(5)

#verifying data
df_val_has_cbk

Unnamed: 0,index,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
0,1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True
1,3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,,True
2,7,21320405,56107,81152,650516******9201,2019-12-01T21:24:05.608374,188.68,486.0,True
3,8,21320406,56107,81152,650516******9201,2019-12-01T21:13:21.529999,352.77,486.0,True
4,9,21320407,56107,81152,650516******9201,2019-12-01T21:04:55.066909,345.68,486.0,True


In [11]:
#creating a validating data with not have chargeback
df_val_no_cbk = df_new_data_transactional.tail(5)

#verifying data
df_val_no_cbk

Unnamed: 0,index,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
777,2407,21322805,49205,57935,550209******5255,2019-11-18T18:14:59.178890,235.03,438285.0,False
778,2430,21322828,39528,43730,422061******8940,2019-11-18T16:50:14.448892,166.62,598119.0,False
779,2212,21322610,42711,30806,550209******8666,2019-11-21T03:05:14.306610,10.46,373850.0,False
780,349,21320747,25496,11383,511623******5542,2019-11-30T14:19:59.930299,569.31,620419.0,False
781,2887,21323285,96553,72722,606282******1389,2019-11-09T19:29:12.835573,140.13,,False


In [12]:
#removing data positions of validate
df_new_data_transactional = df_new_data_transactional.iloc[5:]
df_new_data_transactional = df_new_data_transactional[:-5]

#reseting the index
df_new_data_transactional.reset_index(inplace=True)

#verifying data
df_new_data_transactional

Unnamed: 0,level_0,index,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
0,5,13,21320411,56107,81152,650516******7004,2019-12-01T20:36:55.091278,32.86,486.0,True
1,6,26,21320424,53816,5541,606282******3381,2019-12-01T19:31:20.047571,2597.51,656429.0,True
2,7,28,21320426,11470,5541,606282******3381,2019-12-01T19:26:01.352512,2511.43,656429.0,True
3,8,30,21320428,73271,7725,489391******7420,2019-12-01T19:22:45.419831,2092.79,308950.0,True
4,9,32,21320430,56977,69758,464296******3991,2019-12-01T19:17:21.731168,2803.32,,True
...,...,...,...,...,...,...,...,...,...,...
767,772,161,21320559,70425,64038,606282******4642,2019-11-30T22:35:25.977390,1021.29,668489.0,False
768,773,84,21320482,3057,12621,523284******6827,2019-12-01T14:48:54.837619,149.14,590486.0,False
769,774,634,21321032,38964,7019,550209******5496,2019-11-29T15:29:48.271988,209.73,669683.0,False
770,775,2024,21322422,63467,7593,523421******2660,2019-11-22T01:32:19.407724,494.73,486041.0,False


In [13]:
#concating datas
df_val_total = pd.concat([df_val_has_cbk, df_val_no_cbk], axis=0)

#reseting the index
df_val_total.reset_index(inplace=True)

#removing previous chargeback status
df_val_total = df_val_total.drop(['has_cbk'], axis=1)

#verifying data
df_val_total

Unnamed: 0,level_0,index,transaction_id,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id
0,0,1,21320399,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0
1,1,3,21320401,68657,69758,464296******3991,2019-12-01T21:59:19.797129,2556.13,
2,2,7,21320405,56107,81152,650516******9201,2019-12-01T21:24:05.608374,188.68,486.0
3,3,8,21320406,56107,81152,650516******9201,2019-12-01T21:13:21.529999,352.77,486.0
4,4,9,21320407,56107,81152,650516******9201,2019-12-01T21:04:55.066909,345.68,486.0
5,777,2407,21322805,49205,57935,550209******5255,2019-11-18T18:14:59.178890,235.03,438285.0
6,778,2430,21322828,39528,43730,422061******8940,2019-11-18T16:50:14.448892,166.62,598119.0
7,779,2212,21322610,42711,30806,550209******8666,2019-11-21T03:05:14.306610,10.46,373850.0
8,780,349,21320747,25496,11383,511623******5542,2019-11-30T14:19:59.930299,569.31,620419.0
9,781,2887,21323285,96553,72722,606282******1389,2019-11-09T19:29:12.835573,140.13,


# 3. Functions for generic processing

In [14]:
#applying fraud detection rules
def preprocess_transactions(df: pd.DataFrame) -> pd.DataFrame:
    #converting and sorting datetime
    df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
    df = df.sort_values(['user_id', 'transaction_date'])

    #verifying previous chargeback
    if 'has_cbk' not in df.columns:
        df['has_cbk'] = False

    #creating a new column with user's purchases count
    df['user_tx_count'] = df.groupby('user_id').cumcount()

    #creating a new column that counts the user's average spending
    df['user_mean_amount'] = df.groupby('user_id')['transaction_amount'].transform(
        lambda x: x.expanding().mean()
    )

    #defining rules 1: max range of purchase about user history
    df['rule_high_amount'] = (
        (df['user_tx_count'] >= 1) &
        (df['transaction_amount'] > 1.5 * df['user_mean_amount'])
    ).astype(int)

    #defining rules 2: same merchant with minimium interval
    df['prev_merchant'] = df.groupby('user_id')['merchant_id'].shift(1)
    df['prev_amount'] = df.groupby('user_id')['transaction_amount'].shift(1)
    df['prev_time'] = df.groupby('user_id')['transaction_date'].shift(1)

    df['minutes_diff'] = (df['transaction_date'] - df['prev_time']).dt.total_seconds() / 60
    df['rule_quick_repeat'] = (
        (df['merchant_id'] == df['prev_merchant']) &
        (df['minutes_diff'] <= 10) &
        (abs(df['transaction_amount'] - df['prev_amount']) / df['prev_amount'] <= 0.05)
    ).astype(int)

    #defining rules 3: time permission, purchase value permission, has chargeback and has device
    df['hour'] = df['transaction_date'].dt.hour
    df['has_device'] = df['device_id'].notnull()
    df['has_cbk'] = df['has_cbk'].astype(str).str.upper() == 'TRUE'

    df['rule_night_purchase'] = (
        ((df['hour'] >= 20) | (df['hour'] < 6)) &
        (df['transaction_amount'] >= 2500) &
        (df['has_cbk'] | ~df['has_device'])
    ).astype(int)

    #rules of fraud label
    df['is_fraud'] = (
        (df['rule_high_amount'] == 1) |
        (df['rule_quick_repeat'] == 1) |
        (df['rule_night_purchase'] == 1)
    ).astype(int)

    return df

In [15]:
#creating a user history with data information
def build_user_features(df: pd.DataFrame) -> pd.DataFrame:

    #identifying the time between each purchase
    def avg_time_between_purchases(group):
        if len(group) > 1:
            return group['transaction_date'].diff().mean().total_seconds() / 3600
        return np.nan

    #identifying repeat merchants
    def most_common_merchant(group):
        mode = group['merchant_id'].mode()
        return mode.iloc[0] if not mode.empty else 'None'

    #identifying night time shopping
    def count_night_transactions(group):
        return ((group['transaction_date'].dt.hour >= 20) | (group['transaction_date'].dt.hour < 6)).sum()

    #grouping rules in user history
    def extract_user_features(group):
        return pd.Series({
            'total_purchases': len(group),
            'average_value': group['transaction_amount'].mean(),
            'maximum_value': group['transaction_amount'].max(),
            'minimum_value': group['transaction_amount'].min(),
            'value_deviation': group['transaction_amount'].std(),
            'average_time_between_purchases_hours': avg_time_between_purchases(group),
            'quant_merchants': group['merchant_id'].nunique(),
            'most_common_merchant': most_common_merchant(group),
            'night_transaction': count_night_transactions(group),
            'cbk_count': group['has_cbk'].sum(),
            'high_amount_freq': group['rule_high_amount'].mean(),
            'quick_repeat_freq': group['rule_quick_repeat'].mean(),
            'night_purchase_freq': group['rule_night_purchase'].mean()
        })

    user_features = df.groupby('user_id', group_keys=False).apply(extract_user_features).reset_index()

    #identifying fraud history
    if 'is_fraud' in df.columns:
        user_fraud_history = df.groupby('user_id')['is_fraud'].max().reset_index()
        user_features = user_features.merge(user_fraud_history, on='user_id')
    else:
        user_features['is_fraud'] = 0

    return user_features

In [16]:
#incrementing my previous rules in a general pipeline
def process_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df_processed = preprocess_transactions(df)
    user_features = build_user_features(df_processed)
    return user_features

In [17]:
#receiving the dataframe and manipulating it with my pipeline
df = df_new_data_transactional
user_features = process_pipeline(df)

  user_features = df.groupby('user_id', group_keys=False).apply(extract_user_features).reset_index()


# 4. Model Training

In [18]:
#defining, treating and standardizing models
X = user_features.drop(columns=['user_id', 'is_fraud'])
Y = user_features['is_fraud']

X = X.fillna(0)

In [19]:
#splitting data train and data test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [20]:
#setting my categorical and numeric columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [21]:
#training logistic regression model
model = LogisticRegression()
model.fit(X_train, Y_train)

In [22]:
#predicting on the test set
Y_pred = model.predict(X_test)

#evaluating the model
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy: {accuracy *100:.2f}")
print(f"Precision: {precision *100:.2f}")
print(f"Recall: {recall *100:.2f}")
print(f"F1-Score: {f1 *100:.2f}")

Accuracy: 99.06
Precision: 85.71
Recall: 100.00
F1-Score: 92.31


In [23]:
#saving model and preprocessor
joblib.dump(model, "anti_fraud_model.pkl")
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']

# 5. Testing model

In [None]:
#manipulating a random dataframe for extra testing
def recommendation(df):
    transaction_id = df['transaction_id'].values
    df_proc = process_pipeline(df).drop(columns=['is_fraud'], errors='ignore')
    df_proc = df_proc.fillna(0)

    #removing id
    df_proc_transformed = preprocessor.transform(df_proc.drop(columns=['user_id', 'transaction_id'], errors='ignore'))
    result = model.predict(df_proc_transformed)

    for i, r in enumerate(result):
        status = "DENY" if r == 1 else "APPROVE"
        print(f"Transação {transaction_id[i]} - {status}")

recommendation(df_val_total)

Transação 21320399 - APPROVE
Transação 21320401 - APPROVE
Transação 21320405 - APPROVE
Transação 21320406 - APPROVE
Transação 21320407 - APPROVE
Transação 21322805 - DENY
Transação 21322828 - APPROVE
Transação 21322610 - DENY


  user_features = df.groupby('user_id', group_keys=False).apply(extract_user_features).reset_index()
