In [1]:
import pandas as pd


# Feature Engineering

In [None]:
# Load the payment-transaction CSV into the raw frame used by every later cell.
df = pd.read_csv(filepath_or_buffer='/content/fraud_payment_data.csv')

In [None]:
# Keep only the transactions whose sender country is USA or CANADA.
df_north_america = df.loc[df['Sender_Country'].isin(['USA', 'CANADA'])]

In [76]:
# Only the North America subset is used from here on. It is intended to serve
# as training data; transactions from other sender countries can be held out
# for testing.
#
# Rows with any missing value are dropped. The explicit .copy() makes
# df_derived an independent frame rather than something derived from a slice
# of `df`, so the column assignments in the following cells do not raise
# SettingWithCopyWarning.
df_derived = df_north_america.dropna().copy()

In [78]:
# Sanity check: (rows, columns) of the working frame after dropping incomplete rows.
df_derived.shape

(832797, 15)

In [77]:
# Per-column count of missing values; every entry should be 0 after dropna().
df_derived.isna().sum()

Time_step           0
Transaction_Id      0
Sender_Id           0
Sender_Account      0
Sender_Country      0
Sender_Sector       0
Sender_lob          0
Bene_Id             0
Bene_Account        0
Bene_Country        0
USD_amount          0
Label               0
Transaction_Type    0
date                0
year                0
dtype: int64

### Geographical feature

In [81]:
# 1 when sender and beneficiary countries differ, 0 when they are the same.
df_derived['is_international'] = (
    df_derived['Sender_Country']
    .ne(df_derived['Bene_Country'])
    .astype(int)
)

### Timing based features

In [82]:
# read_csv is called without date parsing, so Time_step may still hold plain
# strings here; the .dt accessor and the timedelta arithmetic below need real
# datetimes. pd.to_datetime is a no-op when the column is already datetime64.
df_derived['Time_step'] = pd.to_datetime(df_derived['Time_step'])

# Hour of day (0-23) at which the transaction happened.
df_derived['hour_of_transaction'] = df_derived['Time_step'].dt.hour

# Hours elapsed since the same sender's previous transaction.
# The frame is sorted so that diff() within each sender group compares
# chronologically adjacent rows. A sender's first transaction has no
# predecessor (NaT) and is encoded as 0 hours.
df_derived = df_derived.sort_values(by=['Sender_Id', 'Time_step'])
df_derived['time_since_last_transaction_sender'] = (
    df_derived.groupby('Sender_Id')['Time_step']
    .diff()
    .fillna(pd.Timedelta(seconds=0))
    .dt.total_seconds() / 3600  # seconds -> hours
)

# Same feature from the beneficiary's point of view. Note that this leaves
# df_derived ordered by (Bene_Id, Time_step) for the cells that follow.
df_derived = df_derived.sort_values(by=['Bene_Id', 'Time_step'])
df_derived['time_since_last_transaction_bene'] = (
    df_derived.groupby('Bene_Id')['Time_step']
    .diff()
    .fillna(pd.Timedelta(seconds=0))
    .dt.total_seconds() / 3600  # seconds -> hours
)


### Sender - Beneficiary based features

In [83]:
# new_bene_for_sender is 1 on the chronologically first transaction of each
# (sender, beneficiary) pair and 0 on every later transaction of that pair.
#
# The pair is matched on the two ID columns directly instead of on a
# concatenated 'sender-bene' string: that avoids false matches when an ID
# itself contains '-', works for non-string IDs, and needs no scratch column.
#
# The stable sort by Time_step makes "first" explicit instead of relying on
# whatever row order an earlier cell left behind. The resulting flags are
# aligned back onto df_derived by index label (the index is assumed unique).
first_seen_pair = ~(
    df_derived.sort_values('Time_step', kind='stable')
    .duplicated(subset=['Sender_Id', 'Bene_Id'], keep='first')
)
df_derived['new_bene_for_sender'] = first_seen_pair.astype(int)

In [84]:
# Number of distinct beneficiaries each sender has paid, attached to every row
# of that sender as 'sender_bene_unique_count'.
unique_beneficiary_counts = df_derived.groupby('Sender_Id')['Bene_Id'].nunique()

beneficiary_count_df = (
    unique_beneficiary_counts
    .rename('sender_bene_unique_count')
    .reset_index()
)

df_derived = pd.merge(df_derived, beneficiary_count_df, how='left', on='Sender_Id')


### Amount based features

In [85]:
# Mean USD amount per sender, computed once and indexed by Sender_Id.
average_transaction_amount = df_derived.groupby('Sender_Id')['USD_amount'].mean()

# Signed difference between each transaction and its sender's mean amount.
# The per-sender mean is looked up from the Series above, so the group mean is
# not recomputed and no temporary column has to be added and dropped again.
df_derived['transaction_amount_deviation_sender'] = (
    df_derived['USD_amount']
    - df_derived['Sender_Id'].map(average_transaction_amount)
)

In [86]:
# Mean USD amount per beneficiary, computed once and indexed by Bene_Id.
average_transaction_amount = df_derived.groupby('Bene_Id')['USD_amount'].mean()

# Signed difference between each transaction and its beneficiary's mean amount.
# The per-beneficiary mean is looked up from the Series above, so the group
# mean is not recomputed and no temporary column has to be added and dropped.
df_derived['transaction_amount_deviation_bene'] = (
    df_derived['USD_amount']
    - df_derived['Bene_Id'].map(average_transaction_amount)
)

In [87]:
# 1 when the USD amount is an exact multiple of 100, otherwise 0.
df_derived['is_round_amount'] = (
    df_derived['USD_amount']
    .mod(100)
    .eq(0)
    .astype(int)
)


In [88]:
# Mean USD amount per sender, one row per Sender_Id.
average_sender_transaction = (
    df_derived.groupby('Sender_Id')['USD_amount']
    .mean()
    .reset_index(name='average_sender_amount')
)

# Mean USD amount per beneficiary, one row per Bene_Id.
# This is computed from df_derived, the same frame as the sender average and
# the deviation features. Using the raw `df` here would mix in transactions
# from the other sender countries (the intended test set) and rows that
# dropna() removed, leaking held-out data into a training feature.
average_bene_transaction = (
    df_derived.groupby('Bene_Id')['USD_amount']
    .mean()
    .reset_index(name='average_bene_amount')
)

# Attach both averages to every transaction row.
df_derived = df_derived.merge(average_sender_transaction, on='Sender_Id', how='left')
df_derived = df_derived.merge(average_bene_transaction, on='Bene_Id', how='left')


In [90]:
# Keep only the engineered features plus the target column, in this order.
df_derived = df_derived.loc[:, [
    'is_international',
    'hour_of_transaction',
    'time_since_last_transaction_sender',
    'time_since_last_transaction_bene',
    'new_bene_for_sender',
    'sender_bene_unique_count',
    'transaction_amount_deviation_sender',
    'transaction_amount_deviation_bene',
    'is_round_amount',
    'average_sender_amount',
    'average_bene_amount',
    'Label',
]]

In [91]:
# Display the final feature frame; the notebook shows a truncated view of it.
df_derived

Unnamed: 0,is_international,hour_of_transaction,time_since_last_transaction_sender,time_since_last_transaction_bene,new_bene_for_sender,sender_bene_unique_count,transaction_amount_deviation_sender,transaction_amount_deviation_bene,is_round_amount,average_sender_amount,average_bene_amount,Label
0,0,8,12.000000,0.000000,1,36,-233.525769,-423.2750,0,327.705769,517.4550,0
1,0,9,12.000000,25.333333,0,36,214.064231,24.3150,0,327.705769,517.4550,0
2,0,22,12.333333,12.333333,0,36,259.404231,69.6550,0,327.705769,517.4550,0
3,0,11,12.000000,133.166667,0,36,519.054231,329.3050,0,327.705769,517.4550,0
4,1,6,12.000000,0.000000,1,35,-335.343945,0.0000,0,368.843945,33.5000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
832792,0,8,0.166667,109.333333,0,35,536.896055,338.7700,0,368.843945,566.9700,1
832793,1,1,0.000000,0.000000,1,35,146.266055,131.9275,0,368.843945,383.1825,0
832794,1,16,0.333333,134.166667,0,35,-230.363945,-244.7025,0,368.843945,383.1825,0
832795,1,6,12.000000,182.166667,0,35,-138.813945,-153.1525,0,368.843945,383.1825,0
