In [1]:
# Import necessary ibraries

import pandas as pd
import numpy as np

In [2]:
# Loading the data files
df_1 = pd.read_csv("mnt/data/reference.csv")
df_2 = pd.read_csv("mnt/data/analysis.csv")

# merging the analysis and reference files
df = pd.concat([df_1, df_2])

# printing the no.of rows and no.of columns
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

# converting the timestamp into date and adding the date column
df['date'] = pd.to_datetime(df['timestamp']).dt.date

print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])
df.head()

Number of rows:  90174
Number of columns:  7
Number of rows:  90174
Number of columns:  8


Unnamed: 0,timestamp,time_since_login_min,transaction_amount,transaction_type,is_first_transaction,user_tenure_months,is_fraud,date
0,2018-01-01 00:00:00.000,1.56175,3981.1,PAYMENT,False,0.31898,1.0,2018-01-01
1,2018-01-01 00:08:43.152,1.658074,1267.9,PAYMENT,False,7.391323,0.0,2018-01-01
2,2018-01-01 00:17:26.304,2.454287,1984.7,CASH-IN,False,0.781225,1.0,2018-01-01
3,2018-01-01 00:26:09.456,2.392085,2265.2,CASH-OUT,False,0.680473,1.0,2018-01-01
4,2018-01-01 00:34:52.608,2.189806,2126.8,CASH-IN,False,8.542895,1.0,2018-01-01


In [3]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Creation of time-based features
df['transaction_hour'] = df.timestamp.dt.hour
df['transaction_day'] = df.timestamp.dt.dayofweek

# Creation of Log based features
df['amount_log'] = np.log1p(df['transaction_amount'])

# Creation binary flags
df['high_amount_flag'] = (df['transaction_amount'] > df['transaction_amount'].median()).astype(int)


# Interaction Feature creation
df['amount_x_time'] = df['transaction_amount'] * df['time_since_login_min']

# Replacing null values of transaction_type with its mode
df['transaction_type'].fillna(df['transaction_type'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transaction_type'].fillna(df['transaction_type'].mode()[0], inplace=True)


In [4]:
df.isnull().sum().sum()

np.int64(0)

In [5]:
df.head()

Unnamed: 0,timestamp,time_since_login_min,transaction_amount,transaction_type,is_first_transaction,user_tenure_months,is_fraud,date,transaction_hour,transaction_day,amount_log,high_amount_flag,amount_x_time
0,2018-01-01 00:00:00.000,1.56175,3981.1,PAYMENT,False,0.31898,1.0,2018-01-01,0,0,8.289565,1,6217.481774
1,2018-01-01 00:08:43.152,1.658074,1267.9,PAYMENT,False,7.391323,0.0,2018-01-01,0,0,7.145906,0,2102.272115
2,2018-01-01 00:17:26.304,2.454287,1984.7,CASH-IN,False,0.781225,1.0,2018-01-01,0,0,7.593727,0,4871.023843
3,2018-01-01 00:26:09.456,2.392085,2265.2,CASH-OUT,False,0.680473,1.0,2018-01-01,0,0,7.72586,1,5418.551255
4,2018-01-01 00:34:52.608,2.189806,2126.8,CASH-IN,False,8.542895,1.0,2018-01-01,0,0,7.662844,0,4657.279051


In [7]:
df.transaction_amount.describe()

count    90174.000000
mean      2976.259837
std       2039.138059
min       1001.100000
25%       1451.325000
50%       2205.900000
75%       3866.400000
max      11428.725000
Name: transaction_amount, dtype: float64

In [8]:
df.user_tenure_months.describe()

count    90174.000000
mean         6.891086
std          3.460898
min          0.002595
25%          4.177948
50%          8.369595
75%          9.796785
max         10.496517
Name: user_tenure_months, dtype: float64

In [9]:
df_2 = df.copy()

In [10]:
# Bucketed Features Creation
# Goal: Make your model see behavioural patterns instead of only raw inputs


# Creating tenure_bucket
df_2['tenure_bucket'] = pd.cut(
    df['user_tenure_months'],
    bins=[-1, 1, 3, 6, 12, float('inf')],
    labels=['new', 'recent', 'established', 'loyal', 'veteran']
)

# Creation amount_bucket feature
df['amount_bucket'] = pd.cut(
    df['transaction_amount'],
    bins=[-1, 1000, 3000, 6000, float('inf')],
    labels=['small', 'medium', 'large', 'very_large']
)

# Creation time_since_login_bucket feature
df['time_since_login_bucket'] = pd.cut(
    df['time_since_login_min'],
    bins=[-1, 1.5, 2.5, float('inf')],
    labels=['low', 'medium', 'high']
)


In [13]:
df_2['transaction_type'] = df_2['transaction_type'].map({'PAYMENT':1, 'CASH-OUT':2, 'CASH-IN':3, 'TRANSFER':4})
df_2['is_first_transaction'] = df_2['is_first_transaction'].map({False:0, True:1})