In [1]:
import pandas as pd
# Read the transactions CSV into a DataFrame and preview its first rows.
raw_path = "data/creditcard.csv"
df = pd.read_csv(raw_path)
df.head(n=5)

Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud
0,1,84.47,22,Electronics,0,0,66,3,40,0
1,2,541.82,3,Travel,1,0,87,1,64,0
2,3,237.01,17,Grocery,0,0,49,1,61,0
3,4,164.33,4,Grocery,0,1,72,3,34,0
4,5,30.53,15,Food,0,0,79,0,44,0


In [2]:
# Report the frame's dimensions, then its per-column dtypes and non-null counts.
shape_before = df.shape
print("Shape before cleaning:", shape_before)
df.info()



Shape before cleaning: (10000, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   transaction_id       10000 non-null  int64  
 1   amount               10000 non-null  float64
 2   transaction_hour     10000 non-null  int64  
 3   merchant_category    10000 non-null  object 
 4   foreign_transaction  10000 non-null  int64  
 5   location_mismatch    10000 non-null  int64  
 6   device_trust_score   10000 non-null  int64  
 7   velocity_last_24h    10000 non-null  int64  
 8   cardholder_age       10000 non-null  int64  
 9   is_fraud             10000 non-null  int64  
dtypes: float64(1), int64(8), object(1)
memory usage: 781.4+ KB


In [3]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()



transaction_id         0
amount                 0
transaction_hour       0
merchant_category      0
foreign_transaction    0
location_mismatch      0
device_trust_score     0
velocity_last_24h      0
cardholder_age         0
is_fraud               0
dtype: int64

In [5]:
# Drop rows that duplicate an earlier row across every column.
# The result is reassigned rather than using inplace=True: inplace gives no
# performance benefit, blocks method chaining, and mutates an object that
# other names may still reference.
df = df.drop_duplicates()

print("Shape after removing duplicates:", df.shape)


Shape after removing duplicates: (10000, 10)


In [6]:
# Keep only rows whose amount is non-negative.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so adding columns to it afterwards cannot raise
# SettingWithCopyWarning.
df = df[df['amount'] >= 0].copy()


In [7]:
def get_time_category(hour):
    """Return the part-of-day label for an hour value.

    Values below 6 map to "Night", below 12 to "Morning", below 18 to
    "Afternoon"; every other value maps to "Evening".
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = ((6, "Night"), (12, "Morning"), (18, "Afternoon"))
    for limit, label in upper_bounds:
        if hour < limit:
            return label
    return "Evening"

# Label every transaction with its part of day, element by element.
df['time_category'] = df['transaction_hour'].map(get_time_category)


In [8]:
# Bucket each amount into a labelled tier.
# include_lowest=True keeps amount == 0 in 'Low', and the open-ended top edge
# keeps amounts above 1000 in 'Very High'. With the previous closed bins
# [0, 50, 200, 500, 1000], both cases silently became NaN.
df['amount_level'] = pd.cut(
    df['amount'],
    bins=[0, 50, 200, 500, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)


In [9]:
# Preview the first five rows, including the derived columns.
df.head(n=5)


Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,time_category,amount_level
0,1,84.47,22,Electronics,0,0,66,3,40,0,Evening,Medium
1,2,541.82,3,Travel,1,0,87,1,64,0,Night,Very High
2,3,237.01,17,Grocery,0,0,49,1,61,0,Afternoon,High
3,4,164.33,4,Grocery,0,1,72,3,34,0,Night,Medium
4,5,30.53,15,Food,0,0,79,0,44,0,Afternoon,Low


In [10]:
from sklearn.preprocessing import StandardScaler


In [11]:
# Columns that will be passed to the scaler.
cols_to_normalize = ['amount', 'device_trust_score', 'velocity_last_24h', 'cardholder_age']


In [12]:
# Standardize the selected columns and write the scaled values back over the
# original columns.
# NOTE(review): the scaler is fit on every row of df; if a train/test split
# happens later, confirm whether it should be fit on training rows only.
scaler = StandardScaler()
features_to_scale = df[cols_to_normalize]
scaler.fit(features_to_scale)
df[cols_to_normalize] = scaler.transform(features_to_scale)


In [13]:
# Preview the first five rows after scaling.
df.head(n=5)


Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,time_category,amount_level
0,1,-0.521597,22,Electronics,0,0,0.195528,0.691873,-0.23158,0,Evening,Medium
1,2,2.086108,3,Travel,1,0,1.172909,-0.704299,1.370727,0,Night,Very High
2,3,0.348151,17,Grocery,0,0,-0.595686,-0.704299,1.170439,0,Afternoon,High
3,4,-0.066254,4,Grocery,0,1,0.474779,0.691873,-0.632157,0,Night,Medium
4,5,-0.829151,15,Food,0,0,0.800573,-1.402386,0.035471,0,Afternoon,Low
