# **Feature Engineering**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/tenx/Fraud_Detection_For_E_commerce_and_Bank_Transactions/notebooks

/content/drive/MyDrive/tenx/Fraud_Detection_For_E_commerce_and_Bank_Transactions/notebooks


**Import Libraries**

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from IPython.display import display

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [6]:
class FraudFeatureEngineer:
    def __init__(self, fraud_data_path, creditcard_data_path, seed=42):
        self.fraud_data_path = fraud_data_path
        self.creditcard_data_path = creditcard_data_path
        self.seed = seed
        self.scaler = MinMaxScaler()
        self.label_encoders = {}

    def load_data(self, file_path):
        print(f"Loading data from {file_path}...")
        return pd.read_csv(file_path)

    def process_fraud_data(self):
        print("Processing fraud data...")
        df = self.load_data(self.fraud_data_path)

        print("Converting timestamps and extracting time-based features...")
        df["signup_time"] = pd.to_datetime(df["signup_time"])
        df["purchase_time"] = pd.to_datetime(df["purchase_time"])
        df["hour_of_day"] = df["purchase_time"].dt.hour
        df["day_of_week"] = df["purchase_time"].dt.weekday
        df["time_since_signup"] = (
            df["purchase_time"] - df["signup_time"]
        ).dt.total_seconds()

        print("Calculating transaction frequency and velocity...")
        df["transaction_count"] = df.groupby("user_id")["user_id"].transform("count")
        df["purchase_time_unix"] = df["purchase_time"].astype(np.int64) // 10**9
        df["velocity"] = df.groupby("device_id")["purchase_time_unix"].diff().fillna(0)

        print("Dropping unnecessary columns...")
        df.drop(
            columns=[
                "user_id",
                "signup_time",
                "purchase_time",
                "device_id",
                "ip_address",
                "purchase_time_unix",
            ],
            inplace=True,
        )

        print("Encoding categorical features...")
        categorical_columns = ["source", "browser", "sex", "country"]
        for col in categorical_columns:
            self.label_encoders[col] = LabelEncoder()
            df[col] = self.label_encoders[col].fit_transform(df[col])

        print("Normalizing numerical features...")
        numerical_columns = [
            "purchase_value",
            "age",
            "hour_of_day",
            "day_of_week",
            "time_since_signup",
            "transaction_count",
            "velocity",
        ]
        df[numerical_columns] = self.scaler.fit_transform(df[numerical_columns])

        print("First 3 rows of transformed fraud data:")
        display(df.head(3))

        print("Splitting and saving data...")
        X = df.drop(columns=["class"])
        y = df["class"]
        return self.split_and_save(X, y, "fraud_data")

    def process_creditcard_data(self):
        print("Processing credit card data...")
        df = self.load_data(self.creditcard_data_path)

        print("Normalizing features V1 to V28 and Amount...")
        feature_columns = [f"V{i}" for i in range(1, 29)] + ["Amount"]
        df[feature_columns] = self.scaler.fit_transform(df[feature_columns])

        print("First 3 rows of transformed credit card data:")
        display(df.head(3))

        print("Splitting and saving data...")
        X = df.drop(columns=["Class"])
        y = df["Class"]
        return self.split_and_save(X, y, "creditcard_data")

    def split_and_save(self, X, y, filename):
        print(f"Splitting data for {filename}...")
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=0.3, random_state=self.seed, stratify=y
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=self.seed, stratify=y_temp
        )

        print(f"Saving split datasets for {filename}...")
        np.save(f"../Data/processed/{filename}_X_train.npy", X_train)
        np.save(f"../Data/processed/{filename}_X_val.npy", X_val)
        np.save(f"../Data/processed/{filename}_X_test.npy", X_test)
        np.save(f"../Data/processed/{filename}_y_train.npy", y_train)
        np.save(f"../Data/processed/{filename}_y_val.npy", y_val)
        np.save(f"../Data/processed/{filename}_y_test.npy", y_test)

        print("Data processing complete.")

# **Data Loading**

In [7]:
Credit_data_path = "../Data/processed/cleaned_credit_df.csv"
Fraud_data_path = "../Data/processed/merged_fraud_df.csv"

In [8]:
processor = FraudFeatureEngineer(Fraud_data_path, Credit_data_path)

# **Feature Engineering**

In [9]:
processor.process_fraud_data()

Processing fraud data...
Loading data from ../Data/processed/merged_fraud_df.csv...
Converting timestamps and extracting time-based features...
Calculating transaction frequency and velocity...
Dropping unnecessary columns...
Encoding categorical features...
Normalizing numerical features...
First 3 rows of transformed fraud data:


Unnamed: 0.1,Unnamed: 0,purchase_value,source,browser,sex,age,class,country,hour_of_day,day_of_week,time_since_signup,transaction_count,velocity
0,0,0.3125,2,0,1,0.538462,0,84,0.086957,0.833333,0.434673,0.0,0.504068
1,1,0.0875,0,0,0,0.897436,0,171,0.043478,0.0,0.001731,0.0,0.504068
2,2,0.075,2,3,1,0.897436,1,171,0.782609,0.5,0.0,0.0,0.504068


Splitting and saving data...
Splitting data for fraud_data...
Saving split datasets for fraud_data...
Data processing complete.


In [10]:
processor.process_creditcard_data()

Processing credit card data...
Loading data from ../Data/processed/cleaned_credit_df.csv...
Normalizing features V1 to V28 and Amount...
First 3 rows of transformed credit card data:


Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,0.0,0.432237,0.469159,0.822691,0.725409,0.442469,0.638949,0.551394,0.518831,0.578229,0.533324,0.408293,0.323307,0.309453,0.406192,0.791061,0.374109,0.57077,0.505937,0.610117,0.710742,0.501654,0.566628,0.416407,0.508551,0.541883,0.435634,0.690431,0.435569,0.005824,0
1,1,0.0,0.812012,0.52966,0.513267,0.579162,0.518961,0.522077,0.480501,0.512471,0.453398,0.468175,0.76976,0.734555,0.591806,0.451777,0.622034,0.609963,0.479341,0.453645,0.459868,0.478278,0.376515,0.352628,0.587539,0.380255,0.556338,0.574536,0.470241,0.503803,0.000105,0
2,2,1.0,0.432453,0.242927,0.723051,0.56841,0.410807,0.926031,0.674268,0.58851,0.19954,0.562856,0.604716,0.490413,0.635327,0.445739,0.969237,0.493148,0.826268,0.469147,0.499637,0.909169,0.66228,0.681937,0.505845,0.270044,0.371191,0.457689,0.398612,0.361762,0.014739,0


Splitting and saving data...
Splitting data for creditcard_data...
Saving split datasets for creditcard_data...
Data processing complete.


In [11]:
np.load("../Data/processed/creditcard_data_X_val.npy").shape

(42559, 31)