In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import scipy.stats as stats

In [3]:
train_transaction = pd.read_csv("data/train_transaction.csv")
train_identity = pd.read_csv("data/train_identity.csv")

# Merge both dataframes on 'TransactionID'
train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")

print(f"Rows in merged training set: {train.shape[0]}")
print(f"Columns in merged training set: {train.shape[1]}")

Rows in merged training set: 590540
Columns in merged training set: 434


In [4]:
train.isna().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

In [5]:
print("Percentage of fraud transactions:", (train["isFraud"].sum() / train['isFraud'].count()) * 100)

Percentage of fraud transactions: 3.4990009144173126


In [6]:
null_cols = [col for col in train.columns if train[col].isna().sum() > 0.9 * len(train)]
null_cols

['dist2',
 'D7',
 'id_07',
 'id_08',
 'id_18',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27']

In [7]:
missing_df = train.copy(deep=True)
for col in null_cols:
    missing_df["m_flag_"+col] = np.where(missing_df[col].isnull(), 1, 0)
    correlation = missing_df[["m_flag_"+col, 'isFraud']].corr()
    print(correlation)


              m_flag_dist2   isFraud
m_flag_dist2      1.000000 -0.091096
isFraud          -0.091096  1.000000
           m_flag_D7   isFraud
m_flag_D7   1.000000 -0.164478
isFraud    -0.164478  1.000000
              m_flag_id_07   isFraud
m_flag_id_07      1.000000 -0.024333
isFraud          -0.024333  1.000000
              m_flag_id_08   isFraud
m_flag_id_08      1.000000 -0.024333
isFraud          -0.024333  1.000000
              m_flag_id_18   isFraud
m_flag_id_18      1.000000 -0.074815
isFraud          -0.074815  1.000000
              m_flag_id_21  isFraud
m_flag_id_21       1.00000 -0.02431
isFraud           -0.02431  1.00000
              m_flag_id_22   isFraud
m_flag_id_22      1.000000 -0.024252
isFraud          -0.024252  1.000000
              m_flag_id_23   isFraud
m_flag_id_23      1.000000 -0.024252
isFraud          -0.024252  1.000000
              m_flag_id_24   isFraud
m_flag_id_24      1.000000 -0.024345
isFraud          -0.024345  1.000000
              m_flag_i

In [7]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [8]:
categorical_features = train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    print(col, len(set(train[col])), set(train[col]))

ProductCD 5 {'W', 'H', 'R', 'S', 'C'}
card4 5 {'visa', nan, 'discover', 'american express', 'mastercard'}
card6 5 {'debit or credit', 'credit', nan, 'debit', 'charge card'}
P_emaildomain 60 {'yahoo.co.uk', 'comcast.net', 'icloud.com', 'mac.com', 'netzero.com', 'roadrunner.com', 'web.de', 'yahoo.fr', 'me.com', 'windstream.net', 'gmail.com', 'twc.com', 'live.fr', 'aol.com', 'servicios-ta.com', 'earthlink.net', 'gmx.de', 'cfl.rr.com', 'sbcglobal.net', 'mail.com', 'verizon.net', 'frontiernet.net', 'outlook.es', 'optonline.net', 'hotmail.com', 'yahoo.com.mx', 'frontier.com', nan, 'aim.com', 'protonmail.com', 'gmail', 'hotmail.co.uk', 'yahoo.co.jp', 'q.com', 'ptd.net', 'rocketmail.com', 'live.com', 'bellsouth.net', 'yahoo.de', 'hotmail.es', 'msn.com', 'cableone.net', 'live.com.mx', 'ymail.com', 'embarqmail.com', 'charter.net', 'suddenlink.net', 'yahoo.es', 'yahoo.com', 'juno.com', 'hotmail.fr', 'prodigy.net.mx', 'netzero.net', 'centurylink.net', 'cox.net', 'anonymous.com', 'att.net', 'hotmai

### Data encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['ProductCD', 'card4', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
              'id_12', 'id_16', 'id_27', 'id_28', 'id_29', 'id_35', 'id_36', 'id_37', 'id_38',
              'DeviceType', 'id_23', 'id_34']

for col in label_cols:
    train[col] = train[col].fillna("Missing")  # Handle NaNs
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])


In [10]:
freq_cols = ['P_emaildomain', 'R_emaildomain', 'DeviceInfo']

for col in freq_cols:
    train[col] = train[col].fillna("Missing")
    train[col] = train[col].map(train[col].value_counts())  # Map frequency counts


In [11]:
one_hot_cols = ['id_15', 'M4', 'id_30', 'id_31']

train = pd.get_dummies(train, columns=one_hot_cols, drop_first=True)


In [12]:
train[['Screen_Width', 'Screen_Height']] = train['id_33'].str.split('x', expand=True).astype(float)
train = train.drop(columns=['id_33'])

In [13]:
for col in train.columns:
    if train[col].isna().sum() > 0:
        print(col, end=" ")

card2 card3 card5 addr1 addr2 dist1 dist2 D1 D2 D3 D4 D5 D6 D7 D8 D9 D10 D11 D12 D13 D14 D15 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64 V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80 V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96 V97 V98 V99 V100 V101 V102 V103 V104 V105 V106 V107 V108 V109 V110 V111 V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123 V124 V125 V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137 V138 V139 V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151 V152 V153 V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165 V166 V167 V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179 V180 V181 V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193 V194 V195 V196 V197 V198 V199 V200 V201 V202 V203 

In [14]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31_samsung browser 5.4,id_31_samsung browser 6.2,id_31_samsung browser 6.4,id_31_samsung browser 7.0,id_31_samsung browser generic,id_31_seamonkey,id_31_silk,id_31_waterfox,Screen_Width,Screen_Height
0,2987000,0,86400,68.5,4,13926,,150.0,2,142.0,...,False,False,False,False,False,False,False,False,,
1,2987001,0,86401,29.0,4,2755,404.0,150.0,3,102.0,...,False,False,False,False,False,False,False,False,,
2,2987002,0,86469,59.0,4,4663,490.0,150.0,4,166.0,...,False,False,False,False,False,False,False,False,,
3,2987003,0,86499,50.0,4,18132,567.0,150.0,3,117.0,...,False,False,False,False,False,False,False,False,,
4,2987004,0,86506,50.0,1,4497,514.0,150.0,3,102.0,...,False,True,False,False,False,False,False,False,2220.0,1080.0


In [26]:
train = train.fillna(0) # fill in nulls with zeroes

In [None]:
# Drop columns that might be mostly NaN
drop_cols = [col for col in train.columns if train[col].isna().sum() > 0.9 * len(train)]
train.drop(columns=drop_cols, inplace=True)

# Fill remaining NaNs (simplistic approach)
train.fillna(-999, inplace=True)

# Label-encode some categorical features (example)
cat_cols = ["ProductCD", "card4", "DeviceType"]  # minimal example
for c in cat_cols:
    if c in train.columns:
        train[c] = train[c].astype(str)
        train[c] = LabelEncoder().fit_transform(train[c])

In [27]:
# Target is "isFraud"
X = train.drop(["isFraud", "TransactionID"], axis=1, errors="ignore")
y = train["isFraud"]

# Simple split (no cross-validation for brevity)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Data normalization

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']] = scaler.fit_transform(
    X_train[['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']]
)
X_test[['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']] = scaler.transform(
    X_test[['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']]
)

In [29]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[['TransactionDT', 'id_01', 'id_02', 'id_03', 'id_07', 'id_08']] = scaler.fit_transform(
    X_train[['TransactionDT', 'id_01', 'id_02', 'id_03', 'id_07', 'id_08']]
)
X_test[['TransactionDT', 'id_01', 'id_02', 'id_03', 'id_07', 'id_08']] = scaler.transform(
    X_test[['TransactionDT', 'id_01', 'id_02', 'id_03', 'id_07', 'id_08']]
)

In [30]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train[['TransactionAmt', 'card1', 'card2', 'card3', 'card5']] = scaler.fit_transform(
    X_train[['TransactionAmt', 'card1', 'card2', 'card3', 'card5']]
)
X_test[['TransactionAmt', 'card1', 'card2', 'card3', 'card5']] = scaler.transform(
    X_test[['TransactionAmt', 'card1', 'card2', 'card3', 'card5']]
)

In [31]:
import numpy as np

for col in ['TransactionAmt', 'addr1', 'addr2']:
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [33]:
X_train = X_train.fillna(0)

In [None]:
def run_genetic_algorithm(X_data, y_data):
    """
    Placeholder for GA. Return a subset of feature indices.
    """
    # TODO: Implement population, crossover, mutation, selection, etc.
    # For demonstration, return all features:
    return np.arange(X_data.shape[1])

best_features = run_genetic_algorithm(X_train, y_train)
X_train_ga = X_train.iloc[:, best_features]
X_test_ga = X_test.iloc[:, best_features]

In [None]:
model = SVC(kernel="rbf")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Baseline model Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
def run_pso_for_hyperparams(X_data, y_data, model_class):
    """
    Placeholder for PSO. Return best hyperparams found.
    """
    # TODO: Swarm initialization, velocity updates, fitness evaluation, etc.
    return {"C": 1.0, "kernel": "rbf"}  # example

best_params = run_pso_for_hyperparams(X_train_ga, y_train, SVC)
model_pso = SVC(**best_params)
model_pso.fit(X_train_ga, y_train)
y_pred_pso = model_pso.predict(X_test_ga)
print("PSO-tuned model Accuracy:", accuracy_score(y_test, y_pred_pso))

In [None]:
def run_aco_classification(X_data, y_data):
    """
    Placeholder for ACO. Return final classification model or rules.
    """
    # TODO: ACO logic with ants’ path optimization, pheromone updates, etc.
    # Here we simply return the same baseline model for demonstration:
    return SVC(kernel="rbf")

aco_model = run_aco_classification(X_train_ga, y_train)
aco_model.fit(X_train_ga, y_train)
y_pred_aco = aco_model.predict(X_test_ga)
print("ACO-based model Accuracy:", accuracy_score(y_test, y_pred_aco))