In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import random

First, merge the transaction and identity tables into a single DataFrame.

In [2]:
# Read the two raw CSV tables.
train_transaction = pd.read_csv("data/train_transaction.csv")
train_identity = pd.read_csv("data/train_identity.csv")

# Left join on 'TransactionID': every transaction row is kept, and the
# identity columns are NaN wherever no identity row matches.
train = train_transaction.merge(train_identity, how="left", on="TransactionID")

print(f"Rows in merged training set: {train.shape[0]}")
print(f"Columns in merged training set: {train.shape[1]}")
train.head()



### Perform an initial exploratory data analysis (EDA) by checking missing value percentages and examining the target distribution.

In [3]:
# Lift pandas' display truncation so that all columns and all rows are shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

Identify the distribution of the target variable and the share of missing values per feature.

In [4]:
# Share of missing values per column, in percent, largest first
n_rows = len(train)
missing_percent = (
    train.isna().sum().div(n_rows).mul(100).sort_values(ascending=False)
)
print("Missing percentages per column:")
# Only columns that actually contain missing values are listed
print(missing_percent.loc[missing_percent > 0])

# Bar chart of the class counts of the target variable 'isFraud'
fraud_counts = train['isFraud'].value_counts()
fraud_counts.plot.bar()
plt.title("Distribution of Fraudulent vs Non-Fraudulent Transactions")
plt.xlabel("isFraud")
plt.ylabel("Count")
plt.show()



In [5]:
# Columns whose NaN count exceeds 90% of the number of rows
nan_threshold = 0.9 * len(train)
nan_counts = train.isna().sum()
null_cols = nan_counts[nan_counts > nan_threshold].index.tolist()
null_cols

In [6]:
# Work on a deep copy so the helper flag columns never end up in `train`.
missing_df = train.copy(deep=True)
for col in null_cols:
    flag_col = "m_flag_" + col
    # 1 where the value is missing, 0 where it is present
    missing_df[flag_col] = np.where(missing_df[col].isnull(), 1, 0)
    # 2x2 correlation matrix between the missingness flag and the target
    correlation = missing_df[[flag_col, 'isFraud']].corr()
    print(correlation)




In [7]:
# For every object/category column print: name, number of distinct values
# (as collected by set()), and the distinct values themselves.
categorical_features = train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    distinct_values = set(train[col])
    print(col, len(distinct_values), distinct_values)



### Find truly categorical values

Screen resolution values are genuinely numerical, while all other features are categorical in nature.

In [8]:
# Split the 'id_33' strings on 'x' and store the two parts as float columns,
# then drop the original string column.
# NOTE: this cell is not re-runnable on its own, because 'id_33' no longer
# exists in `train` after the drop.
resolution_parts = train['id_33'].str.split('x', expand=True).astype(float)
train[['Screen_Width', 'Screen_Height']] = resolution_parts
train = train.drop(columns='id_33')

In [9]:
# Columns that are categorical by dtype
cat_cols = train.select_dtypes(include=['object', 'category']).columns

# A numeric column with fewer distinct values than this threshold is treated
# as a candidate categorical feature.
unique_threshold = 20

# Distinct-value count of every int64/float64 column except the target
numeric_columns = train.select_dtypes(include=['int64', 'float64']).columns
unique_counts = {
    col: train[col].nunique() for col in numeric_columns if col != "isFraud"
}

# Keep only the low-cardinality columns, mapped to their distinct-value count
candidate_categorical = {
    col: n_unique
    for col, n_unique in unique_counts.items()
    if n_unique < unique_threshold
}

# Report the candidates
print("Candidate categorical features (numeric columns with few unique values):")
for col, count in candidate_categorical.items():
    print(f"{col}: {count} unique values")



### Imputing nulls

Not using standard imputation:
1. placed zero as the indicator for missing values in features that contain no zero values anywhere else
2. added a binary "missing_..." column for the other features with missing values

In [10]:
# Identify numeric and categorical columns
num_cols = train.select_dtypes(include=['int64', 'float64']).columns
# Categorical = object/category columns plus the low-cardinality numeric
# candidates. The union is sorted so that the column order is identical on
# every run; iterating a set of strings is not order-stable across
# interpreter sessions, which would otherwise shuffle the columns of the
# saved matrices and the meaning of positional feature indices.
cat_cols = sorted(set(cat_cols).union(candidate_categorical.keys()))
cat_col_lookup = set(cat_cols)  # set membership instead of scanning the list
num_cols = [
    col
    for col in num_cols
    if col not in cat_col_lookup and col not in ("TransactionID", "isFraud")
]

# Numeric columns: fill missing values with the constant 0
num_imputer = SimpleImputer(strategy='constant', fill_value=0)
train[num_cols] = num_imputer.fit_transform(train[num_cols])

# Categorical columns: fill missing values with the constant string 'missing'
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
train[cat_cols] = cat_imputer.fit_transform(train[cat_cols])

# Confirm that no missing values remain (or check overall missing count)
print("Total missing values after imputation:", train.isnull().sum().sum())



### Data encoding

In [11]:
# Target vector: the 'isFraud' column
y = train['isFraud']
# Feature matrix: every column except the target and the 'TransactionID' key
X = train.drop(columns=["isFraud", "TransactionID"])

### Encoding categorical features

Split into train and test sets as early as possible.

In [12]:
# Hold out 20% of the rows as the test set. `stratify=y` makes the split
# stratified on the target, and the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Using something novel: a WOE encoder, to compensate for the enormous dimensionality the SVC would otherwise have to deal with.

In [13]:
# Fit the WOE encoder on the training categoricals together with the training
# labels, then apply the already-fitted encoder to the test categoricals.
encoder_high = ce.WOEEncoder(cols=cat_cols)
train_categoricals = X_train[cat_cols]
test_categoricals = X_test[cat_cols]
X_train_encoded_cat = encoder_high.fit_transform(train_categoricals, y_train)
X_test_encoded_cat = encoder_high.transform(test_categoricals)

### Data normalization

Scale the data to remove any scale discrepancies between features for the SVC.

In [14]:
# Fit the scaler on the numeric training columns only, then reuse the fitted
# scaler for the numeric test columns.
scaler = StandardScaler()
train_numerics = X_train[num_cols]
test_numerics = X_test[num_cols]
X_train_scaled_num = scaler.fit_transform(train_numerics)
X_test_scaled_num = scaler.transform(test_numerics)

In [15]:
def _combine_encoded_and_scaled(encoded_cat, scaled_num):
    """Put the encoded categorical block to the left of the scaled numeric block.

    `encoded_cat` is a DataFrame (its `.values` are used) and `scaled_num` is
    the array returned by the scaler. The result is a new DataFrame whose
    columns are labelled `cat_cols + num_cols`.
    """
    side_by_side = np.concatenate([encoded_cat.values, scaled_num], axis=1)
    return pd.DataFrame(side_by_side, columns=cat_cols + num_cols)


X_train_scaled = _combine_encoded_and_scaled(X_train_encoded_cat, X_train_scaled_num)
X_test_scaled = _combine_encoded_and_scaled(X_test_encoded_cat, X_test_scaled_num)

In [16]:
# Report (rows, columns) of both assembled matrices
for shape_label, matrix in (
    ("Encoded training set shape:", X_train_scaled),
    ("Encoded test set shape:", X_test_scaled),
):
    print(shape_label, matrix.shape)



In [17]:
# Preview the first five rows of the assembled training matrix
X_train_scaled.head(n=5)

### Save data

In [18]:
# Write both feature matrices to CSV without the index column.
for matrix, csv_path in (
    (X_train_scaled, "data/preprocessed_train.csv"),
    (X_test_scaled, "data/preprocessed_test.csv"),
):
    matrix.to_csv(csv_path, index=False)

### Baseline SVC

# Baseline: RBF-kernel SVC trained on the fully numeric matrices (encoded
# categoricals + scaled numerics). The raw X_train / X_test splits must not be
# used here: their categorical columns still hold strings (including the
# 'missing' fill value), which SVC cannot fit on.
model = SVC(kernel="rbf")
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print("Baseline model Accuracy:", accuracy_score(y_test, y_pred))

### GA for input features subset selection

In [None]:
def run_genetic_algorithm(X_data, y_data, population_size=30, n_generations=5, subset_size=20,
                          mutation_rate=0.1, fitness_fn=None, random_state=None):
    """Select a fixed-size subset of feature columns with a genetic algorithm.

    Each individual is a sorted list of `subset_size` distinct column
    positions of `X_data`. Every generation the better half of the population
    survives (truncation selection) and the rest is refilled with children
    produced by one-point crossover followed by an optional mutation.

    Parameters
    ----------
    X_data : DataFrame-like
        Must provide `.shape[1]` and positional column selection via `.iloc`.
    y_data :
        Targets, passed through unchanged to the fitness function.
    population_size : int
        Number of individuals per generation (at least 2).
    n_generations : int
        Number of selection/reproduction rounds.
    subset_size : int
        Number of features per individual (1 .. number of columns).
    mutation_rate : float
        Probability that a child has one of its indices replaced.
    fitness_fn : callable, optional
        `fitness_fn(X_subset, y_data) -> float`, higher is better. Defaults to
        the mean 3-fold cross-validated accuracy of an RBF-kernel SVC. It
        should be deterministic, because scores are cached per subset.
    random_state : int, optional
        Seed for a private random generator; when None the module-level
        `random` state is used.

    Returns
    -------
    list of int
        The best-scoring individual of the final population.

    Raises
    ------
    ValueError
        If `subset_size` or `population_size` is out of range.
    """
    n_features = X_data.shape[1]
    if not 1 <= subset_size <= n_features:
        raise ValueError(
            f"subset_size must be between 1 and {n_features}, got {subset_size}"
        )
    if population_size < 2:
        raise ValueError("population_size must be at least 2")

    rng = random if random_state is None else random.Random(random_state)

    if fitness_fn is None:
        def fitness_fn(X_subset, y):
            # Mean 3-fold cross-validated accuracy of an RBF-kernel SVC
            scores = cross_val_score(SVC(kernel="rbf"), X_subset, y, cv=3, scoring='accuracy')
            return scores.mean()

    # Survivors are carried over unchanged between generations, so caching
    # avoids re-scoring the same subset again and again.
    score_cache = {}

    def fitness(individual):
        key = tuple(individual)
        if key not in score_cache:
            score_cache[key] = fitness_fn(X_data.iloc[:, list(key)], y_data)
        return score_cache[key]

    # Crossover combines genes from both parents at a random cut point
    def crossover(p1, p2):
        if subset_size < 2:
            # No interior cut point exists for a single-gene individual
            return list(p1)
        cut = rng.randint(1, subset_size - 1)
        return p1[:cut] + p2[cut:]

    # Mutation swaps one random gene for an index the child does not hold yet
    def mutation(child):
        if rng.random() < mutation_rate:
            present = set(child)
            unused = [idx for idx in range(n_features) if idx not in present]
            if unused:
                child[rng.randrange(subset_size)] = rng.choice(unused)
        return child

    # Repair drops duplicates and refills only with indices not yet present,
    # so every individual holds exactly `subset_size` distinct features.
    def repair(child):
        genes = set(child)
        shortfall = subset_size - len(genes)
        if shortfall > 0:
            unused = [idx for idx in range(n_features) if idx not in genes]
            genes.update(rng.sample(unused, shortfall))
        return sorted(genes)

    # Initialize population - each individual is a sorted list of feature indices
    population = [
        sorted(rng.sample(range(n_features), subset_size))
        for _ in range(population_size)
    ]

    for _ in range(n_generations):
        # Selection: truncation selection (pick top half as survivors)
        ranked = sorted(population, key=fitness, reverse=True)
        survivors = ranked[: population_size // 2]

        # Refill with children of two randomly chosen survivors
        next_population = [list(ind) for ind in survivors]
        while len(next_population) < population_size:
            p1 = rng.choice(survivors)
            p2 = rng.choice(survivors)
            next_population.append(repair(mutation(crossover(p1, p2))))
        population = next_population

    return max(population, key=fitness)

# Seed the `random` module so the stochastic feature search is repeatable.
random.seed(42)

# The search runs on the fully numeric matrix (encoded categoricals + scaled
# numerics); the raw X_train still holds string categoricals.
best_features = run_genetic_algorithm(X_train_scaled, y_train)
X_train_ga = X_train_scaled.iloc[:, best_features]
X_test_ga = X_test_scaled.iloc[:, best_features]
model = SVC(kernel="rbf")
model.fit(X_train_ga, y_train)
# Predict on the same selected columns the model was trained on
y_pred = model.predict(X_test_ga)
print("Model accuracy with selected features:", accuracy_score(y_test, y_pred))

### PSO and ACO for SVC hyperparameter tuning

In [None]:
def run_pso_for_hyperparams(X_data, y_data, model_class, n_particles=10, n_iterations=5,
                            score_fn=None, random_state=None):
    """Tune `C` and `gamma` with particle swarm optimisation.

    Each particle is a `[C, gamma]` position inside the fixed search box
    C in [0.01, 10.0], gamma in [1e-4, 1.0]. Positions are clamped to the box
    after every move.

    Parameters
    ----------
    X_data, y_data :
        Data handed unchanged to the scoring function.
    model_class : callable
        Called as `model_class(C=..., gamma=..., kernel='rbf')`.
    n_particles : int
        Swarm size (at least 1).
    n_iterations : int
        Number of swarm updates after the initial evaluation.
    score_fn : callable, optional
        `score_fn(candidate, X_data, y_data) -> float`, higher is better.
        Defaults to the mean 3-fold cross-validated accuracy.
    random_state : int, optional
        Seed for a private random generator; when None the module-level
        `random` state is used.

    Returns
    -------
    dict
        `{"C": ..., "gamma": ..., "kernel": "rbf"}` for the best position seen.

    Raises
    ------
    ValueError
        If `n_particles` is smaller than 1.
    """
    if n_particles < 1:
        raise ValueError("n_particles must be at least 1")

    # Define parameter search space for SVC (C and gamma)
    c_min, c_max = 0.01, 10.0
    gamma_min, gamma_max = 1e-4, 1.0
    lower = (c_min, gamma_min)
    upper = (c_max, gamma_max)

    rng = random if random_state is None else random.Random(random_state)

    if score_fn is None:
        def score_fn(candidate, X, y):
            return cross_val_score(candidate, X, y, cv=3, scoring='accuracy').mean()

    def fitness(pos):
        candidate = model_class(C=pos[0], gamma=pos[1], kernel='rbf')
        return score_fn(candidate, X_data, y_data)

    # Random start positions, zero start velocities
    positions = [
        [rng.uniform(lower[d], upper[d]) for d in range(2)]
        for _ in range(n_particles)
    ]
    velocities = [[0.0, 0.0] for _ in range(n_particles)]

    # Personal bests must be independent copies: positions are updated in
    # place below, and a shared inner list would silently drag the personal
    # best along with the particle.
    pbest_scores = [fitness(pos) for pos in positions]
    pbest_positions = [pos[:] for pos in positions]

    # The global best starts from the best initial particle, so it is always
    # defined, even when every score is zero or negative.
    leader = max(range(n_particles), key=pbest_scores.__getitem__)
    gbest_score = pbest_scores[leader]
    gbest_position = pbest_positions[leader][:]

    # Main PSO loop
    w = 0.5   # inertia
    c1 = 1.0  # cognitive
    c2 = 1.0  # social
    for _ in range(n_iterations):
        for i in range(n_particles):
            # One (r1, r2) pair per particle, shared by both dimensions
            r1, r2 = rng.random(), rng.random()
            for d in range(2):
                # Velocity update
                velocities[i][d] = (
                    w * velocities[i][d]
                    + c1 * r1 * (pbest_positions[i][d] - positions[i][d])
                    + c2 * r2 * (gbest_position[d] - positions[i][d])
                )
                # Position update and boundary check
                moved = positions[i][d] + velocities[i][d]
                positions[i][d] = max(lower[d], min(upper[d], moved))

            # Evaluate fitness
            current_score = fitness(positions[i])
            if current_score > pbest_scores[i]:
                pbest_scores[i] = current_score
                pbest_positions[i] = positions[i][:]
                if current_score > gbest_score:
                    gbest_score = current_score
                    gbest_position = positions[i][:]

    return {"C": gbest_position[0], "gamma": gbest_position[1], "kernel": "rbf"}

# Search C and gamma with PSO on the GA-selected training columns, refit an
# SVC with the winning parameters, and score it on the matching test columns.
best_params = run_pso_for_hyperparams(X_train_ga, y_train, SVC)
model_pso = SVC(**best_params).fit(X_train_ga, y_train)
y_pred_pso = model_pso.predict(X_test_ga)
pso_accuracy = accuracy_score(y_test, y_pred_pso)
print("PSO-tuned model Accuracy:", pso_accuracy)

In [None]:
def run_aco_for_hyperparams(X_data, y_data, model_class, n_ants=10, n_iterations=5,
                            score_fn=None, random_state=None):
    """Tune `C` and `gamma` with a discretised ant colony search.

    Each of the two parameter ranges (C in [0.01, 10.0], gamma in [1e-4, 1.0])
    is cut into 10 equal bins and represented by its bin centre. Every ant
    picks one bin per parameter with probability proportional to the bin's
    pheromone level. After each iteration all pheromones evaporate by 10% and
    every ant deposits its score on the two bins it visited.

    Parameters
    ----------
    X_data, y_data :
        Data handed unchanged to the scoring function.
    model_class : callable
        Called as `model_class(C=..., gamma=..., kernel='rbf')`.
    n_ants : int
        Number of candidate solutions sampled per iteration.
    n_iterations : int
        Number of sample/evaporate/deposit rounds.
    score_fn : callable, optional
        `score_fn(candidate, X_data, y_data) -> float`, higher is better.
        Defaults to the mean 3-fold cross-validated accuracy. It should be
        deterministic, because scores are cached per bin pair.
    random_state : int, optional
        Seed for a private random generator; when None the module-level
        `random` state is used.

    Returns
    -------
    dict
        `{"C": ..., "gamma": ..., "kernel": "rbf"}` for the best bin pair
        seen, or C=1.0, gamma=1e-3 when nothing was evaluated.
    """
    # Define parameter search space for SVC (C and gamma)
    c_min, c_max = 0.01, 10.0
    gamma_min, gamma_max = 1e-4, 1.0

    n_bins = 10
    evaporation = 0.9

    rng = random if random_state is None else random.Random(random_state)

    if score_fn is None:
        def score_fn(candidate, X, y):
            return cross_val_score(candidate, X, y, cv=3, scoring='accuracy').mean()

    # One pheromone row per tuned parameter: row 0 -> C bins, row 1 -> gamma bins
    pheromones = [[1.0] * n_bins for _ in range(2)]

    def bin_to_value(bin_index, min_val, max_val):
        # Centre of the bin
        step_size = (max_val - min_val) / n_bins
        return min_val + bin_index * step_size + (step_size / 2)

    # Only n_bins * n_bins distinct candidates exist, so repeated picks of
    # the same bin pair reuse the stored score instead of re-running scoring.
    score_cache = {}

    def fitness(c_bin, g_bin):
        key = (c_bin, g_bin)
        if key not in score_cache:
            candidate = model_class(
                C=bin_to_value(c_bin, c_min, c_max),
                gamma=bin_to_value(g_bin, gamma_min, gamma_max),
                kernel='rbf',
            )
            score_cache[key] = score_fn(candidate, X_data, y_data)
        return score_cache[key]

    best_score = float("-inf")
    best_params = (1.0, 1e-3)

    for _ in range(n_iterations):
        solutions = []
        for _ant in range(n_ants):
            # Select bins for C, gamma by roulette wheel sampling of pheromones
            c_bin = rng.choices(range(n_bins), weights=pheromones[0], k=1)[0]
            g_bin = rng.choices(range(n_bins), weights=pheromones[1], k=1)[0]
            solutions.append((c_bin, g_bin, fitness(c_bin, g_bin)))

        # Evaporate
        for row in pheromones:
            for b in range(n_bins):
                row[b] *= evaporation

        # Deposit
        for c_bin, g_bin, score in solutions:
            if score > best_score:
                best_score = score
                best_params = (bin_to_value(c_bin, c_min, c_max),
                               bin_to_value(g_bin, gamma_min, gamma_max))
            # Sampling weights must stay non-negative, so a negative score
            # (possible with a custom score_fn) deposits nothing.
            deposit = max(score, 0.0)
            pheromones[0][c_bin] += deposit
            pheromones[1][g_bin] += deposit

    return {"C": best_params[0], "gamma": best_params[1], "kernel": "rbf"}

# Search C and gamma with ACO on the GA-selected training columns, refit an
# SVC with the winning parameters, and score it on the matching test columns.
best_params_aco = run_aco_for_hyperparams(X_train_ga, y_train, SVC)
model_aco = SVC(**best_params_aco).fit(X_train_ga, y_train)
y_pred_aco = model_aco.predict(X_test_ga)
aco_accuracy = accuracy_score(y_test, y_pred_aco)
print("ACO-tuned model Accuracy:", aco_accuracy)

### PSO and ACO for SVC loss function convergence speedup

In [None]:
# Not implemented: speeding up loss-function convergence with PSO/ACO is intentionally skipped.