In [1]:
# Basic data manipulation
import pandas as pd
import numpy as np
import random

# Visualization
import matplotlib.pyplot as plt

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer

# Additional encoders
import category_encoders as ce

# Stats
import scipy.stats as stats

# Dimensionality reduction
from sklearn.decomposition import (
    PCA,
    KernelPCA, 
    FastICA,
    TruncatedSVD
)

### First, merge the two tables (transactions and identity) into one

In [None]:
# Read the two raw tables from disk.
train_transaction = pd.read_csv("data/train_transaction.csv")
train_identity = pd.read_csv("data/train_identity.csv")

# Left join on 'TransactionID': every transaction row is kept, and identity
# columns are NaN for transactions without a matching identity row.
train = train_transaction.merge(train_identity, how="left", on="TransactionID")

n_rows, n_columns = train.shape
print(f"Rows in merged training set: {n_rows}")
print(f"Columns in merged training set: {n_columns}")
train.head()

### Perform an initial exploratory data analysis (EDA) by checking missing value percentages and examining the target distribution.

In [None]:
# Lift pandas' display truncation for both columns and rows so the wide
# frame and the long per-column reports below are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

Identifying the target variable distribution and missing values for features

In [None]:
# Percentage of missing entries per column, largest first.
missing_percent = (
    train.isnull()
    .sum()
    .div(len(train))
    .mul(100)
    .sort_values(ascending=False)
)
print("Missing percentages per column:")
print(missing_percent.loc[missing_percent > 0])

# Bar chart of the class counts of the target variable 'isFraud'.
ax = train['isFraud'].value_counts().plot(kind='bar')
ax.set_title("Distribution of Fraudulent vs Non-Fraudulent Transactions")
ax.set_xlabel("isFraud")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Columns where more than 90% of the rows are missing, in original column order.
null_cols = train.columns[train.isna().sum() > 0.9 * len(train)].tolist()
null_cols

In [None]:
# Work on a deep copy so the indicator columns never leak into `train`.
missing_df = train.copy(deep=True)
for col in null_cols:
    flag_name = "m_flag_" + col
    # 1 where the original value is missing, 0 otherwise.
    missing_df[flag_name] = np.where(missing_df[col].isnull(), 1, 0)
    # Does "value is missing" by itself correlate with the target?
    correlation = missing_df[[flag_name, 'isFraud']].corr()
    print(correlation)


In [None]:
# For every object/category column print its name, the number of distinct
# entries (as counted by a Python set) and the distinct entries themselves.
categorical_features = train.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    distinct_values = set(train[col])
    print(col, len(distinct_values), distinct_values)

### Find truly categorical values

Screen resolution values are true numerical values, while all other features are categorical in nature

In [None]:
# 'id_33' holds strings split here on 'x'; the two parts become numeric
# width and height columns, after which the raw string column is dropped.
screen_dims = train['id_33'].str.split('x', expand=True).astype(float)
train['Screen_Width'] = screen_dims[0]
train['Screen_Height'] = screen_dims[1]
train = train.drop(columns='id_33')

In [None]:
# Columns stored as strings/categories are categorical by dtype.
cat_cols = train.select_dtypes(include=['object', 'category']).columns

# Numeric columns with fewer distinct non-null values than this threshold
# are treated as candidate categorical features.
unique_threshold = 20

# Distinct-value count of every int64/float64 column, computed in one call.
numeric_unique_counts = train.select_dtypes(include=['int64', 'float64']).nunique()

# Keep the low-cardinality columns; the target itself is never a feature.
candidate_categorical = {
    col: unique_vals
    for col, unique_vals in numeric_unique_counts.items()
    if unique_vals < unique_threshold and col != "isFraud"
}

# Report the candidates
print("Candidate categorical features (numeric columns with few unique values):")
for col, count in candidate_categorical.items():
    print(f"{col}: {count} unique values")

### Imputing nulls

Not using standard imputation:
1. Filled missing numeric values with zero, which acts as a missing-value indicator for features that contain no zeros anywhere else
2. Added 'missing' instead of null for categorical values to keep all the columns

In [None]:
# Identify numeric and categorical columns.
num_cols = train.select_dtypes(include=['int64', 'float64']).columns

# Categorical = string/category-typed columns plus the low-cardinality numeric
# candidates. dict.fromkeys de-duplicates while keeping first-seen order; the
# previous set-based union produced a hash-dependent order, which made the
# column order of the saved preprocessed files vary between kernel runs.
cat_cols = list(dict.fromkeys([*cat_cols, *candidate_categorical]))

# Everything numeric that is neither categorical, the row id, nor the target.
excluded_cols = set(cat_cols) | {"TransactionID", "isFraud"}
num_cols = [col for col in num_cols if col not in excluded_cols]

# Imputation for numeric columns using zeros as indicator
num_imputer = SimpleImputer(strategy='constant', fill_value=0)
train[num_cols] = num_imputer.fit_transform(train[num_cols])

# Imputation for categorical columns using a constant value, so that missing
# entries become their own 'missing' category instead of being dropped.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
train[cat_cols] = cat_imputer.fit_transform(train[cat_cols])

# Confirm that no missing values remain (or check overall missing count)
print("Total missing values after imputation:", train.isnull().sum().sum())

### Data encoding

In [None]:
# Target vector, and the feature matrix without the target and the row identifier.
y = train.loc[:, 'isFraud']
X = train.drop(["isFraud", "TransactionID"], axis="columns")

### Encoding categorical features

Split into train and test sets as early as possible, so that the encoder and scaler below are fitted on the training part only

In [None]:
# Stratified 80/20 hold-out split with a fixed seed: `stratify=y` keeps the
# class proportions of `y` the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

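Using a less common technique: a WOE (Weight of Evidence) encoder, which keeps one column per categorical feature and so avoids the enormous dimensionality that one-hot encoding would create for SVC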

In [None]:
# Weight-of-Evidence encoding of the categorical columns. The encoder is fitted
# on the training rows and their labels only; the test rows are transformed
# with the already-fitted encoder.
train_categoricals = X_train[cat_cols]
test_categoricals = X_test[cat_cols]

encoder_high = ce.WOEEncoder(cols=cat_cols)
X_train_encoded_cat = encoder_high.fit_transform(train_categoricals, y_train)
X_test_encoded_cat = encoder_high.transform(test_categoricals)

### Data normalization

Scale the numeric features so that differences in feature scale do not distort the SVC

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training rows
# only (fit_transform) and then re-used unchanged on the test rows (transform),
# so no statistics of the test split enter the fitted scaler.
scaler = StandardScaler()
X_train_scaled_num = scaler.fit_transform(X_train[num_cols])
X_test_scaled_num = scaler.transform(X_test[num_cols])

In [None]:
# Put the encoded categorical block and the scaled numeric block side by side.
# The column labels follow the same order: categorical names first, numeric second.
feature_names = cat_cols + num_cols

X_train_scaled = pd.DataFrame(
    np.concatenate([X_train_encoded_cat.values, X_train_scaled_num], axis=1),
    columns=feature_names,
)
X_test_scaled = pd.DataFrame(
    np.concatenate([X_test_encoded_cat.values, X_test_scaled_num], axis=1),
    columns=feature_names,
)

In [None]:
# Sanity check: both frames should report the same number of columns.
for split_name, frame in (("training", X_train_scaled), ("test", X_test_scaled)):
    print(f"Encoded {split_name} set shape:", frame.shape)

In [None]:
# Preview the first rows of the assembled training frame.
X_train_scaled.head()

### Save data

In [None]:
# Persist the preprocessed features and the targets as CSV files, without the
# index column, so the modelling cells can start from these files.
outputs = {
    "data/preprocessed_train.csv": X_train_scaled,
    "data/preprocessed_test.csv": X_test_scaled,
    "data/y_train.csv": y_train,
    "data/y_test.csv": y_test,
}
for output_path, data in outputs.items():
    data.to_csv(output_path, index=False)

### Baseline catboost

Load the preprocessed data saved above

In [2]:
# Read back the four CSV files written by the preprocessing section.
# Every object, including the two targets, comes back as a DataFrame.
X_train_scaled, X_test_scaled, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/preprocessed_train.csv",
        "data/preprocessed_test.csv",
        "data/y_train.csv",
        "data/y_test.csv",
    )
)

Account for the severe class imbalance by computing balanced class weights

In [3]:
# Convert y_train to numpy array (since it's read as DataFrame)
y_train_array = y_train['isFraud'].values

# Get unique classes and compute weights
unique_classes = np.unique(y_train_array)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_classes,
    y=y_train_array
)

# Create dictionary of class weights
class_weights_dict = dict(zip(unique_classes, class_weights))
class_weights_dict = {int(k): float(v) for k, v in class_weights_dict.items()}

PCA

In [4]:
# Project the features onto 30 principal components. The projection is fitted
# on the training features only and then applied unchanged to the test features.
# random_state is pinned (like the other reducers in this notebook) so that a
# randomized SVD solver, if scikit-learn selects one, gives the same result on every run.
# NOTE(review): this overwrites X_train_scaled / X_test_scaled in place, so the
# cell is not idempotent; reload the preprocessed data before re-running it.
pca = PCA(n_components=30, random_state=42)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)

Kernel PCA

In [None]:
# RBF-kernel PCA to 30 components: fitted on the training features, then the
# fitted transformer is applied to the test features.
# NOTE(review): this overwrites X_train_scaled / X_test_scaled in place. If an
# earlier reduction cell has already run, this reduces the already-reduced data
# rather than the original features — confirm only one reduction cell is meant
# to be executed per run.
# NOTE(review): kernel PCA works on an n_samples x n_samples kernel matrix, which
# is presumably too large for the full training set — confirm feasibility.
kpca = KernelPCA(n_components=30, kernel='rbf', random_state=42)
X_train_scaled = kpca.fit_transform(X_train_scaled)
X_test_scaled = kpca.transform(X_test_scaled)

Independent Component Analysis (ICA)

In [None]:
# FastICA to 30 independent components: fitted on the training features, then
# the fitted transformer is applied to the test features.
# NOTE(review): this overwrites X_train_scaled / X_test_scaled in place. If an
# earlier reduction cell has already run, this operates on the already-reduced
# data — confirm only one reduction cell is meant to be executed per run.
ica = FastICA(n_components=30, random_state=42)
X_train_scaled = ica.fit_transform(X_train_scaled)
X_test_scaled = ica.transform(X_test_scaled)

Truncated SVD

In [None]:
# Truncated SVD to 30 components: fitted on the training features, then the
# fitted transformer is applied to the test features.
# NOTE(review): this overwrites X_train_scaled / X_test_scaled in place. If an
# earlier reduction cell has already run, this operates on the already-reduced
# data — confirm only one reduction cell is meant to be executed per run.
svd = TruncatedSVD(n_components=30, random_state=42)
X_train_scaled = svd.fit_transform(X_train_scaled)
X_test_scaled = svd.transform(X_test_scaled)

Train baseline model

In [None]:
from catboost import CatBoostClassifier

# The targets were read back as one-column DataFrames; flatten them to 1-D arrays.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

# Baseline model; the class weights computed earlier offset the class imbalance.
model_cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights_dict,
    random_seed=42,
    verbose=100
)
model_cat.fit(X_train_scaled, y_train_flat)
y_pred_cat = model_cat.predict(X_test_scaled)
# The score below is computed on the held-out test split, so it is labelled
# plainly as accuracy (the previous "Training Accuracy" label was misleading).
print("CatBoost Accuracy:", accuracy_score(y_test_flat, y_pred_cat))

0:	learn: 0.6556789	total: 114ms	remaining: 56.9s
100:	learn: 0.4566328	total: 3.24s	remaining: 12.8s
200:	learn: 0.4297046	total: 5.49s	remaining: 8.17s
300:	learn: 0.4090670	total: 7.71s	remaining: 5.1s
400:	learn: 0.3936705	total: 9.92s	remaining: 2.45s
499:	learn: 0.3800877	total: 12.1s	remaining: 0us
CatBoost Accuracy: 0.8490026077827073


### GA for input features subset selection

In [None]:
def run_genetic_algorithm(X_data, y_data, population_size=30, n_generations=5, subset_size=20,
                          fitness_fn=None):
    """Select a feature subset with a simple genetic algorithm.

    Each individual is a sorted list of ``subset_size`` distinct column
    positions. Every generation keeps the fitter half of the population
    (truncation selection) and refills it with children produced by one-point
    crossover, a 10% chance of a single-gene mutation, and a repair step that
    restores ``subset_size`` distinct indices.

    Parameters
    ----------
    X_data : pandas.DataFrame or 2-D array
        Feature matrix. Columns are selected positionally (``.iloc`` when
        available, plain ``[:, idx]`` indexing otherwise).
    y_data : array-like
        Targets, passed through to the fitness function unchanged.
    population_size : int
        Number of individuals per generation (at least 1).
    n_generations : int
        Number of selection/reproduction rounds.
    subset_size : int
        Number of features per individual (1 <= subset_size <= n_features).
    fitness_fn : callable, optional
        ``fitness_fn(X_subset, y_data) -> float``, higher is better. Defaults to
        the mean 3-fold cross-validated accuracy of an RBF ``SVC``.

    Returns
    -------
    list of int
        Sorted column positions of the fittest individual in the final population.

    Raises
    ------
    ValueError
        If ``subset_size`` or ``population_size`` is out of range.
    """
    n_features = X_data.shape[1]
    if not 1 <= subset_size <= n_features:
        raise ValueError(
            f"subset_size must be between 1 and the number of features ({n_features}), "
            f"got {subset_size}"
        )
    if population_size < 1:
        raise ValueError(f"population_size must be at least 1, got {population_size}")

    mutation_rate = 0.1

    def default_fitness(X_subset, y):
        # Mean cross-validation accuracy of an RBF SVC on the candidate subset.
        scores = cross_val_score(SVC(kernel="rbf"), X_subset, y, cv=3, scoring='accuracy')
        return scores.mean()

    score_subset = default_fitness if fitness_fn is None else fitness_fn

    def take_columns(individual):
        # Positional column selection for DataFrames and NumPy arrays alike.
        if hasattr(X_data, "iloc"):
            return X_data.iloc[:, individual]
        return X_data[:, individual]

    # Survivors are carried over unchanged between generations, so caching the
    # score per subset avoids re-evaluating the same individual.
    fitness_cache = {}

    def fitness(individual):
        key = tuple(individual)
        if key not in fitness_cache:
            fitness_cache[key] = score_subset(take_columns(individual), y_data)
        return fitness_cache[key]

    # Crossover combines genes from both parents at a random cut point
    def crossover(p1, p2):
        if subset_size < 2:
            # No interior cut point exists for a single gene.
            return list(p1)
        cut = random.randint(1, subset_size - 1)
        return p1[:cut] + p2[cut:]

    # Mutation replaces one random gene with a random feature index
    def mutation(child):
        if random.random() < mutation_rate:
            child[random.randrange(subset_size)] = random.randrange(n_features)
        return child

    def repair(child):
        # Crossover/mutation can introduce duplicate indices. Refill only from
        # indices the child does not already contain, so the result is
        # guaranteed to hold exactly subset_size distinct features.
        genes = set(child)
        n_missing = subset_size - len(genes)
        if n_missing:
            unused = [idx for idx in range(n_features) if idx not in genes]
            genes.update(random.sample(unused, n_missing))
        return sorted(genes)

    # Initialize population - each individual is a sorted list of feature indices
    population = [
        sorted(random.sample(range(n_features), subset_size))
        for _ in range(population_size)
    ]

    n_survivors = max(1, population_size // 2)
    for _ in range(n_generations):
        # Selection: truncation selection (fitter half survives)
        ranked = sorted(population, key=fitness, reverse=True)
        survivors = ranked[:n_survivors]

        # Refill the population with repaired children of random survivor pairs
        next_population = [list(individual) for individual in survivors]
        while len(next_population) < population_size:
            p1 = random.choice(survivors)
            p2 = random.choice(survivors)
            next_population.append(repair(mutation(crossover(p1, p2))))
        population = next_population

    return max(population, key=fitness)

# Run the GA on the training split and keep only the selected feature columns.
# NOTE(review): SVC needs purely numeric input — confirm that X_train / X_test
# hold the encoded and scaled features at this point in the notebook.
best_features = run_genetic_algorithm(X_train, y_train)
X_train_ga = X_train.iloc[:, best_features]
X_test_ga = X_test.iloc[:, best_features]

model = SVC(kernel="rbf")
model.fit(X_train_ga, y_train)
# Predict on the same feature subset the model was fitted on; predicting on the
# full X_test would present a different number of columns than the model saw in fit.
y_pred = model.predict(X_test_ga)
print("Model accuracy with selected features:", accuracy_score(y_test, y_pred))

### PSO and ACO for catboost hyperparameter tuning

In [None]:
def run_pso_for_catboost_hyperparams(X_data, y_data, n_particles=10, n_iterations=5,
                                     fitness_fn=None):
    """Tune CatBoost hyperparameters with particle swarm optimisation (PSO).

    Each particle is a dict with the keys ``learning_rate``, ``depth``,
    ``l2_leaf_reg`` and ``iterations``. Particles move under the usual PSO
    velocity rule (inertia 0.5, cognitive and social factors 1.0) and are
    clamped to the search bounds after every move; ``depth`` and
    ``iterations`` are rounded to the nearest integer.

    Parameters
    ----------
    X_data, y_data
        Training features and targets, passed to the default fitness function.
    n_particles : int
        Swarm size.
    n_iterations : int
        Number of swarm updates after the initial evaluation.
    fitness_fn : callable, optional
        ``fitness_fn(params) -> float``, higher is better. Defaults to the mean
        3-fold cross-validated accuracy of a ``CatBoostClassifier`` built from
        ``params`` with the notebook-level ``class_weights_dict``.

    Returns
    -------
    dict or None
        Best parameter set found, or ``None`` when ``n_particles`` is 0.
    """
    # Define parameter search spaces
    param_bounds = {
        'learning_rate': (0.01, 0.3),
        'depth': (3, 10),
        'l2_leaf_reg': (1.0, 10.0),
        'iterations': (100, 1000)
    }
    integer_params = ('depth', 'iterations')

    def clamp(param, value):
        # Keep the value inside its bounds. Integer parameters are rounded
        # rather than truncated, so small positive velocities are not
        # systematically discarded.
        low, high = param_bounds[param]
        value = max(low, min(high, value))
        return int(round(value)) if param in integer_params else value

    def default_fitness(params):
        model = CatBoostClassifier(
            **params,
            class_weights=class_weights_dict,
            random_seed=42,
            verbose=False
        )
        scores = cross_val_score(model, X_data, y_data, cv=3, scoring='accuracy')
        return scores.mean()

    evaluate = default_fitness if fitness_fn is None else fitness_fn

    # Initialize particles uniformly at random inside the bounds
    positions = [
        {param: clamp(param, random.uniform(*bounds)) for param, bounds in param_bounds.items()}
        for _ in range(n_particles)
    ]
    velocities = [{param: 0.0 for param in param_bounds} for _ in range(n_particles)]

    # Personal bests must be independent copies: a shallow list copy would make
    # each personal best alias the live position dict, so it would move along
    # with the particle and the cognitive term would always be zero.
    pbest_positions = [position.copy() for position in positions]
    pbest_scores = [float("-inf")] * n_particles
    gbest_position = None
    # Start below any reachable score so the first particle always sets the global best.
    gbest_score = float("-inf")

    # Evaluate initial fitness
    for i in range(n_particles):
        score = evaluate(positions[i])
        pbest_scores[i] = score
        if score > gbest_score:
            gbest_score = score
            gbest_position = positions[i].copy()

    # Main PSO loop
    w, c1, c2 = 0.5, 1.0, 1.0
    for _ in range(n_iterations):
        for i in range(n_particles):
            r1, r2 = random.random(), random.random()

            # Update velocity and position for each parameter
            for param in param_bounds:
                current = positions[i][param]
                velocities[i][param] = (
                    w * velocities[i][param]
                    + c1 * r1 * (pbest_positions[i][param] - current)
                    + c2 * r2 * (gbest_position[param] - current)
                )
                positions[i][param] = clamp(param, current + velocities[i][param])

            # Evaluate fitness and update personal / global bests
            score = evaluate(positions[i])
            if score > pbest_scores[i]:
                pbest_scores[i] = score
                pbest_positions[i] = positions[i].copy()
                if score > gbest_score:
                    gbest_score = score
                    gbest_position = positions[i].copy()

    return gbest_position

In [None]:
def run_aco_for_catboost_hyperparams(X_data, y_data, n_ants=10, n_iterations=5,
                                     fitness_fn=None):
    """Tune CatBoost hyperparameters with a simple ant colony optimisation (ACO).

    Each parameter range is split into 10 equal-width bins. Every ant picks one
    bin per parameter with probability proportional to that bin's pheromone
    level and is scored on the bin-centre values. After each iteration all
    pheromones evaporate by 10% and every ant deposits its score on the bins
    it actually chose.

    Parameters
    ----------
    X_data, y_data
        Training features and targets, passed to the default fitness function.
    n_ants : int
        Number of candidate solutions sampled per iteration.
    n_iterations : int
        Number of sample / evaporate / deposit rounds.
    fitness_fn : callable, optional
        ``fitness_fn(params) -> float``, higher is better. Defaults to the mean
        3-fold cross-validated accuracy of a ``CatBoostClassifier`` built from
        ``params`` with the notebook-level ``class_weights_dict``.

    Returns
    -------
    dict or None
        Best parameter set found, or ``None`` if no solution was evaluated.
    """
    # Define parameter spaces and bins
    n_bins = 10
    evaporation_rate = 0.9
    param_bounds = {
        'learning_rate': (0.01, 0.3),
        'depth': (3, 10),
        'l2_leaf_reg': (1.0, 10.0),
        'iterations': (100, 1000)
    }
    integer_params = ('depth', 'iterations')

    # Initialize pheromone levels for each parameter
    pheromones = {param: [1.0] * n_bins for param in param_bounds}

    def bin_to_value(bin_index, param):
        # Centre of the bin; integer parameters are truncated to int.
        min_val, max_val = param_bounds[param]
        step_size = (max_val - min_val) / n_bins
        val = min_val + bin_index * step_size + (step_size / 2)
        return int(val) if param in integer_params else val

    def default_fitness(params):
        model = CatBoostClassifier(
            **params,
            class_weights=class_weights_dict,
            random_seed=42,
            verbose=False
        )
        scores = cross_val_score(model, X_data, y_data, cv=3, scoring='accuracy')
        return scores.mean()

    evaluate = default_fitness if fitness_fn is None else fitness_fn

    # Start below any reachable score so the first solution always becomes the best.
    best_score = float("-inf")
    best_params = None

    for _ in range(n_iterations):
        solutions = []

        # Generate one solution per ant
        for _ant in range(n_ants):
            chosen_bins = {
                param: random.choices(range(n_bins), weights=pheromones[param], k=1)[0]
                for param in param_bounds
            }
            current_params = {
                param: bin_to_value(bin_idx, param)
                for param, bin_idx in chosen_bins.items()
            }
            score = evaluate(current_params)
            solutions.append((current_params, chosen_bins, score))

        # Evaporate pheromones
        for param in pheromones:
            pheromones[param] = [level * evaporation_rate for level in pheromones[param]]

        # Update the best solution and deposit pheromones
        for params, chosen_bins, score in solutions:
            if score > best_score:
                best_score = score
                best_params = params.copy()

            # Deposit on the bins the ant actually sampled. Recomputing the bin
            # from the (truncated) integer value can land on a neighbouring bin
            # and reinforce the wrong choice. Deposits are floored at zero so
            # that pheromone levels stay valid sampling weights.
            deposit = max(score, 0.0)
            for param, bin_idx in chosen_bins.items():
                pheromones[param][bin_idx] += deposit

    return best_params

Example usage

In [None]:
def _fit_tuned_catboost(tuner, label):
    """Tune with `tuner`, refit CatBoost on the training data and print its test accuracy.

    Returns the tuned parameters, the fitted model and its test predictions.
    """
    tuned_params = tuner(X_train_scaled, y_train_flat)
    tuned_model = CatBoostClassifier(**tuned_params, class_weights=class_weights_dict, random_seed=42)
    tuned_model.fit(X_train_scaled, y_train_flat)
    test_predictions = tuned_model.predict(X_test_scaled)
    print(f"{label}-tuned CatBoost Accuracy:", accuracy_score(y_test_flat, test_predictions))
    return tuned_params, tuned_model, test_predictions


# Particle swarm optimisation first, then ant colony optimisation.
best_params_pso, model_cat_pso, y_pred_cat_pso = _fit_tuned_catboost(
    run_pso_for_catboost_hyperparams, "PSO"
)
best_params_aco, model_cat_aco, y_pred_cat_aco = _fit_tuned_catboost(
    run_aco_for_catboost_hyperparams, "ACO"
)