In [10]:
# Basic data manipulation
import pandas as pd
import numpy as np
import random

# Visualization
import matplotlib.pyplot as plt

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer

# Classifiers
from catboost import CatBoostClassifier

# Additional encoders
import category_encoders as ce

# Stats
import scipy.stats as stats

# Dimensionality reduction
from sklearn.decomposition import (
    PCA,
    KernelPCA, 
    FastICA,
    TruncatedSVD
)

### Download all Data

In [None]:
test = pd.read_csv("data/test_for_model_eval.csv")
transaction_ids = test["TransactionID"].astype(int)
test = test.drop(columns=["TransactionID"])

In [17]:
X_train_scaled = pd.read_csv("data/preprocessed_train.csv")
X_test_scaled = pd.read_csv("data/preprocessed_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")

### Get weights

In [18]:
# Convert y_train to numpy array (since it's read as DataFrame)
y_train_array = y_train['isFraud'].values

# Get unique classes and compute weights
unique_classes = np.unique(y_train_array)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_classes,
    y=y_train_array
)

# Create dictionary of class weights
class_weights_dict = dict(zip(unique_classes, class_weights))
class_weights_dict = {int(k): float(v) for k, v in class_weights_dict.items()}

### 1. All Features

In [19]:
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

model_cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights_dict,
    random_seed=42,
    verbose=100
)
model_cat.fit(X_train_scaled, y_train_flat)
y_pred_cat = model_cat.predict(X_test_scaled)

print("CatBoost Training Accuracy:", accuracy_score(y_test_flat, y_pred_cat))

0:	learn: 0.6495989	total: 148ms	remaining: 1m 13s
100:	learn: 0.3757994	total: 14.5s	remaining: 57.5s
200:	learn: 0.3300922	total: 28.7s	remaining: 42.7s
300:	learn: 0.2994735	total: 43.1s	remaining: 28.5s
400:	learn: 0.2770163	total: 59.4s	remaining: 14.7s
499:	learn: 0.2581595	total: 1m 15s	remaining: 0us
CatBoost Training Accuracy: 0.916229213939784


In [62]:
y_pred_test = model_cat.predict_proba(test)
fraud_probs = y_pred_test[:, 1]

In [64]:
submission_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'isFraud': fraud_probs
})

In [65]:
submission_df.to_csv('submissions/all_features.csv', index=False)

#### On test set: 0.889037  &   0.922709

### 2. PCA

In [None]:
X_train_scaled = pd.read_csv("data/preprocessed_train.csv")
X_test_scaled = pd.read_csv("data/preprocessed_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")


test = pd.read_csv("data/test_for_model_eval.csv")
transaction_ids = test["TransactionID"].astype(int)
test = test.drop(columns=["TransactionID"])


pca = PCA(n_components=30)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)


test = pca.transform(test)


y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

model_cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights_dict,
    random_seed=42,
    verbose=100
)
model_cat.fit(X_train_scaled, y_train_flat)
y_pred_cat = model_cat.predict(X_test_scaled)

print("CatBoost Training Accuracy:", accuracy_score(y_test_flat, y_pred_cat))


y_pred_test = model_cat.predict_proba(test)
fraud_probs = y_pred_test[:, 1]


submission_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'isFraud': fraud_probs
})

submission_df.to_csv('submissions/pca.csv', index=False)

#### On test set: 0.821966  &  0.872178

### 3. ICA

In [76]:
X_train_scaled = pd.read_csv("data/preprocessed_train.csv")
X_test_scaled = pd.read_csv("data/preprocessed_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")


test = pd.read_csv("data/test_for_model_eval.csv")
transaction_ids = test["TransactionID"].astype(int)
test = test.drop(columns=["TransactionID"])


ica = FastICA(n_components=30, random_state=42)
X_train_scaled = ica.fit_transform(X_train_scaled)
X_test_scaled = ica.transform(X_test_scaled)


test = ica.transform(test)


y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

model_cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights_dict,
    random_seed=42,
    verbose=100
)
model_cat.fit(X_train_scaled, y_train_flat)
y_pred_cat = model_cat.predict(X_test_scaled)

print("CatBoost Training Accuracy:", accuracy_score(y_test_flat, y_pred_cat))


y_pred_test = model_cat.predict_proba(test)
fraud_probs = y_pred_test[:, 1]


submission_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'isFraud': fraud_probs
})

submission_df.to_csv('submissions/ica.csv', index=False)

0:	learn: 0.6618624	total: 30.9ms	remaining: 15.4s
100:	learn: 0.4496866	total: 2.35s	remaining: 9.29s
200:	learn: 0.4186339	total: 4.91s	remaining: 7.3s
300:	learn: 0.3950734	total: 7.32s	remaining: 4.84s
400:	learn: 0.3757994	total: 9.59s	remaining: 2.37s
499:	learn: 0.3596274	total: 11.7s	remaining: 0us
CatBoost Training Accuracy: 0.8600179496731805


#### On test set: 0.818411  &  0.875235

### 4. SVD

In [77]:
X_train_scaled = pd.read_csv("data/preprocessed_train.csv")
X_test_scaled = pd.read_csv("data/preprocessed_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")


test = pd.read_csv("data/test_for_model_eval.csv")
transaction_ids = test["TransactionID"].astype(int)
test = test.drop(columns=["TransactionID"])


svd = TruncatedSVD(n_components=30, random_state=42)
X_train_scaled = svd.fit_transform(X_train_scaled)
X_test_scaled = svd.transform(X_test_scaled)


test = ica.transform(test)


y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

model_cat = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights_dict,
    random_seed=42,
    verbose=100
)
model_cat.fit(X_train_scaled, y_train_flat)
y_pred_cat = model_cat.predict(X_test_scaled)

print("CatBoost Training Accuracy:", accuracy_score(y_test_flat, y_pred_cat))


y_pred_test = model_cat.predict_proba(test)
fraud_probs = y_pred_test[:, 1]


submission_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'isFraud': fraud_probs
})

submission_df.to_csv('submissions/svd.csv', index=False)

0:	learn: 0.6564137	total: 35.6ms	remaining: 17.8s
100:	learn: 0.4548501	total: 2.06s	remaining: 8.14s
200:	learn: 0.4276080	total: 4.39s	remaining: 6.54s
300:	learn: 0.4069344	total: 6.88s	remaining: 4.54s
400:	learn: 0.3904750	total: 9.32s	remaining: 2.3s
499:	learn: 0.3764382	total: 11.7s	remaining: 0us
CatBoost Training Accuracy: 0.8502387645206082


#### On test set: 0.446137  &  0.452396

### 5. GA

In [78]:
X_train_scaled = pd.read_csv("data/preprocessed_train.csv")
X_test_scaled = pd.read_csv("data/preprocessed_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")


test = pd.read_csv("data/test_for_model_eval.csv")
transaction_ids = test["TransactionID"].astype(int)
test = test.drop(columns=["TransactionID"])

In [79]:
def run_genetic_algorithm(X_data, y_data, population_size=30, n_generations=20, subset_size=30):
    n_features = X_data.shape[1]
    
    # Initialize population - each individual is a sorted list of feature indices
    population = []
    for _ in range(population_size):
        subset = random.sample(range(n_features), subset_size)
        subset.sort()
        population.append(subset)
    
    def fitness(individual):
        X_subset = X_data.iloc[:, individual]
        
        # Manual train/test split instead of cross-validation
        X_train_subset, X_val_subset, y_train_subset, y_val_subset = train_test_split(
            X_subset, y_data, test_size=0.2, random_state=42, stratify=y_data
        )
        
        try:
            model = CatBoostClassifier(
                iterations=100,  # Reduce from 500 to speed up GA
                learning_rate=0.1,
                depth=6,
                class_weights=class_weights_dict,
                random_seed=42,
                verbose=0        # Turn off verbosity completely            
            )
            
            # Use a simple fit instead of cross_val_score
            model.fit(X_train_subset, y_train_subset, 
                     eval_set=(X_val_subset, y_val_subset),
                     early_stopping_rounds=20,
                     verbose=False)
            
            # Get validation accuracy
            accuracy = accuracy_score(y_val_subset, model.predict(X_val_subset))
            print(f"Feature subset evaluated: accuracy = {accuracy:.4f}")
            return accuracy
            
        except Exception as e:
            print(f"Error in fitness evaluation: {e}")
            return 0.0  # Return worst fitness on error
    
    # Creates a child by merging features from both parents and selecting a random subset
    def crossover(p1, p2, subset_size):
        combined = list(set(p1) | set(p2))  # Union of features
        if len(combined) > subset_size:
            child = sorted(random.sample(combined, subset_size))  # Ensure correct size
        else:
            child = sorted(combined)  # Keep all if below subset_size
        return child

    # Mutation replaces a random index in child if random.threshold is met
    def mutation(individual, n_features, subset_size):
        if random.random() < 0.1:  # 10% chance of mutation
            i = random.randrange(subset_size)
            available_features = set(range(n_features)) - set(individual)  # Exclude existing features
            if available_features:  
                new_feature = random.choice(list(available_features))
                individual[i] = new_feature
                individual.sort()
        return individual


    for i in range(n_generations):
        print(f"\nGeneration {i+1}/{n_generations}")
        # Evaluate fitness of population
        print("Evaluating fitness for each individual:")
        scored_population = []
        for idx, ind in enumerate(population):
            fitness_score = fitness(ind)
            scored_population.append((fitness_score, ind))
            print(f"Individual {idx+1}/{len(population)}: Fitness = {fitness_score:.4f}")
        
        scored_population.sort(key=lambda x: x[0], reverse=True)
        print(f"\nBest fitness in generation {i+1}: {scored_population[0][0]:.4f}")
        
        # Selection: truncation selection (pick top half as survivors)
        survivors = scored_population[: population_size // 2]
        
        # Then randomly select two parents (p1 & p2) from survivors for crossover + mutation
        print("Creating new population...")
        new_pop = [s[1] for s in survivors]
        while len(new_pop) < population_size:
            print("Generating new individual...")
            
            p1 = random.choice(survivors)[1]
            p2 = random.choice(survivors)[1]
            child = crossover(p1, p2, subset_size)
            child = mutation(child, n_features, subset_size)
            
            child = list(set(child))  # remove duplicates if any
            while len(child) < subset_size:  # if duplicates reduced size
                child.append(random.randrange(n_features))
            child.sort()
            new_pop.append(child)
            
            print("New individual created! Happy birthday!")
        population = new_pop
        print("Current best:", max([(fitness(ind), ind) for ind in population], key=lambda x: x[0])[1])
    
    best = max([(fitness(ind), ind) for ind in population], key=lambda x: x[0])[1]
    return best

best_features = run_genetic_algorithm(X_train_scaled, y_train)
X_train_ga = X_train_scaled.iloc[:, best_features]
X_test_ga = X_test_scaled.iloc[:, best_features]
model = CatBoostClassifier(
            iterations=500,
            learning_rate=0.1,
            depth=6,
            class_weights=class_weights_dict,
            random_seed=42,
            verbose=100
        )
model.fit(X_train_ga, y_train)
y_pred = model.predict(X_test_ga)
print("Model accuracy with selected features:", accuracy_score(y_test, y_pred))


Generation 1/20
Evaluating fitness for each individual:
Feature subset evaluated: accuracy = 0.8083
Individual 1/30: Fitness = 0.8083
Feature subset evaluated: accuracy = 0.8066
Individual 2/30: Fitness = 0.8066
Feature subset evaluated: accuracy = 0.8113
Individual 3/30: Fitness = 0.8113
Feature subset evaluated: accuracy = 0.8134
Individual 4/30: Fitness = 0.8134
Feature subset evaluated: accuracy = 0.8003
Individual 5/30: Fitness = 0.8003
Feature subset evaluated: accuracy = 0.8111
Individual 6/30: Fitness = 0.8111
Feature subset evaluated: accuracy = 0.7971
Individual 7/30: Fitness = 0.7971
Feature subset evaluated: accuracy = 0.7904
Individual 8/30: Fitness = 0.7904
Feature subset evaluated: accuracy = 0.8292
Individual 9/30: Fitness = 0.8292
Feature subset evaluated: accuracy = 0.7861
Individual 10/30: Fitness = 0.7861
Feature subset evaluated: accuracy = 0.7827
Individual 11/30: Fitness = 0.7827
Feature subset evaluated: accuracy = 0.8191
Individual 12/30: Fitness = 0.8191
Feat

In [81]:
X_test_ga = test.iloc[:, best_features]
y_pred_test = model.predict_proba(X_test_ga)
fraud_probs = y_pred_test[:, 1]


submission_df = pd.DataFrame({
    'TransactionID': transaction_ids,
    'isFraud': fraud_probs
})

submission_df.to_csv('submissions/ga.csv', index=False)