In [47]:
import pandas as pd
import numpy as np
import json

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_parquet('full_clash_battles_zstd.parquet').reset_index(drop=True)

In [31]:
keep, swap = train_test_split(df, test_size=0.5, random_state=42)
keep['Win'] = 1
swap['Win'] = 0
keep = keep[[col for col in keep.columns if '_id' in col or col == 'Win']]
swap = swap[[col for col in swap.columns if '_id' in col or col == 'Win']]
swap = swap[[f'loser_card_{i}_id' for i in range(1, 9)] + ['loser_tower_card_id'] + [f'winner_card_{i}_id' for i in range(1, 9)] + ['winner_tower_card_id'] + ['Win']]
stacked_data = np.vstack([keep.values, swap.values])
stacked = pd.DataFrame(stacked_data, columns = keep.columns).rename(columns=lambda x: x.replace('winner', 'player1')).rename(columns=lambda x: x.replace('loser', 'player2'))

In [34]:
with open('./dicts/card_mappings.json') as f:
    card_mappings = json.load(f)

for col in stacked.columns:
    if col.endswith('_id'):
        stacked[col] = stacked[col].astype(str).map(card_mappings)
stacked.head()

Unnamed: 0,player1_card_1_id,player1_card_2_id,player1_card_3_id,player1_card_4_id,player1_card_5_id,player1_card_6_id,player1_card_7_id,player1_card_8_id,player1_tower_card_id,player2_card_1_id,player2_card_2_id,player2_card_3_id,player2_card_4_id,player2_card_5_id,player2_card_6_id,player2_card_7_id,player2_card_8_id,player2_tower_card_id,Win
0,Mega Knight,Golem,Elite Barbarians,Goblin Barrel,Inferno Tower,Rocket,Poison,Balloon,Tower Princess,Skeleton Army,Musketeer,Baby Dragon,Valkyrie,Arrows,Witch,Mini P.E.K.K.A,Goblin Barrel,Tower Princess,1
1,Mega Knight,Wizard,Furnace,Boss Bandit,Goblin Barrel,Balloon,Rage,Skeleton Army,Royal Chef,Valkyrie,Royal Recruits,Bandit,The Log,Mega Knight,Musketeer,Arrows,Witch,Dagger Duchess,1
2,Skeletons,Bats,Little Prince,Mother Witch,P.E.K.K.A,Ice Wizard,Goblin Demolisher,Zap,Tower Princess,Valkyrie,Elite Barbarians,Prince,Hog Rider,Goblin Barrel,Skeleton Army,The Log,Musketeer,Tower Princess,1
3,Knight,Goblin Barrel,Ice Spirit,Goblin Gang,Inferno Tower,The Log,Princess,Rocket,Tower Princess,Mega Knight,Firecracker,Valkyrie,Goblin Gang,Magic Archer,The Log,Rocket,Witch,Dagger Duchess,1
4,The Log,Valkyrie,Tesla,Goblin Gang,Princess,Goblin Barrel,Ice Spirit,Rocket,Tower Princess,Tesla,Knight,X-Bow,Rocket,Ice Wizard,Skeletons,The Log,Tornado,Tower Princess,1


In [36]:
p1 = stacked[[col for col in stacked.columns if 'player1' in col]]
p2 = stacked[[col for col in stacked.columns if 'player2' in col]]

In [37]:
def ohe(i):
    melted = i.reset_index().melt(id_vars='index', value_name='card', var_name='slot')
    i = pd.get_dummies(melted.set_index('index')['card'], dtype = int)
    i = i.groupby(level=0).max()
    return i

p1 = ohe(p1)
p2 = ohe(p2)

In [41]:
combo = p1 - p2
combo.head()

Unnamed: 0_level_0,Archer Queen,Archers,Arrows,Baby Dragon,Balloon,Bandit,Barbarian Barrel,Barbarian Hut,Barbarians,Bats,...,Tornado,Tower Princess,Valkyrie,Void,Wall Breakers,Witch,Wizard,X-Bow,Zap,Zappies
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,-1,-1,1,0,0,0,0,0,...,0,0,-1,0,0,-1,0,0,0,0
1,0,0,-1,0,1,-1,0,0,0,0,...,0,0,-1,0,0,-1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,-1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,-1,0,0,-1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,-1,0,1,0,0,0,0,-1,0,0


In [43]:
X = combo
y = stacked['Win']

X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [44]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))

Logistic Regression Accuracy: 0.5497415213007105


In [46]:
forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    max_features="sqrt",
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
    max_samples=0.8,
    verbose=1
)

forest.fit(X_train, y_train)
y_pred = forest.predict(X_val)
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s


Random Forest Accuracy: 0.5617957066987025


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.3s finished


In [51]:
# Scale the data for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_test)

# Define the neural network
mlp = MLPClassifier(
    hidden_layer_sizes=(123, 30, 10), # 3 layers with decreasing size
    activation='relu',
    solver='adam',
    alpha=0.0001,         # L2 regularization
    batch_size=1024,      # Large batches for faster training
    learning_rate='adaptive',
    max_iter=100,         # Number of epochs
    verbose=True,
    n_iter_no_change=10,  # Early stopping
)

# Train the model
mlp.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = mlp.predict(X_val_scaled)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Iteration 1, loss = 0.68539691
Iteration 2, loss = 0.68101410
Iteration 3, loss = 0.67863086
Iteration 4, loss = 0.67671733
Iteration 5, loss = 0.67502430
Iteration 6, loss = 0.67368704
Iteration 7, loss = 0.67253182
Iteration 8, loss = 0.67154569
Iteration 9, loss = 0.67063078
Iteration 10, loss = 0.66989412
Iteration 11, loss = 0.66917461
Iteration 12, loss = 0.66856333
Iteration 13, loss = 0.66797920
Iteration 14, loss = 0.66755938
Iteration 15, loss = 0.66711245
Iteration 16, loss = 0.66670937
Iteration 17, loss = 0.66631394
Iteration 18, loss = 0.66597184
Iteration 19, loss = 0.66565786
Iteration 20, loss = 0.66535892
Iteration 21, loss = 0.66512826
Iteration 22, loss = 0.66481871
Iteration 23, loss = 0.66453119
Iteration 24, loss = 0.66439859
Iteration 25, loss = 0.66413704
Iteration 26, loss = 0.66393133
Iteration 27, loss = 0.66369184
Iteration 28, loss = 0.66357702
Iteration 29, loss = 0.66336816





Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.56      0.56    229998
           1       0.56      0.56      0.56    229904

    accuracy                           0.56    459902
   macro avg       0.56      0.56      0.56    459902
weighted avg       0.56      0.56      0.56    459902

Accuracy: 0.5605976925518915


In [58]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# 1) Sample a subset so tuning is fast
idx = X_train.sample(100000, random_state=42).index.to_list()
X_sub, y_sub = X_train.loc[idx], y_train.loc[idx]

# 2) Build a pipeline
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA()), 
    ('mlp', MLPClassifier(
        activation='relu',
        solver='adam',
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        max_iter=200,
        verbose=False,
        random_state=42
    ))
])

# 3) Define parameter distributions
param_dist = {
    'pca__n_components': [30, 50, 75, 100],
    'mlp__hidden_layer_sizes': [(100,), (100,50), (200,100,50), (123,30,10)],
    'mlp__alpha': uniform(1e-6, 1e-2),
    'mlp__learning_rate_init': uniform(1e-4, 1e-2),
    'mlp__batch_size': [512, 1024, 2048]
}

# 4) Run a random search
search = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)
search.fit(X_sub, y_sub)
print("Best parameters:", search.best_params_)
print("Best CV accuracy:", search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
