## Imports

In [None]:
import pandas as pd
import plotly.express as px
import json
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap
from sklearn.neighbors import NearestNeighbors
from concurrent.futures import ThreadPoolExecutor

## Load and Prep Dataset

In [2]:
# Battle log shipped alongside the notebook; read once into `df`.
BATTLES_PATH = 'full_clash_battles_zstd.parquet'
df = pd.read_parquet(BATTLES_PATH)

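Since we're encoding a vector space of decks, let's take just the cards from both the winning and the losing side (dropping the card-level columns) and stack them into one big dataset, with a `won` flag marking which side each deck came from.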

In [3]:
def _side_as_player(battles, side, won):
    """Extract one side of every battle as a generic 'player' frame.

    Keeps the columns whose name contains `side`, renames that substring to
    'player', drops every column whose name contains 'level', and adds a
    constant `won` column.
    """
    side_columns = [name for name in battles.columns if side in name]
    frame = battles[side_columns].copy()
    frame = frame.rename(columns=lambda name: name.replace(side, 'player'))
    kept_columns = [name for name in frame.columns if 'level' not in name]
    frame = frame[kept_columns]
    frame['won'] = won
    return frame


winners = _side_as_player(df, 'winner', 1)
losers = _side_as_player(df, 'loser', 0)

# Winners first, then losers, renumbered with a fresh 0..n-1 index.
decks = pd.concat([winners, losers], ignore_index=True)

## One Hot Encoding Dataset

Now, let's get the actual card names in there

In [4]:
num_games = 100000

# Fixed-seed sample of decks; the 'player' column is dropped and the index
# is renumbered from 0 so positions and labels coincide downstream.
deck_sample = (
    decks
    .sample(num_games, random_state=42)
    .drop('player', axis=1)
    .reset_index(drop=True)
)

with open('card_mappings.json') as mapping_file:
    card_mappings = json.load(mapping_file)

# Every '*_id' column is stringified and looked up in card_mappings;
# values with no entry in the mapping become NaN.
id_columns = [name for name in deck_sample.columns if name.endswith('_id')]
for id_col in id_columns:
    deck_sample[id_col] = deck_sample[id_col].astype(str).map(card_mappings)

deck_sample.head()

Unnamed: 0,player_card_1_id,player_card_2_id,player_card_3_id,player_card_4_id,player_card_5_id,player_card_6_id,player_card_7_id,player_card_8_id,player_tower_card_id,won
0,Royal Giant,Knight,Hog Rider,Fireball,Musketeer,Electro Wizard,Executioner,Bowler,Tower Princess,0
1,Valkyrie,Royal Recruits,Goblin Barrel,Arrows,Mirror,Ice Wizard,Flying Machine,Fireball,Tower Princess,0
2,Valkyrie,Mega Knight,Skeleton Army,The Log,Ice Spirit,Goblin Barrel,Princess,Witch,Tower Princess,1
3,Giant,Musketeer,Mini P.E.K.K.A,Minions,Valkyrie,Arrows,Knight,Fireball,Tower Princess,1
4,Giant Skeleton,Wizard,Goblin Gang,Firecracker,Bats,The Log,Lumberjack,Zap,Tower Princess,1


Now let's create one column for each card type and OHE this thang

In [5]:
# Only the '*_id' columns (eight card slots + tower card) describe the deck.
# Selecting them explicitly, instead of "everything except 'won'", keeps this
# cell re-runnable: later cells add derived columns (avg_elixir, win_condition,
# x, y, shared_cards, ...) to deck_sample, and those must not be one-hot encoded.
card_id_columns = [col for col in deck_sample.columns if col.endswith('_id')]

# Long format: one row per (deck, slot) pair, keyed by the deck's row index.
melted = deck_sample[card_id_columns].reset_index().melt(id_vars='index', value_name='card', var_name='slot')

# One indicator column per card name, then collapse the slot rows back to one
# row per deck (True if the card appears in any slot).
ohe_df = pd.get_dummies(melted.set_index('index')['card'])
ohe_df = ohe_df.groupby(level=0).max()

# Re-attach the outcome label positionally (deck_sample has a 0..n-1 index).
ohe_df['won'] = deck_sample['won'].values
ohe_df.head()

Unnamed: 0_level_0,Archer Queen,Archers,Arrows,Baby Dragon,Balloon,Bandit,Barbarian Barrel,Barbarian Hut,Barbarians,Bats,...,Tower Princess,Valkyrie,Void,Wall Breakers,Witch,Wizard,X-Bow,Zap,Zappies,won
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,0
1,False,False,True,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,False,False,False,1
3,False,False,True,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,1
4,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,False,True,False,True,False,1


## Vectorizing Data

Let's scale the data for easier vectorization

In [6]:
# Feature matrix = card indicator columns only. 'won' is the outcome label that
# the embedding is later coloured by; feeding it in as a feature would leak the
# label into PCA/UMAP and pull winners and losers apart regardless of deck
# composition. Note the feature count is therefore one less than ohe_df's
# column count.
X = ohe_df.drop(columns='won').to_numpy(dtype=float)

# Zero-mean / unit-variance per card column.
scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = scaler.fit_transform(X)

123 dimensions is crazy high, so let's try to break this down into a smaller vector space to condense it to different deck types.

In [7]:
# Number of principal components kept before the 2-D embedding step.
N_PCA_COMPONENTS = 50

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
X_pca = pca.fit_transform(X_scaled)

Let's use UMAP to reduce the PCA components down to two dimensions that we can show on a scatterplot.

In [8]:
# `transform_seed` only seeds later .transform() calls; the fit itself is
# seeded by `random_state`. Without it every run yields a different layout,
# which also changes the neighbour-based columns derived from X_umap below.
# NOTE(review): a fixed random_state is expected to make UMAP run
# single-threaded, so this cell may get slower - confirm the trade-off.
X_umap = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
    transform_seed=42,
).fit_transform(X_pca)

  from .autonotebook import tqdm as notebook_tqdm
2025-05-08 20:23:20.616515: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Creating Extra Variables

Currently, we only have winner labels and tower labels to color the data by. Let's fix that.

In [9]:
#Elixir Cost

with open('card_info.json') as info_file:
    card_info = json.load(info_file)

# Card name -> elixir cost, built from the 'items' list; entries that carry no
# 'elixirCost' key are left out of the lookup.
name_to_elixir = {
    item['name']: item['elixirCost']
    for item in card_info['items']
    if 'elixirCost' in item
}

# Per-slot cost series; a card missing from the lookup counts as 0 elixir.
slot_costs = [
    deck_sample[f'player_card_{slot}_id'].map(lambda card: name_to_elixir.get(card, 0))
    for slot in range(1, 9)
]

# Average over the eight card slots (always divided by 8, even when a card
# contributed 0 because it was not found in the lookup).
deck_cost = sum(slot_costs)/8
deck_sample['avg_elixir'] = deck_cost

In [10]:
#Is Win Condition

# Card names treated as win conditions when labelling decks.
wincons = [
    "Skeleton Barrel",
    "Mortar",
    "Royal Giant",
    "Elixir Golem",
    "Battle Ram",
    "Hog Rider",
    "Giant",
    "Royal Hogs",
    "Three Musketeers",
    "Wall Breakers",
    "Goblin Barrel",
    "Goblin Drill",
    "Balloon",
    "Goblin Giant",
    "X-Bow",
    "Electro Giant",
    "Golem",
    "Miner",
    "Ram Rider",
    "Graveyard",
    "Lava Hound",
]

def get_first_type(row, card_type, sort = max):
    """Pick one card from `row` that belongs to `card_type`.

    Despite the name, this does not return the first match in slot order:
    all values of `row` that are members of `card_type` are collected and
    handed to `sort` (default `max`, i.e. the greatest match by comparison).

    Parameters
    ----------
    row : iterable
        Card values of a single deck.
    card_type : container
        Cards that count as a match (tested with `in`).
    sort : callable
        Reducer called as ``sort(matches, default=None)``; `max` and `min`
        both satisfy this signature.

    Returns
    -------
    The card chosen by `sort`, or None when `row` holds no matching card.
    """
    candidates = [card for card in row if card in card_type]
    return sort(candidates, default=None)

def get_card_type(df, card_type, sort = max):
    """Label every deck with one of its cards that belongs to `card_type`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``player_card_1_id`` ... ``player_card_8_id``.
    card_type : container
        Cards that count as a match.
    sort : callable
        Reducer used to choose among several matches in one deck; forwarded to
        `get_first_type` (previously this argument was accepted but ignored, so
        `max` was always used).

    Returns
    -------
    pandas.Series
        One value per row of `df`: the chosen card, or None when the deck has
        no card from `card_type`.
    """
    card_columns = [f'player_card_{i}_id' for i in range(1, 9)]
    # Row-wise apply over the eight card slots only (tower card excluded).
    return df[card_columns].apply(lambda row: get_first_type(row, card_type, sort), axis=1)

# Label each deck with one of its win-condition cards; with sort=max a deck
# holding several gets the greatest name by string comparison, and a deck
# holding none gets None.
deck_sample['win_condition'] = get_card_type(deck_sample, wincons, sort=max)

In [18]:
def find_shared_cards_multithreaded(df, x, y, n_neighbors=5, common_set_size=3, num_threads=8):
    """For every deck, list the cards most frequent among its nearest neighbours.

    Neighbours are found by Euclidean distance in the 2-D space given by
    `x` and `y`. The neighbour model is queried with the same coordinates it
    was fitted on, so a row's own deck is typically among its neighbours
    (NOTE(review): confirm this is intended).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``player_card_1_id`` ... ``player_card_8_id``.
        Side effect: columns ``x`` and ``y`` are written to `df` in place.
    x, y : array-like
        Coordinates, one value per row of `df`.
    n_neighbors : int
        Number of neighbours looked up per row.
    common_set_size : int
        How many of the most frequent cards to keep per row.
    num_threads : int
        Worker count for the thread pool that does the per-row counting.

    Returns
    -------
    list[list]
        One alphabetically sorted list of cards per row of `df`, in row order.
    """
    # Kept for backward compatibility: callers may rely on these columns.
    df['x'] = x
    df['y'] = y
    coordinates = df[['x', 'y']].values

    nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    nn_model.fit(coordinates)

    # Only the neighbour positions are used, so skip returning distances.
    indices = nn_model.kneighbors(coordinates, return_distance=False)

    card_columns = [f'player_card_{i}_id' for i in range(1, 9)]
    # Hoisted out of the per-row worker: one array built once, instead of
    # slicing the DataFrame (iloc + column selection) for every single row.
    card_matrix = df[card_columns].to_numpy()

    def get_common_cards(idx):
        # `indices` holds row positions, which index card_matrix directly.
        neighbor_cards = card_matrix[indices[idx]].ravel()
        card_counts = pd.Series(neighbor_cards).value_counts()
        return card_counts.head(common_set_size).index.tolist()

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map preserves input order, so results line up with df rows.
        shared_cards_list = list(executor.map(get_common_cards, range(len(df))))

    return [sorted(cards) for cards in shared_cards_list]

def knn_cards(df, set_size, neighbors):
    """Return, per deck, the `set_size` most common neighbour cards as a string.

    Neighbourhoods are taken in the notebook-level `X_umap` embedding (first
    column as x, second as y) using `neighbors` nearest neighbours. Each
    per-deck card list is converted to its string form, so the result is a
    pandas Series of strings with a default 0..n-1 index.
    """
    shared = find_shared_cards_multithreaded(
        df,
        X_umap[:, 0],
        X_umap[:, 1],
        n_neighbors=neighbors,
        common_set_size=set_size,
    )
    return pd.Series(shared).astype(str)

In [19]:
# Target column -> number of top neighbour cards to keep; both labels use a
# neighbourhood of 10 decks.
for label_column, top_cards in (('shared_cards', 1), ('shared_cards_2', 2)):
    deck_sample[label_column] = knn_cards(deck_sample, top_cards, 10)

## Plotting

Let's plot this jawn

In [25]:
# One row per sampled deck: 2-D embedding coordinates plus every label that
# can be used to colour the scatter plot.
df_vis = pd.DataFrame(
    {
        'UMAP 1': X_umap[:, 0],
        'UMAP 2': X_umap[:, 1],
        'Tower': deck_sample['player_tower_card_id'],
        'Winner': deck_sample['won'],
        'Elixir': deck_sample['avg_elixir'],
        'WinCon': deck_sample['win_condition'],
        'Shared1': deck_sample['shared_cards'],
        'Shared2': deck_sample['shared_cards_2'],
    },
    index=deck_sample.index,
)

# Name of the df_vis column that drives point colour (and the hover tooltip).
color_by = "Elixir"

scatter_options = dict(
    x='UMAP 1',
    y='UMAP 2',
    color=color_by,
    # symbol=color_by could be added here to vary marker shape as well
    hover_data=[color_by],
    title='UMAP projection of Clash Royale decks',
    width=800,
    height=800,
)
fig = px.scatter(df_vis, **scatter_options)

# Small, semi-transparent markers so dense regions stay readable.
fig.update_traces(marker=dict(size=4, opacity=0.6))
fig.show()
