## Imports

In [19]:
import pandas as pd
import json

## Load and Prep Dataset

In [3]:
# Read the battles parquet file into `df`; the next cells select its
# 'winner' / 'loser' columns from this frame.
df = pd.read_parquet(path='full_clash_battles_zstd.parquet')

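Since we're encoding a vector space of decks, let's take just the card columns (dropping the level columns) for both winners and losers and stack them into one big dataset, with a `won` flag marking which side each deck was on.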

In [12]:
def _side_decks(battles: pd.DataFrame, side: str, won: int) -> pd.DataFrame:
    """Extract one side's columns from `battles`, relabelled as 'player'.

    Parameters
    ----------
    battles : frame whose column names contain `side` for that side's data.
    side : substring identifying the side's columns ('winner' or 'loser').
    won : constant stored in the added 'won' column (1 or 0).

    Returns
    -------
    A new frame holding every column whose name contains `side`, with `side`
    replaced by 'player' in the name, minus any column whose (renamed) name
    contains 'level', plus a trailing 'won' column.
    """
    side_cols = [col for col in battles.columns if side in col]
    renamed = battles[side_cols].rename(columns=lambda name: name.replace(side, 'player'))
    non_level_cols = [col for col in renamed.columns if 'level' not in col]
    # .assign returns a fresh frame, so the new column is never written into a
    # list-indexed subset (which can raise SettingWithCopyWarning).
    return renamed[non_level_cols].assign(won=won)


winners = _side_decks(df, 'winner', won=1)
losers = _side_decks(df, 'loser', won=0)

# Stack both sides into one frame with a fresh 0..n-1 index.
decks = pd.concat([winners, losers], ignore_index=True)

Now, let's take a random sample of decks and replace the card IDs with the actual card names

In [37]:
num_games = 100_000

# Fixed seed so the same rows are drawn on every run. The 'player' column is
# dropped and the index reset so the sampled rows are numbered from 0.
deck_sample = (
    decks.sample(num_games, random_state=42)
    .drop(columns='player')
    .reset_index(drop=True)
)

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open('card_mappings.json', encoding='utf-8') as f:
    card_mappings = json.load(f)

# Replace the values in every '*_id' column with their mapped entry.
# NOTE(review): values are stringified before the lookup, so this presumably
# expects card_mappings to be keyed by id strings; any id absent from the
# mapping silently becomes NaN — confirm coverage.
for col in deck_sample.columns:
    if col.endswith('_id'):
        deck_sample[col] = deck_sample[col].astype(str).map(card_mappings)

deck_sample.head()

Unnamed: 0,player_card_1_id,player_card_2_id,player_card_3_id,player_card_4_id,player_card_5_id,player_card_6_id,player_card_7_id,player_card_8_id,player_tower_card_id,won
0,Royal Giant,Knight,Hog Rider,Fireball,Musketeer,Electro Wizard,Executioner,Bowler,Tower Princess,0
1,Valkyrie,Royal Recruits,Goblin Barrel,Arrows,Mirror,Ice Wizard,Flying Machine,Fireball,Tower Princess,0
2,Valkyrie,Mega Knight,Skeleton Army,The Log,Ice Spirit,Goblin Barrel,Princess,Witch,Tower Princess,1
3,Giant,Musketeer,Mini P.E.K.K.A,Minions,Valkyrie,Arrows,Knight,Fireball,Tower Princess,1
4,Giant Skeleton,Wizard,Goblin Gang,Firecracker,Bats,The Log,Lumberjack,Zap,Tower Princess,1


Now let's create one column for each card and one-hot encode (OHE) the decks, so that each row flags which cards that deck contains

In [64]:
# Long format: one row per (deck, slot) pair, with the deck's row number kept
# in the 'index' column.
card_slots = deck_sample.drop(columns='won')
melted = card_slots.reset_index().melt(id_vars='index', var_name='slot', value_name='card')

# One indicator column per distinct card value, one row per (deck, slot).
card_flags = pd.get_dummies(melted.set_index('index')['card'])

# Collapse each deck's slot rows back into a single row: a card is flagged if
# any of the deck's slots held it.
ohe_df = card_flags.groupby(level=0).max()

# Positional assignment: relies on deck_sample having a 0..n-1 index so the
# grouped (sorted) rows line up with the original row order.
ohe_df['won'] = deck_sample['won'].values
ohe_df.head()

Unnamed: 0_level_0,Archer Queen,Archers,Arrows,Baby Dragon,Balloon,Bandit,Barbarian Barrel,Barbarian Hut,Barbarians,Bats,...,Tower Princess,Valkyrie,Void,Wall Breakers,Witch,Wizard,X-Bow,Zap,Zappies,won
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,0
1,False,False,True,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,False,False,False,1
3,False,False,True,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,1
4,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,False,True,False,True,False,1
