In [381]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer

In [512]:
df = pd.read_csv('./hearthstone_collectible_df.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
artist,,,,,
attack,,,,,
cardId,HERO_09,HERO_01,HERO_07,HERO_08,HERO_06
cardSet,Basic,Basic,Basic,Basic,Basic
collectible,True,True,True,True,True
cost,,,,,
dbfId,813,7,893,637,274
durability,,,,,
faction,Neutral,Neutral,Neutral,Neutral,Neutral
flavor,,,,,


In [513]:
df.shape

(2012, 27)

## Cleaning

Things to clean up:
 - the elite column seems to be true if the card is a legendary, empty otherwise. Check this, and if so, it's extraneous information, so we can just drop the elite column.
 - Not every card has a race, so we can change NaNs to 'General', as is documented here:
  https://hearthstone.gamepedia.com/Minion
 - Change NaN in 'text' to a blank (single-space) string (if the card has no text)
 - Faction is either 'Horde', 'Alliance', or 'Neutral' - change all NaNs to 'Neutral'.

In [514]:
df[(df['elite'].isna()) & (df['rarity'] == 'Legendary')]

Unnamed: 0,artist,attack,cardId,cardSet,collectible,cost,dbfId,durability,faction,flavor,...,name,playerClass,race,rarity,text,type,elite,classes,multiClassGroup,armor


In [515]:
# The filter above returned no rows: no Legendary card has a missing 'elite' value,
# so we treat 'elite' as redundant with 'rarity' and drop it.
# NOTE(review): that check only covers one direction (Legendary -> 'elite' is set); it
# does not show that every card with 'elite' set is Legendary -- confirm if that matters.

df.drop(columns = ['elite'], inplace = True)

Now let's change the NaNs to 'General' in the race column:

In [516]:
df['race'].value_counts()

Beast        161
Mech         104
Elemental     77
Demon         47
Dragon        44
Murloc        33
Pirate        27
Totem          6
All            1
Name: race, dtype: int64

In [517]:
df['race'] = df['race'].fillna('General')

In [518]:
df['race'].value_counts().sum()

2012

And now the missing values (NaNs) in text:

In [519]:
df['text'] = df['text'].fillna(' ')

In [520]:
df.text.isnull().sum()

0

Finally the missing values (NaNs) in faction:

In [521]:
df['faction'] = df['faction'].fillna('Neutral')

In [522]:
df.faction.value_counts()

Neutral     1935
Alliance      51
Horde         26
Name: faction, dtype: int64

And we can drop the redundant 'collectible' column:

In [523]:
df.drop(columns = ['collectible'], inplace = True)

In [524]:
df.shape

(2012, 25)

In [525]:
df[df['durability'].notna()].T

Unnamed: 0,29,73,110,113,114,225,239,264,273,317,...,1831,1834,1851,1904,1908,1925,1942,1958,1959,1981
artist,Glenn Rane,Lucas Graciano,Ryan Sook,Stefan Kopinski,Brian Huang,Nate Bowden,Cyril Van Der Haegen,Daren Bader,Efrem Palacios,John Polidora,...,J. Axer,M. Alvares & M. Azevedo,Jason Kang,Vladimir Kafanov,Vlad Botos,Jakub Kasper,Jim Nelson,Jason Kang,Akkapoj T.,L. Lullabi & K. Turovec
attack,1,3,4,5,3,2,3,2,1,2,...,0,3,4,0,2,0,2,4,3,0
cardId,CS2_091,CS2_106,CS2_097,CS2_112,CS2_080,EX1_247,EX1_536,EX1_133,EX1_366,EX1_567,...,TRL_317,TRL_304,TRL_325,DAL_568,DAL_571,DAL_177,DAL_563,DAL_720,DAL_063,DAL_378
cardSet,Basic,Basic,Basic,Basic,Basic,Classic,Classic,Classic,Classic,Classic,...,Rastakhan's Rumble,Rastakhan's Rumble,Rastakhan's Rumble,Rise of Shadows,Rise of Shadows,Rise of Shadows,Rise of Shadows,Rise of Shadows,Rise of Shadows,Rise of Shadows
cost,1,3,4,5,5,2,3,3,3,5,...,5,5,6,2,2,3,4,4,4,6
dbfId,383,401,847,304,421,960,1662,391,643,352,...,50086,50014,50056,52490,52496,51971,52482,52617,51738,52089
durability,4,2,2,2,4,3,2,2,5,8,...,0,3,4,0,2,0,0,2,2,0
faction,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral
flavor,Prince Malchezaar was a collector of rare weap...,"During times of tranquility and harmony, this ...","It Slices, it Dices. You can cut a tin can wit...",No… actually you should fear the Reaper.,Guaranteed to have been owned by a real assass...,"Yo, that's a nice axe.",First Lesson: Put the pointy end in the other ...,Perdition's Blade is Ragnaros's back-up weapon...,I dub you Sir Loin of Beef!,Orgrim Doomhammer gave this legendary weapon t...,...,"If you’re burning and you know it, wave your h...",Only two things in life are certain: death and...,"“Griftah here with de Sul’chop. One chop, you ...",Some might call this a lightforgery.,It knows every secret you have left.,If only you’d let it go to voicemail…,"Eager to please, even if it kills him.",Kobolds informally refer to its effect as a de...,Whoso pulleth out this wrench from the toolbox...,The Kirin Tor have always been lax in enforcin...
health,,,,,,,,,,,...,0,,,0,0,,2,,,


In [526]:
df.columns

Index(['artist', 'attack', 'cardId', 'cardSet', 'cost', 'dbfId', 'durability',
       'faction', 'flavor', 'health', 'howToGet', 'howToGetGold', 'img',
       'imgGold', 'locale', 'mechanics', 'name', 'playerClass', 'race',
       'rarity', 'text', 'type', 'classes', 'multiClassGroup', 'armor'],
      dtype='object')

In [527]:
df['cardSet'].value_counts()

Classic                         237
Basic                           142
Journey to Un'Goro              135
Knights of the Frozen Throne    135
The Boomsday Project            135
Rastakhan's Rumble              135
Kobolds & Catacombs             135
Rise of Shadows                 135
Whispers of the Old Gods        134
The Grand Tournament            132
Mean Streets of Gadgetzan       132
The Witchwood                   129
Goblins vs Gnomes               123
The League of Explorers          45
One Night in Karazhan            45
Blackrock Mountain               31
Naxxramas                        30
Hall of Fame                     22
Name: cardSet, dtype: int64

In [528]:
# Columns whose value distributions we want to eyeball.
value_count_list = [
    'attack', 'cardSet', 'cost', 'durability', 'faction', 'health',
    'playerClass', 'race', 'rarity', 'type', 'classes', 'armor',
]

# For every column: a header line, its value counts, then a blank separator line.
for column in value_count_list:
    counts = df[column].value_counts()
    print(f'The {column} values are: ', counts, '', sep='\n')
    

The attack values are: 
2.0     335
3.0     308
4.0     208
1.0     195
5.0     167
6.0      73
0.0      69
7.0      52
8.0      33
9.0      14
10.0      4
12.0      3
20.0      1
Name: attack, dtype: int64

The cardSet values are: 
Classic                         237
Basic                           142
Journey to Un'Goro              135
Knights of the Frozen Throne    135
The Boomsday Project            135
Rastakhan's Rumble              135
Kobolds & Catacombs             135
Rise of Shadows                 135
Whispers of the Old Gods        134
The Grand Tournament            132
Mean Streets of Gadgetzan       132
The Witchwood                   129
Goblins vs Gnomes               123
The League of Explorers          45
One Night in Karazhan            45
Blackrock Mountain               31
Naxxramas                        30
Hall of Fame                     22
Name: cardSet, dtype: int64

The cost values are: 
3.0     365
2.0     361
4.0     316
5.0     248
1.0     239
6.0     

In [529]:
df.head(9)

Unnamed: 0,artist,attack,cardId,cardSet,cost,dbfId,durability,faction,flavor,health,...,mechanics,name,playerClass,race,rarity,text,type,classes,multiClassGroup,armor
0,,,HERO_09,Basic,,813,,Neutral,,30.0,...,,Anduin Wrynn,Priest,General,Free,,Hero,,,
1,,,HERO_01,Basic,,7,,Neutral,,30.0,...,,Garrosh Hellscream,Warrior,General,Free,,Hero,,,
2,,,HERO_07,Basic,,893,,Neutral,,30.0,...,,Gul'dan,Warlock,General,Free,,Hero,,,
3,,,HERO_08,Basic,,637,,Neutral,,30.0,...,,Jaina Proudmoore,Mage,General,Free,,Hero,,,
4,,,HERO_06,Basic,,274,,Neutral,,30.0,...,,Malfurion Stormrage,Druid,General,Free,,Hero,,,
5,,,HERO_05,Basic,,31,,Neutral,,30.0,...,,Rexxar,Hunter,General,Free,,Hero,,,
6,,,HERO_02,Basic,,1066,,Neutral,,30.0,...,,Thrall,Shaman,General,Free,,Hero,,,
7,,,HERO_04,Basic,,671,,Neutral,,30.0,...,,Uther Lightbringer,Paladin,General,Free,,Hero,,,
8,,,HERO_03,Basic,,930,,Neutral,,30.0,...,,Valeera Sanguinar,Rogue,General,Free,,Hero,,,


Okay - we drop the first nine rows, because these aren't actually playable cards - these are just your heroes.


In [530]:
# Index labels 0-8 are the nine 'Hero' rows displayed above; remove them by label.
df.drop(index=list(range(9)), inplace=True)

In [531]:
df.head()

Unnamed: 0,artist,attack,cardId,cardSet,cost,dbfId,durability,faction,flavor,health,...,mechanics,name,playerClass,race,rarity,text,type,classes,multiClassGroup,armor
9,Dan Scott,,CS2_041,Basic,0.0,149,,Neutral,I personally prefer some non-ancestral right-t...,,...,[{'name': 'Taunt'}],Ancestral Healing,Shaman,General,Free,Restore a minion\nto full Health and\ngive it ...,Spell,,,
10,Michael Sutfin,,CS2_072,Basic,0.0,180,,Neutral,"It's funny how often yelling ""Look over there!...",,...,,Backstab,Rogue,General,Free,Deal $2 damage to an undamaged minion.,Spell,,,
11,Doug Alexander,,EX1_169,Basic,0.0,254,,Neutral,Some druids still have flashbacks from strange...,,...,,Innervate,Druid,General,Free,Gain 1 Mana Crystal this turn only.,Spell,,,
12,Richard Wright,,CS2_008,Basic,0.0,467,,Neutral,"""Cast Moonfire, and never stop."" - How to Be a...",,...,,Moonfire,Druid,General,Free,Deal $1 damage.,Spell,,,
13,Jim Nelson,,NEW1_003,Basic,0.0,163,,Neutral,This is the reason that Demons never really be...,,...,,Sacrificial Pact,Warlock,General,Free,Destroy a Demon. Restore #5 Health to your hero.,Spell,,,


In [532]:
df['locale'].value_counts()

enUS    2003
Name: locale, dtype: int64

In [533]:
df['multiClassGroup'].value_counts()

Kabal          3
Jade Lotus     3
Grimy Goons    3
Name: multiClassGroup, dtype: int64

In [534]:
# Columns we do not need for the analysis; listed once, then dropped together.
columns_to_drop = [
    'artist',
    'flavor',
    'howToGet',
    'howToGetGold',
    'dbfId',
    'img',
    'imgGold',
    'locale',
    'classes',
    'multiClassGroup',
]

df.drop(columns=columns_to_drop, inplace=True)


In [535]:
df.head()

Unnamed: 0,attack,cardId,cardSet,cost,durability,faction,health,mechanics,name,playerClass,race,rarity,text,type,armor
9,,CS2_041,Basic,0.0,,Neutral,,[{'name': 'Taunt'}],Ancestral Healing,Shaman,General,Free,Restore a minion\nto full Health and\ngive it ...,Spell,
10,,CS2_072,Basic,0.0,,Neutral,,,Backstab,Rogue,General,Free,Deal $2 damage to an undamaged minion.,Spell,
11,,EX1_169,Basic,0.0,,Neutral,,,Innervate,Druid,General,Free,Gain 1 Mana Crystal this turn only.,Spell,
12,,CS2_008,Basic,0.0,,Neutral,,,Moonfire,Druid,General,Free,Deal $1 damage.,Spell,
13,,NEW1_003,Basic,0.0,,Neutral,,,Sacrificial Pact,Warlock,General,Free,Destroy a Demon. Restore #5 Health to your hero.,Spell,


## Get dummies for:
  - cardSet
  - faction
  - mechanics (fix this)
  - playerClass
  - race
  - rarity (but we can make this 0-4, since indeed legendary is more rare than common)
  - type
  
Also maybe replace the NaNs with -1? This could solve our NaN problem. 

In [536]:
# Renumber the rows from 0 and discard the old index labels in a single step
# (rather than materialising them as an 'index' column and dropping it again).
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,attack,cardId,cardSet,cost,durability,faction,health,mechanics,name,playerClass,race,rarity,text,type,armor
0,,CS2_041,Basic,0.0,,Neutral,,[{'name': 'Taunt'}],Ancestral Healing,Shaman,General,Free,Restore a minion\nto full Health and\ngive it ...,Spell,
1,,CS2_072,Basic,0.0,,Neutral,,,Backstab,Rogue,General,Free,Deal $2 damage to an undamaged minion.,Spell,
2,,EX1_169,Basic,0.0,,Neutral,,,Innervate,Druid,General,Free,Gain 1 Mana Crystal this turn only.,Spell,
3,,CS2_008,Basic,0.0,,Neutral,,,Moonfire,Druid,General,Free,Deal $1 damage.,Spell,
4,,NEW1_003,Basic,0.0,,Neutral,,,Sacrificial Pact,Warlock,General,Free,Destroy a Demon. Restore #5 Health to your hero.,Spell,


In [537]:
df['mechanics'].value_counts()

[{'name': 'Battlecry'}]                                                                                          392
[{'name': 'Deathrattle'}]                                                                                        158
[{'name': 'Taunt'}]                                                                                               70
[{'name': 'Secret'}]                                                                                              42
[{'name': 'Battlecry'}, {'name': 'Taunt'}]                                                                        29
[{'name': 'Aura'}]                                                                                                27
[{'name': 'Stealth'}]                                                                                             27
[{'name': 'Spell Damage'}]                                                                                        22
[{'name': 'Combo'}]                                             

We have strings of lists of dictionaries, and we need the values.... YIKES!
Thanks to Ritchie, we have this package that should help:

In [538]:
import ast

In [539]:
# testing it out:
ast.literal_eval(df['mechanics'][0])[0]['name']

#YES!

'Taunt'

In [540]:
# help from here, because this WAS THE WORST. 
# https://stackoverflow.com/questions/52232742/how-to-use-ast-literal-eval-in-a-pandas-dataframe-and-handle-exceptions

def literal_return(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` itself.

    Returns the result of ``ast.literal_eval(val)``. If that call raises
    ``ValueError`` or ``SyntaxError``, the input is handed back unchanged.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val
    else:
        return parsed

In [541]:
df['mechanics'] = df['mechanics'].apply(literal_return)

In [542]:
df[df['name']=='Zilliax']

Unnamed: 0,attack,cardId,cardSet,cost,durability,faction,health,mechanics,name,playerClass,race,rarity,text,type,armor
1702,3.0,BOT_548,The Boomsday Project,5.0,,Neutral,2.0,"[{'name': 'Magnetic'}, {'name': 'Divine Shield...",Zilliax,Neutral,Mech,Legendary,"<b>Magnetic</b>\n<b><b>Divine Shield</b>, <b>T...",Minion,


In [543]:
def get_mechanics(row):
    """Pull the mechanic names out of one parsed ``mechanics`` cell.

    Parameters
    ----------
    row : the cell value; expected to be an iterable of dicts such as
        ``[{'name': 'Battlecry'}, {'name': 'Discover'}]``.

    Returns
    -------
    A list holding the first value of each dict, or ``np.nan`` when ``row``
    does not have that shape (for example a missing cell holding NaN).
    """
    try:
        return [list(attr.values())[0] for attr in row]
    except (TypeError, AttributeError, IndexError):
        # TypeError: row is not iterable (NaN cell).
        # AttributeError: an item is not a dict (e.g. an unparsed string).
        # IndexError: an item is an empty dict.
        # Only these are swallowed, so KeyboardInterrupt and genuine bugs
        # propagate instead of being silently turned into NaN.
        return np.nan

df_mechanics_list = df['mechanics'].apply(get_mechanics)

In [544]:
df_mechanics_list = pd.DataFrame(df_mechanics_list)

In [545]:
df_mechanics_list['mechanics'][1984]

['Battlecry', 'Discover']

In [546]:
# https://stackoverflow.com/questions/29034928/pandas-convert-a-column-of-list-to-dummies

# One row per (card, mechanic) pair -> one-hot encode -> collapse back to one row per
# card by summing over the card index. `groupby(level=0).sum()` replaces the older
# `sum(level=0)` spelling, which was deprecated in pandas 1.3 and removed in 2.0.
df_mech_all = (
    pd.get_dummies(df_mechanics_list['mechanics'].apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
    .add_prefix('mech_')
)

In [547]:
df_mech_all.head()

Unnamed: 0,mech_Adapt,mech_AdjacentBuff,mech_AffectedBySpellPower,mech_Aura,mech_Battlecry,mech_Charge,mech_Combo,mech_Deathrattle,mech_Discover,mech_Divine Shield,...,mech_Quest,mech_Recruit,mech_Rush,mech_Secret,mech_Silence,mech_Spell Damage,mech_Stealth,mech_Taunt,mech_Twinspell,mech_Windfury
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [548]:
df = df.join(df_mech_all, how = 'outer')

In [549]:
df.head()

Unnamed: 0,attack,cardId,cardSet,cost,durability,faction,health,mechanics,name,playerClass,...,mech_Quest,mech_Recruit,mech_Rush,mech_Secret,mech_Silence,mech_Spell Damage,mech_Stealth,mech_Taunt,mech_Twinspell,mech_Windfury
0,,CS2_041,Basic,0.0,,Neutral,,[{'name': 'Taunt'}],Ancestral Healing,Shaman,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,,CS2_072,Basic,0.0,,Neutral,,,Backstab,Rogue,...,,,,,,,,,,
2,,EX1_169,Basic,0.0,,Neutral,,,Innervate,Druid,...,,,,,,,,,,
3,,CS2_008,Basic,0.0,,Neutral,,,Moonfire,Druid,...,,,,,,,,,,
4,,NEW1_003,Basic,0.0,,Neutral,,,Sacrificial Pact,Warlock,...,,,,,,,,,,


In [550]:
# Categorical column -> prefix for its one-hot columns. The dummy blocks are appended
# in this order, after which the original categorical columns are removed.
dummy_prefixes = {
    'cardSet': 'cardset',
    'faction': 'faction',
    'playerClass': 'playerclass',
    'race': 'race',
    'type': 'type',
}

dummy_frames = [
    pd.get_dummies(df[source_col], prefix=prefix)
    for source_col, prefix in dummy_prefixes.items()
]

df = pd.concat([df.drop(columns=list(dummy_prefixes))] + dummy_frames, axis=1)
df.head()

Unnamed: 0,attack,cardId,cost,durability,health,mechanics,name,rarity,text,armor,...,race_Elemental,race_General,race_Mech,race_Murloc,race_Pirate,race_Totem,type_Hero,type_Minion,type_Spell,type_Weapon
0,,CS2_041,0.0,,,[{'name': 'Taunt'}],Ancestral Healing,Free,Restore a minion\nto full Health and\ngive it ...,,...,0,1,0,0,0,0,0,0,1,0
1,,CS2_072,0.0,,,,Backstab,Free,Deal $2 damage to an undamaged minion.,,...,0,1,0,0,0,0,0,0,1,0
2,,EX1_169,0.0,,,,Innervate,Free,Gain 1 Mana Crystal this turn only.,,...,0,1,0,0,0,0,0,0,1,0
3,,CS2_008,0.0,,,,Moonfire,Free,Deal $1 damage.,,...,0,1,0,0,0,0,0,0,1,0
4,,NEW1_003,0.0,,,,Sacrificial Pact,Free,Destroy a Demon. Restore #5 Health to your hero.,,...,0,1,0,0,0,0,0,0,1,0


In [551]:
df.head().T

Unnamed: 0,0,1,2,3,4
attack,,,,,
cardId,CS2_041,CS2_072,EX1_169,CS2_008,NEW1_003
cost,0,0,0,0,0
durability,,,,,
health,,,,,
mechanics,[{'name': 'Taunt'}],,,,
name,Ancestral Healing,Backstab,Innervate,Moonfire,Sacrificial Pact
rarity,Free,Free,Free,Free,Free
text,Restore a minion\nto full Health and\ngive it ...,Deal $2 damage to an undamaged minion.,Gain 1 Mana Crystal this turn only.,Deal $1 damage.,Destroy a Demon. Restore #5 Health to your hero.
armor,,,,,


We are finally almost done - now we replace all NaNs in the attack, durability, health, and armor with -1:

In [552]:
# Stats that are missing for a card (e.g. attack on a spell) are NaN; mark each of
# them with -1 in a single multi-column assignment.
stat_columns = ['attack', 'durability', 'health', 'armor']
df[stat_columns] = df[stat_columns].fillna(-1)


Map the rarities to values 0-4:

In [553]:
df['rarity'].value_counts()

Common       713
Rare         519
Epic         339
Legendary    299
Free         133
Name: rarity, dtype: int64

In [554]:
# Ordinal encoding of rarity: position in this list (0 = Free ... 4 = Legendary).
rarity_order = ['Free', 'Common', 'Rare', 'Epic', 'Legendary']
rarity_to_value = {label: rank for rank, label in enumerate(rarity_order)}

df['rarity_value'] = df['rarity'].map(rarity_to_value)

In [555]:
# drop mechanics columns

df.drop(columns=['mechanics'], inplace = True)

Check we've gotten rid of every NaN except the ones in the mech_ columns, then replace all those NaNs with 0.

In [556]:
# Cards with no mechanic got NaN in every mech_ column from the outer join; there NaN
# just means "does not have this mechanic", so 0 is the right fill value.
# A NaN in any other column would be unexpected, so fail loudly rather than silently
# zeroing it with a blanket fillna.
mech_columns = [col for col in df.columns if col.startswith('mech_')]
nan_counts = df.drop(columns=mech_columns).isna().sum()
assert not nan_counts.any(), (
    f'Unexpected NaNs outside the mech_ columns:\n{nan_counts[nan_counts > 0]}'
)
df = df.fillna(dict.fromkeys(mech_columns, 0))

In [557]:
df.isna().sum().sum()

0

FINALLLYYYYY we can save this dataframe to a CSV file!!!

In [558]:
df.head()

Unnamed: 0,attack,cardId,cost,durability,health,name,rarity,text,armor,mech_Adapt,...,race_General,race_Mech,race_Murloc,race_Pirate,race_Totem,type_Hero,type_Minion,type_Spell,type_Weapon,rarity_value
0,-1.0,CS2_041,0.0,-1.0,-1.0,Ancestral Healing,Free,Restore a minion\nto full Health and\ngive it ...,-1.0,0.0,...,1,0,0,0,0,0,0,1,0,0
1,-1.0,CS2_072,0.0,-1.0,-1.0,Backstab,Free,Deal $2 damage to an undamaged minion.,-1.0,0.0,...,1,0,0,0,0,0,0,1,0,0
2,-1.0,EX1_169,0.0,-1.0,-1.0,Innervate,Free,Gain 1 Mana Crystal this turn only.,-1.0,0.0,...,1,0,0,0,0,0,0,1,0,0
3,-1.0,CS2_008,0.0,-1.0,-1.0,Moonfire,Free,Deal $1 damage.,-1.0,0.0,...,1,0,0,0,0,0,0,1,0,0
4,-1.0,NEW1_003,0.0,-1.0,-1.0,Sacrificial Pact,Free,Destroy a Demon. Restore #5 Health to your hero.,-1.0,0.0,...,1,0,0,0,0,0,0,1,0,0


In [559]:
df.to_csv('dummies_df_card_list.csv', index = False)