In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import sklearn as skl
import plotly as plt
import csv

# Display/runtime options used by the rest of the notebook.
# NOTE: `plt` is the top-level plotly package here, not matplotlib.pyplot.
np.set_printoptions(linewidth=95)
# Turn off pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment', None)
plt.offline.init_notebook_mode(connected=True)

# Loading and preprocessing the data

First, we load the data, do some preprocessing, keep only the columns we are interested in, and rename them in the process:

In [None]:
data_path = "/kaggle/input/pokemon-database/Pokemon Database.csv"

# Read the raw table and index it by 'Pokemon Id'.
df_raw = pd.read_csv(data_path, encoding='cp1252').set_index('Pokemon Id')

# Rows that reference an 'Original Pokemon ID' take over the 'Legendary Type'
# of the row they reference (looked up by index label).
has_original = df_raw['Original Pokemon ID'].notna()
original_ids = df_raw.loc[has_original, 'Original Pokemon ID']
df_raw.loc[has_original, 'Legendary Type'] = list(df_raw.loc[original_ids, 'Legendary Type'])
df_raw.head()

In [None]:
# List the column labels available in the raw table.
df_raw.columns

In [None]:
# Raw column label -> short name used in the rest of the notebook.
# Only the columns listed here are kept.
column_name_dict = {
    'Pokedex Number': 'nid', 
    'Pokemon Name': 'name', 
    'Alternate Form Name': 'form',
    'Legendary Type': 'legendary', 
    'Pokemon Height': 'height', 
    'Pokemon Weight': 'weight', 
    'Primary Type': 'type_1', 
    'Secondary Type': 'type_2',
    'Health Stat': 'hp', 
    'Attack Stat': 'atk', 
    'Defense Stat': 'def', 
    'Special Attack Stat': 'satk', 
    'Special Defense Stat': 'sdef', 
    'Speed Stat': 'spd', 
    'EV Yield Total': 'ev_total', 
    'Pre-Evolution Pokemon Id': 'prev_id'
}

# Build a new frame (select -> rename -> fill) instead of relabelling a slice
# of df_raw, so the assignments below act on an independent object.
df = (
    df_raw[list(column_name_dict)]
    .rename(columns=column_name_dict)
    .fillna(value={'form': '', 'legendary': ''})
)
# A missing secondary type is filled with the primary type. This is written as
# a plain column assignment: the chained form `df.type_2[mask] = ...` does not
# update `df` when pandas copy-on-write is active.
df['type_2'] = df['type_2'].fillna(df['type_1'])
# Keep only rows without an alternate-form name, then drop the helper column.
df = df[df['form'] == ''].drop(columns=['form'])
# Keep rows with no pre-evolution, or whose pre-evolution is still in the frame.
df = df[df['prev_id'].isin(df.index) | df['prev_id'].isna()]
df.head(10)

For the sake of simplicity I only keep the base forms (i.e. no regional forms, alternate forms, etc.). For what I'm going to do later, I also need to remove the Pokemon that evolve from those removed forms, Sirfetch'd for example. 
Before doing normalization, we calculate the statistics (especially min and max) to determine how to normalize the data:  

In [None]:
# Distinct values of the 'legendary' column, in sorted order.
sorted(set(df['legendary']))

In [None]:
# Column groups and category vocabularies shared by the cells below.
STATS = ['hp', 'atk', 'def', 'satk', 'sdef', 'spd']
SIZES = ['height', 'weight']
TYPES = ['type_1', 'type_2']
# The type vocabulary is taken from the primary-type column only.
TYPE_LIST = sorted(set(df['type_1']))
LEGENDARY_TYPE_LIST = sorted(set(df['legendary']))

# Summary statistics (without the 'count' column) for each column group.
for columns, caption in ((SIZES, 'Stats for heights and weights'),
                         (STATS, 'Stats for base stats')):
    summary = df[columns].describe().T.drop(columns=['count'])
    display(summary.style.set_caption(caption))

Since height and weight have a more log-like distribution, here I introduce functions for normalizing them and for reversing the normalization: 

In [None]:
def log_normalize(arr, mid, r_scale):
    """Return ``log(arr / mid) / r_scale``.

    A value equal to ``mid`` maps to 0; ``r_scale`` divides the natural log
    of the ratio. ``arr`` may be anything numpy can take the log of
    (scalar, array, DataFrame).
    """
    ratio = arr / mid
    return np.log(ratio) / r_scale

def log_rev_normalize(arr, mid, r_scale):
    """Inverse of ``log_normalize``: return ``exp(arr * r_scale) * mid``."""
    log_ratio = arr * r_scale
    return np.exp(log_ratio) * mid

# Summary statistics of each size column after log-normalization, using the
# same (mid, r_scale) pairs as df_to_arrays / arrays_to_df.
for column, mid, r_scale, caption in (('height', 1, 3, 'Stats for heights'),
                                      ('weight', 10, 5, 'Stats for weights')):
    normalized = log_normalize(df[[column]], mid, r_scale)
    display(normalized.describe().T.drop(columns=['count']).style.set_caption(caption))

Then, to make the names easier to deal with, I convert all Pokemon names to lowercase, but I still need to deal with names that contain non-alphabetic characters: 

In [None]:
import re
# Lowercase every name.
df['name'] = df['name'].apply(str.lower)
# A name is "special" when something is left after removing the letters a-z.
name_is_special = [re.sub('[a-z]', '', n.lower()) != '' for n in df['name']]
df['name'][name_is_special]

In [None]:
def _simplify_special_name(n):
    """Rewrite one special name using only letters and underscores where possible.

    Steps, in order: drop '.', replace an accented 'e' with 'e', collapse runs
    of '-', ' ', '2', ':' and "'" into a single '_', and shorten a trailing
    '_(xyz...)' group to '_x' (its first character).
    """
    n = n.replace('.', '')
    # NOTE(review): the names are already lowercased at this point, and
    # 'Ã'.lower() is 'ã', so the original 'Ã©' pattern alone can never match.
    # The lowercased mojibake and the plain 'é' are accepted as well —
    # confirm which spelling the loaded data actually contains.
    for accented_e in ('Ã©', 'ã©', 'é'):
        n = n.replace(accented_e, 'e')
    # Raw strings: same patterns as before, without invalid-escape warnings.
    n = re.sub(r"[-\ 2:']+", '_', n)
    return re.sub(r'_\((.).+\)', r'_\g<1>', n)

# Assign through .loc: the chained form `df.name[mask] = ...` does not update
# `df` when pandas copy-on-write is active.
df.loc[name_is_special, 'name'] = df.loc[name_is_special, 'name'].apply(_simplify_special_name)
df.loc[name_is_special, 'name']

After that, we look at the lengths of the modified names: 

In [None]:
# Names ordered from shortest to longest.
name_lengths = df['name'].str.len()
df['name'].reindex(name_lengths.sort_values(ascending=True).index)

In [None]:
# Summary statistics of the name lengths, without the 'count' entry.
df['name'].str.len().describe().drop(labels=['count'])

# Model 1: Autoencoder on stats, sizes, types, and legendary type

## Data normalization

First, we define functions that convert back and forth between the raw data and the training data for the network: 

In [None]:
def df_to_arrays(df_in): 
    """Convert a Pokemon frame into the five arrays used by the network.

    Parameters
    ----------
    df_in : DataFrame with the STATS, SIZES, TYPES and 'legendary' columns.

    Returns
    -------
    tuple of five arrays, one row per row of ``df_in``:
        stats / 256,
        [log-normalized height, log-normalized weight],
        one-hot of type_1 over TYPE_LIST,
        one-hot of type_2 over TYPE_LIST,
        one-hot of legendary over LEGENDARY_TYPE_LIST.
    """
    stats = df_in[STATS]
    types = df_in[TYPES]
    legendary = df_in['legendary']
    
    height = log_normalize(df_in[['height']], 1,3)
    weight = log_normalize(df_in[['weight']],10,5)
    stats_norm = np.asarray(stats)/256
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    # equal.outer compares every cell with every vocabulary entry, which
    # appends one axis of vocabulary length.
    types_onehot = np.equal.outer(np.asarray(types), TYPE_LIST).astype(float)
    legendary_onehot = np.equal.outer(np.asarray(legendary), LEGENDARY_TYPE_LIST).astype(float)
    
    return (stats_norm, np.concatenate([height, weight], axis=1), 
            types_onehot[:,0,:], types_onehot[:,1,:],
            legendary_onehot)

def arrays_to_df(stats_norm, sizes, 
                 type1_onehot, type2_onehot, legendaty_onehot): 
    """Inverse of ``df_to_arrays``: turn network arrays back into a frame.

    Parameters
    ----------
    stats_norm : array of stats divided by 256, one row per Pokemon.
    sizes : array whose column 0 is the log-normalized height and column 1
        the log-normalized weight.
    type1_onehot, type2_onehot : per-row scores over TYPE_LIST; the argmax
        is taken as the type.
    legendaty_onehot : per-row scores over LEGENDARY_TYPE_LIST; the argmax
        is taken as the legendary type.

    Returns
    -------
    DataFrame with the STATS, TYPES, SIZES and 'legendary' columns and a
    default integer index.
    """
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    stats = np.round(stats_norm*256).astype(int)
    type1 = np.array(TYPE_LIST)[np.argmax(type1_onehot,axis=1)]
    type2 = np.array(TYPE_LIST)[np.argmax(type2_onehot,axis=1)]
    height = log_rev_normalize(sizes[:,0], 1,3)
    weight = log_rev_normalize(sizes[:,1],10,5)
    legendary = np.array(LEGENDARY_TYPE_LIST)[np.argmax(legendaty_onehot,axis=1)]
    
    return pd.concat([pd.DataFrame(stats, columns=STATS),
                      pd.DataFrame(np.stack([type1,type2],axis=1), columns=TYPES),
                      pd.DataFrame(np.stack([height,weight],axis=1), columns=SIZES),
                      pd.DataFrame(legendary, columns=['legendary']),
                     ], 
                     axis=1)

# Round-trip three sample rows through df_to_arrays / arrays_to_df.
sample_names = ['skarmory', 'lugia', 'rayquaza']
df_in = df.loc[df['name'].isin(sample_names)]
display(df_in.style.set_caption('before normalization'))

arrays = df_to_arrays(df_in)
print('after normalization')
display(arrays)
array_shapes = [np.shape(a) for a in arrays]
print('shapes: ', array_shapes)

restored = arrays_to_df(*arrays)
display(restored.style.set_caption('after reverse normalization'))

## Data preparation

In [None]:
# Network arrays for the whole cleaned frame; used as both input and target.
data_all_array = df_to_arrays(df)
display(data_all_array)
print(list(map(np.shape, data_all_array)))

## Model definition

In [None]:
from keras.layers import Input
from keras.layers import Dense, Concatenate
from keras.models import Model

# Width of each input/output group, taken from the feature constants.
stats_num, sizes_num = len(STATS), len(SIZES)
types_num, legendary_num = len(TYPE_LIST), len(LEGENDARY_TYPE_LIST)
# Hidden layer widths (encoder order) and size of the latent vector.
hidden_dim = [64, 32, 16]
encode_dim = 8

# encoder
# One named input per feature group, concatenated into a single vector.
input_stats = Input(shape=(stats_num,), name='input_st')
input_sizes = Input(shape=(sizes_num,), name='input_sz')
input_type1 = Input(shape=(types_num,), name='input_t1')
input_type2 = Input(shape=(types_num,), name='input_t2')
input_legend = Input(shape=(legendary_num,), name='input_lg')
encoder_inputs = (input_stats, input_sizes, input_type1, input_type2, input_legend)

inputs = Concatenate(name='concat_in')(list(encoder_inputs))

# Stack of ELU layers named hidden_1_en, hidden_2_en, ...
enc_hidden = inputs
for i, dim in enumerate(hidden_dim, start=1):
    enc_hidden = Dense(dim, activation='elu', name=f'hidden_{i}_en')(enc_hidden)

enc_latent = Dense(encode_dim, activation='softsign', name='output_en')(enc_hidden)

encoder_model = Model(inputs=encoder_inputs, outputs=enc_latent, name='encoder')
encoder_model.summary()
print()

# decoder
input_latent = Input(shape=(encode_dim,), name='input_lt')

# Mirror of the encoder: the hidden widths are applied in reverse order,
# with layers named hidden_1_de, hidden_2_de, ...
dec_hidden = input_latent
for i, dim in enumerate(hidden_dim[::-1], start=1):
    dec_hidden = Dense(dim, activation='elu', name=f'hidden_{i}_de')(dec_hidden)

# Stats targets are stat/256 (see df_to_arrays), so a sigmoid head fits them.
dec_stats = Dense(stats_num, activation='sigmoid', name='output_st')(dec_hidden)
# Size targets are log(height/1)/3 and log(weight/10)/5, which are NEGATIVE
# for heights below 1 and weights below 10. A sigmoid head only outputs
# values in (0, 1) and can never reconstruct those, so a signed activation is
# used instead.
# NOTE(review): tanh assumes the normalized sizes stay inside (-1, 1) —
# confirm against the describe() tables of the normalized sizes; switch to
# 'linear' if they do not. Weights saved with the old sigmoid head still load
# (same shapes) but should be retrained.
dec_sizes = Dense(sizes_num, activation='tanh', name='output_sz')(dec_hidden)
dec_type1 = Dense(types_num, activation='softmax', name='output_t1')(dec_hidden)
dec_type2 = Dense(types_num, activation='softmax', name='output_t2')(dec_hidden)
dec_legend = Dense(legendary_num, activation='softmax', name='output_lg')(dec_hidden)

decoder_model = Model(inputs=input_latent,
                      outputs=(dec_stats, dec_sizes, 
                               dec_type1, dec_type2, dec_legend), 
                      name='decoder')
decoder_model.summary()
print()

# autoencoder
# Chain the decoder onto the encoder's latent output.
autoencoder_outputs = decoder_model(encoder_model.output)
autoencoder_model = Model(inputs=encoder_model.input,
                          outputs=autoencoder_outputs,
                          name='autonencoder')
autoencoder_model.summary()
print()

# Short names for the five autoencoder outputs, paired positionally with the
# output names Keras assigned to the model.
output_names = ['stats', 'sizes', 'type1', 'type2', 'legendary']
output_names_dict = {short: keras_name
                     for short, keras_name in zip(output_names, autoencoder_model.output_names)}
output_names_rev_dict = {keras_name: short
                         for short, keras_name in zip(output_names, autoencoder_model.output_names)}

def output_name_map(output_names_dict, mapped):
    """Return a copy of ``mapped`` with each key translated via ``output_names_dict``.

    Raises KeyError if a key of ``mapped`` is missing from ``output_names_dict``.
    """
    renamed = {}
    for short_name, value in mapped.items():
        renamed[output_names_dict[short_name]] = value
    return renamed

# Per-output training configuration, written with the short output names and
# translated to the Keras output names before compiling.
losses = output_name_map(output_names_dict, {
    'stats': 'mean_absolute_error',
    'sizes': 'mean_absolute_error',
    'type1': 'categorical_crossentropy',
    'type2': 'categorical_crossentropy',
    'legendary': 'categorical_crossentropy',
})

loss_weights = output_name_map(output_names_dict, {
    'stats': 50.,
    'sizes': 30.,
    'type1': .5,
    'type2': .5,
    'legendary': .2,
})

# Accuracy is tracked for the three categorical outputs only.
metrics = output_name_map(output_names_dict, {
    'type1': 'categorical_accuracy',
    'type2': 'categorical_accuracy',
    'legendary': 'categorical_accuracy',
})

autoencoder_model.compile(loss=losses, loss_weights=loss_weights,
                          metrics=metrics,
                          optimizer='adam')

## Train the model

In [None]:
from tqdm.keras import TqdmCallback

load_if_avalible = True
weight_path = '/kaggle/working/model_1_weight.h5'
# NOTE(review): newer Keras releases may require the weights file name to end
# in '.weights.h5' — confirm against the installed version.

# Reuse saved weights when allowed and present; otherwise train and save.
# NOTE: `train_history` is only assigned in the training branch.
if load_if_avalible and os.path.exists(weight_path):
    autoencoder_model.load_weights(weight_path)
else:
    # The autoencoder is fitted to reproduce its own input.
    train_history = autoencoder_model.fit(data_all_array, data_all_array,
                                          epochs=20_000+1, batch_size=512, shuffle=True, 
                                          verbose=0, callbacks=[TqdmCallback(verbose=0)])
    autoencoder_model.save_weights(weight_path)

## Evaluate the fitting result

In [None]:
import plotly.express as px

# Collect the training curves under the short output names.
history = {}

# `train_history` exists only when the model was trained in this session; when
# the weights were loaded from disk there is nothing to plot. Looking the name
# up through globals() avoids a NameError in that case.
fit_result = globals().get('train_history')
if fit_result is None:
    print('No training history in this session (weights were loaded); skipping the training curves.')
else:
    for key, values in fit_result.history.items():
        if key == 'loss':
            # Total loss keeps its name.
            history[key] = values
            continue
        for suffix in ('_loss', '_categorical_accuracy'):
            if key.endswith(suffix):
                keras_name = key[:-len(suffix)]
                # Fall back to the Keras name if it is not a known output.
                short_name = output_names_rev_dict.get(keras_name, keras_name)
                history[short_name + suffix] = values

    # One line plot per metric kind, keeping every 100th epoch.
    for metric_kind in ('loss', 'accuracy'):
        curves = {name: values[::100]
                  for name, values in history.items() if metric_kind in name}
        display(
            px.line(curves)
              .update_layout(xaxis_title='epoch/100', yaxis_title=metric_kind)
        )

In [None]:
# Encode and decode a handful of rows and show each stage.
sample_names = ['skarmory', 'lugia', 'rayquaza', 'archeops', 'latias', 'latios', 'arceus', 'keldeo']
size_format = {'height': '{:.1f}', 'weight': '{:.1f}'}
df_in = df.loc[df['name'].isin(sample_names)]

input_view = df_in[STATS+TYPES+SIZES+['legendary']]
display(input_view.style.set_caption('input data').format(size_format))

latent_vector = encoder_model.predict(df_to_arrays(df_in))
display(pd.DataFrame(latent_vector).style.set_caption('latent vector'))

decoded = arrays_to_df(*decoder_model.predict(latent_vector))
display(decoded.style.set_caption('output data').format(size_format))

In [None]:
# Reconstruct every row through the autoencoder and label the result with the
# index of the cleaned frame.
recons_arrays = autoencoder_model.predict(data_all_array)
recons_df = arrays_to_df(*recons_arrays).set_axis(df.index, axis=0)
recons_df

## Inspect the latent space results

In [None]:
# Latent vectors for all rows, as a frame with columns lt0..lt{encode_dim-1}
# and the same index as the cleaned frame.
latent_vector_all = encoder_model.predict(data_all_array)
latent_columns = [f'lt{i}' for i in range(encode_dim)]
latent_df = pd.DataFrame(latent_vector_all, index=df.index, columns=latent_columns)

latent_summary = latent_df.describe().T.drop(columns=['count'])
display(latent_summary)

In [None]:
# Plotting frame: descriptive columns next to the latent coordinates.
lt_df_plot = pd.concat([df[['nid', 'name']+STATS+SIZES+TYPES+['ev_total', 'legendary']], 
                        latent_df],
                        axis=1).copy()

# Give the empty legendary type an explicit label. Written as a column
# assignment because the chained form `frame.legendary[mask] = ...` does not
# update the frame when pandas copy-on-write is active.
lt_df_plot['legendary'] = lt_df_plot['legendary'].replace('', 'None')

In [None]:
import ipywidgets as widgets

# Plot colour for each Pokemon type (passed to plotly as color_discrete_map).
TYPE_COLOR_MAP = dict(
    Bug='lightgreen',
    Dark='black',
    Dragon='blue',
    Electric='yellow',
    Fairy='fuchsia',
    Fighting='orange',
    Fire='red',
    Flying='skyblue',
    Ghost='midnightblue',
    Grass='green',
    Ground='brown',
    Ice='aqua',
    Normal='gray',
    Poison='purple',
    Psychic='violet',
    Rock='teal',
    Steel='silver',
    Water='navy',
)

def show_pcs_fig(df):
    """Return a plotting callback bound to ``df``.

    The returned function takes the column names for the x axis, the y axis
    and the colour, and returns a plotly scatter figure of ``df`` with marker
    size taken from 'ev_total' and 'name'/'legendary' as hover data.
    """
    def show_pcs_fig_df(x_axis, y_axis, color):
        scatter_options = dict(
            x=x_axis, y=y_axis,
            color=color, size='ev_total',
            hover_data=['name','legendary'],
            size_max=6,
            color_discrete_map=TYPE_COLOR_MAP,
            # Keep the legend order of both type columns equal to TYPE_LIST.
            category_orders={'type_1': TYPE_LIST,
                             'type_2': TYPE_LIST},
        )
        return px.scatter(df, **scatter_options)
    return show_pcs_fig_df

# Axis choices: the latent dimensions followed by the raw stats and sizes.
latent_str = [f'lt{i}' for i in range(encode_dim)]+STATS+SIZES
# x defaults to the first option, y to the second.
lt_x_dropdown, lt_y_dropdown = (widgets.Dropdown(options=latent_str, value=default)
                                for default in latent_str[:2])
class_dropdown = widgets.Dropdown(options=['type_1', 'type_2', 'legendary'], value='type_1')

_ = widgets.interact(show_pcs_fig(lt_df_plot),
                     x_axis=lt_x_dropdown,
                     y_axis=lt_y_dropdown,
                     color=class_dropdown)

## Apply PCA to the latent space vectors

In [None]:
# Recompute the latent vectors for the PCA section.
# NOTE: this rebinds `latent_df` with a default integer index (no index= here).
latent_vector_all = encoder_model.predict(data_all_array)
latent_columns = [f'lt{i}' for i in range(encode_dim)]
latent_df = pd.DataFrame(latent_vector_all, columns=latent_columns)
latent_df.describe().T.drop(columns=['count'])

In [None]:
from sklearn.decomposition import PCA

def normalize(df, population=None):
    """Standardize ``df`` column-wise: ``(df - mean) / std``.

    The mean and std come from ``population.describe()``; ``population``
    defaults to ``df`` itself.
    """
    reference = df if population is None else population
    summary = reference.describe()
    return (df - summary.loc['mean']) / summary.loc['std']

def rev_normalize(df, population=None):
    """Undo ``normalize``: ``df * std + mean``.

    The mean and std come from ``population.describe()``; ``population``
    defaults to ``df`` itself.
    """
    reference = df if population is None else population
    summary = reference.describe()
    return df * summary.loc['std'] + summary.loc['mean']

# Fit a PCA on the standardized latent vectors.
pca = PCA(random_state=227)
pca.fit(normalize(latent_df))
pcs = pca.components_

pc_index = [f'pc{i}' for i in range(len(pcs))]
lt_columns = [f'lt{i}' for i in range(encode_dim)]

# Explained variance ratio and loadings, one row per principal component.
latent_var_r = pd.DataFrame({'var_r': pca.explained_variance_ratio_}, index=pc_index)
latent_pc = pd.DataFrame(pca.components_, columns=lt_columns, index=pc_index)

pca_table = pd.concat([latent_pc, latent_var_r], axis=1)
display(
    pca_table.style
        .background_gradient(cmap='bwr_r', subset=lt_columns, axis=0)
        .background_gradient(cmap='Blues', subset=['var_r'], axis=0)
        .format('{:.3}')
)

In [None]:
# Plotting frame: descriptive columns next to the PCA coordinates of each row.
pc_labels = [f'pc{i}' for i in range(len(pcs))]
pc_scores = pd.DataFrame(pca.transform(normalize(latent_df)),
                         columns=pc_labels,
                         index=df.index)
pc_df_plot = pd.concat([df[['nid', 'name']+STATS+SIZES+TYPES+['ev_total', 'legendary']], 
                        pc_scores],
                        axis=1).copy()

# Give the empty legendary type an explicit label. Written as a column
# assignment because the chained form `frame.legendary[mask] = ...` does not
# update the frame when pandas copy-on-write is active.
pc_df_plot['legendary'] = pc_df_plot['legendary'].replace('', 'None')

# Axis choices: the principal components followed by the raw stats and sizes.
pcs_str = pc_labels+STATS+SIZES
pc_x_dropdown = widgets.Dropdown(options=pcs_str, value=pcs_str[0])
pc_y_dropdown = widgets.Dropdown(options=pcs_str, value=pcs_str[1])
class_dropdown = widgets.Dropdown(options=['type_1', 'type_2', 'legendary'], value='type_1')

_ = widgets.interact(show_pcs_fig(pc_df_plot), x_axis=pc_x_dropdown, y_axis=pc_y_dropdown, color=class_dropdown)

In [None]:
import random

def pca_components_to_df(pca_latent_in):
    """Decode one point given in PCA coordinates into a one-row frame.

    Parameters
    ----------
    pca_latent_in : 1-D sequence with one value per principal component.

    Returns
    -------
    DataFrame produced by ``arrays_to_df`` from the decoder's prediction.
    """
    # PCA space -> standardized latent space.
    latent_std = pca.inverse_transform(np.asarray(pca_latent_in, dtype=float))
    # Undo the standardization using the statistics of latent_df.
    latent_raw = rev_normalize(latent_std, latent_df)
    # rev_normalize returns a pandas Series for a 1-D input (array * Series).
    # `Series[np.newaxis, :]` is not supported since pandas 2.0, so convert to
    # an ndarray before adding the batch axis.
    latent_in = np.asarray(latent_raw, dtype=float)[np.newaxis, :]
    return arrays_to_df(*decoder_model.predict(latent_in))

# One vertical slider per principal component, all with the same range.
slider_options = dict(value=0, min=-4.0, max=4.0, step=0.01,
                      orientation='vertical', continuous_update=False,
                      readout=True, readout_format='.2f')
pca_latent_sliders = [widgets.FloatSlider(description=f'pc{i}:', **slider_options)
                      for i in range(encode_dim)]
pca_latent_hbox = widgets.HBox(pca_latent_sliders)

# Output area in which the generated row is rendered.
df_out = widgets.Output(layout={'border': '1px solid black', 'height': '80px'})


def on_slider_update(change):
    """Decode the current slider positions and show the result in ``df_out``.

    ``change`` is the widget event payload; it is not used.
    """
    df_out.clear_output()
    slider_values = [s.value for s in pca_latent_sliders]
    df_gen = pca_components_to_df(slider_values)
    # Insert the sum of the stats ('bst') right after the stats columns.
    stat_total = df_gen[STATS].sum(axis=1)
    df_gen.insert(len(STATS), 'bst', stat_total)

    styled = df_gen.style.format('{:.1f}', subset=SIZES)
    with df_out:
        display(styled)

# Re-render whenever any slider value changes.
for slider in pca_latent_sliders:
    slider.observe(on_slider_update, names='value')
    

randomize_button = widgets.Button(description='Randomize')

def on_click_randomize(b):
    """Set every slider to a standard-normal draw, clipped to [-4, 4].

    ``b`` is the clicked button; it is not used.
    """
    for slider in pca_latent_sliders:
        draw = random.gauss(0, 1.0)
        clipped = max(-4.0, min(4.0, draw))
        slider.value = round(clipped, 2)

randomize_button.on_click(on_click_randomize)


set_as_button = widgets.Button(description='Set as: ')
set_as_text = widgets.Text(value='',
                           placeholder='Enter name', disabled=False)

def on_click_set_as(b):
    """Move the sliders to the PCA coordinates of the Pokemon named in the text box.

    ``b`` is the clicked button; it is not used. Does nothing when no row of
    ``pc_df_plot`` has the requested name.
    """
    # Names in the frame were lowercased earlier in the notebook, so the typed
    # name is trimmed and lowercased before the lookup; otherwise an input
    # such as "Pikachu" would silently match nothing.
    wanted = set_as_text.value.strip().lower()
    pc_columns = [f'pc{i}' for i in range(encode_dim)]
    matches = pc_df_plot.loc[pc_df_plot['name'] == wanted, pc_columns]
    if matches.empty:
        return
    # Use the first matching row.
    pcs_values = matches.to_numpy()[0]
    for slider, value in zip(pca_latent_sliders, pcs_values):
        # Clip to the slider range and round to the slider step.
        slider.value = round(min(4.0, max(-4.0, float(value))), 2)
        
set_as_button.on_click(on_click_set_as)

buttons_hbox = widgets.HBox([randomize_button, set_as_button, set_as_text])

# Render once for the initial slider values. The callback ignores its
# argument, so pass None explicitly instead of relying on whatever `_`
# happens to hold from an earlier cell.
on_slider_update(None)
display(pca_latent_hbox, buttons_hbox, df_out)