In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import re
from sklearn.metrics import f1_score, confusion_matrix
import xgboost as xgb


In [None]:
# Read the Pokemon table from the input directory, then preview its first rows.
pokemon = pd.read_csv(filepath_or_buffer='../input/pokemon/Pokemon.csv')
pokemon.head(n=5)

Let's practice training models!

First up, let's try to predict which Pokemon should be legendary. Can we get all the current ones? Which Pokemon are mistaken as legendaries (type I errors)? Which legendaries seem a little weak (type II errors)?

In [None]:
# Class balance of the target column: how many rows fall under each Legendary value.
pokemon['Legendary'].value_counts()

## Feature Engineering
First we should add some features. Specifically:
* Types
* Number of Types
* Mega (this is important as it can otherwise lift stats)

In [None]:
# NOTE: this cell rebinds `pokemon` and removes the 'Type 1'/'Type 2' columns
# it reads, so re-running it requires reloading the CSV first.

# Create one boolean indicator column per type.
# The type list is taken from BOTH type columns (missing secondary types are
# dropped) so that a type which only ever occurs as a secondary type still
# gets a column. Sorting keeps the column order identical from run to run;
# iterating a bare set does not guarantee that.
type_names = sorted(
    pd.concat([pokemon['Type 1'], pokemon['Type 2']]).dropna().unique()
)
for type_name in type_names:
    # True when the Pokemon has this type in either slot. Comparing a missing
    # 'Type 2' with a string yields False, so no NaN handling is needed.
    pokemon[type_name] = (pokemon[['Type 1', 'Type 2']] == type_name).any(axis=1)

# Get total number of types (count of non-missing type slots per row).
pokemon['Number of Types'] = pokemon[['Type 1', 'Type 2']].notnull().sum(axis=1)

# Drop the raw type columns now that they are encoded.
pokemon = pokemon.drop(columns=['Type 1', 'Type 2'])

# Flag Mega forms: names in which "Mega " directly follows a leading run of
# letters and is itself followed by letters. The character class is
# [a-zA-Z]; the previous [a-zA-z] also matched the punctuation characters
# that sit between 'Z' and 'a' in ASCII.
mega_pattern = re.compile(r'^[a-zA-Z]+Mega [a-zA-Z]+')
pokemon['Mega'] = pokemon['Name'].apply(
    lambda name: mega_pattern.search(name) is not None
)

# Set index so that the identifier columns are not treated as features.
pokemon = pokemon.set_index(['#', 'Name'])

In [None]:
# Preview the first rows of the frame after the feature-engineering step.
pokemon.head()

## Preprocessing
We should split our data out.

We're going to use decision trees which are not sensitive to scaling, so we won't perform any scaling on the input features. https://stats.stackexchange.com/questions/353462/what-are-the-implications-of-scaling-the-features-to-xgboost

In [None]:
# Columns that must never be used as model inputs: the target itself and the
# model's own prediction column, which a later cell writes back into
# `pokemon`. Without this, re-running this cell after predictions exist would
# leak the prediction in as a feature. errors='ignore' lets the drop succeed
# on a fresh run, when the prediction column does not exist yet.
non_feature_cols = ['Legendary', 'Legendary Predicted XGB']

# Full target / feature matrix (used later to score every Pokemon).
y = pokemon['Legendary']
X = pokemon.drop(columns=non_feature_cols, errors='ignore')
X_cols = X.columns

# Split by generation: generations below 6 are used for training and
# generation 6 is held out for testing.
is_train = pokemon['Generation'] < 6
is_test = pokemon['Generation'] == 6

y_train = y[is_train]
X_train = X[is_train]

y_test = y[is_test]
X_test = X[is_test]

## Training
Now we can train a classifier.

In [None]:
# Fit gradient-boosted trees with a logistic objective on the training
# generations. The estimator is a regressor, so `predict` returns a continuous
# score per row that is thresholded in later cells.
xg_reg = xgb.XGBRegressor(objective='binary:logistic').fit(X_train, y_train)

# Score the held-out generation.
preds = xg_reg.predict(X_test)

In [None]:
# What's the best cutoff?
# Scan eleven evenly spaced cutoffs from 0.0 to 1.0 and report the F1 score of
# the held-out predictions at each one. Uses the notebook's existing `np`
# import rather than a second, mid-notebook numpy import.
for cutoff in np.linspace(0, 1, 11):
    cutoff_f1 = f1_score(y_test, preds > cutoff)
    print('Cutoff: {0:.1f}, F1 Score: {1:.3f}'.format(cutoff, cutoff_f1))

Model trained! 

Before looking at feature importance, let's apply a cutoff of 0.2 to the model's scores for the full dataset and count how many Pokemon it predicts to be legendary.

In [None]:
# Score every Pokemon in the dataset and flag those whose score reaches 0.2.
full_scores = xg_reg.predict(X)
pokemon['Legendary Predicted XGB'] = full_scores >= 0.2

# Tally how many rows were flagged versus not flagged.
pokemon['Legendary Predicted XGB'].value_counts()


What are the most important features in predicting a legendary type?

In [None]:
# Plot the fitted model's per-feature importance scores.
xgb.plot_importance(xg_reg)

As you may have expected, the most important feature is the overall stat performance. It may be a little more surprising to find that HP is the most important individual stat, suggesting that legendary Pokemon have, on average, more hit points than regular Pokemon. If we look at types, Dragon is the most legendary type, which reflects the game's choice to create dragons as the most powerful beasts.

Now, what Pokemon did our model mistake as legendaries? And what legendaries did it miss?

It's surprising that the model did not more heavily weight the Mega feature given its ability to predict non-Legendary status (only 6 legendaries out of a total of 48 Mega types).

In [None]:
# Among rows flagged as Mega, count how many fall under each Legendary value.
pokemon.loc[pokemon['Mega'] == True, 'Legendary'].value_counts()

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual class and columns the predicted class. With labels ordered
# (False, True), entry [0, 1] counts regular Pokemon flagged as legendary
# (false positives / Type I) and entry [1, 0] counts legendaries the model
# missed (false negatives / Type II).
confusion_matrix(pokemon['Legendary'], pokemon['Legendary Predicted XGB'])

The model did a good job at predicting the legendary Pokemon correctly, with no Type II errors (no legendary was missed). It did, however, mistakenly flag 14 regular Pokemon as legendary (Type I errors); let's check these out.

In [None]:
# Rows the dataset marks as non-legendary whose prediction disagrees with
# that label, i.e. regular Pokemon the model flagged as legendary.
is_regular = pokemon['Legendary'].eq(False)
is_mismatch = pokemon['Legendary'].ne(pokemon['Legendary Predicted XGB'])
pokemon[is_mismatch & is_regular]

It seems there is still one Mega Pokemon classified as legendary despite our feature engineering, but this is likely because the base form is also incorrectly classified.

We are picking up all the pseudo-legendary Pokemon as specified here: https://bulbapedia.bulbagarden.net/wiki/Pseudo-legendary_Pok%C3%A9mon. We also see many of the mythical Pokemon such as Celebi, Mew and Manaphy. https://bulbapedia.bulbagarden.net/wiki/Mythical_Pok%C3%A9mon

Cresselia is actually a legendary type, so it seems the dataset is incorrect here. https://bulbapedia.bulbagarden.net/wiki/Cresselia_(Pok%C3%A9mon)

In [None]:
import matplotlib.pyplot as plt

# Draw a tree from the fitted model on a single high-resolution axes.
fig, ax = plt.subplots(nrows=1, ncols=1, dpi=180)
xgb.plot_tree(xg_reg, ax=ax)

# Tighten the layout of the figure, then render it.
fig.tight_layout()
plt.show()
