In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
from pytrends.request import TrendReq
import time
import math
pytrend = TrendReq(hl='en-US', tz=360)

In [2]:
#import PokemonData using pandas
#including popularity of each pokemon found in a reddit thread
pokemonData=pd.read_csv('SourceData/PokemonDataV2 - AllGenerations.csv', header = 0)

pokemonDataUnmodified = []

with open("SourceData/PokemonDataV2 - AllGenerations.csv", "r") as data_file:
    for row in data_file:
        tempRow = row.strip()
        tempRow = tempRow.split(",")
        pokemonDataUnmodified.append(list(tempRow))

In [3]:
pokemonDataUnmodified[0:9]

[['Ndex',
  'PokÃ©mon',
  'Type1',
  'Type2',
  'Variance',
  'Generation',
  'Evolution',
  'Rarity',
  'Popularity',
  'NewSnap'],
 ['1', 'Bulbasaur', 'Grass', 'Poison', '', '1', '1', 'Starter', '710', '1'],
 ['2', 'Ivysaur', 'Grass', 'Poison', '', '1', '2', 'Starter', '83', '0'],
 ['3', 'Venusaur', 'Grass', 'Poison', '', '1', '3', 'Starter', '127', '1'],
 ['4', 'Charmander', 'Fire', 'Fire', '', '1', '1', 'Starter', '374', '1'],
 ['5', 'Charmeleon', 'Fire', 'Fire', '', '1', '2', 'Starter', '70', '0'],
 ['6', 'Charizard', 'Fire', 'Flying', '', '1', '3', 'Starter', '1107', '1'],
 ['7', 'Squirtle', 'Water', 'Water', '', '1', '1', 'Starter', '523', '1'],
 ['8', 'Wartortle', 'Water', 'Water', '', '1', '2', 'Starter', '133', '0']]

In [4]:
#delete row with headers

del pokemonDataUnmodified[0]
pokemonDataUnmodified[0:9]

[['1', 'Bulbasaur', 'Grass', 'Poison', '', '1', '1', 'Starter', '710', '1'],
 ['2', 'Ivysaur', 'Grass', 'Poison', '', '1', '2', 'Starter', '83', '0'],
 ['3', 'Venusaur', 'Grass', 'Poison', '', '1', '3', 'Starter', '127', '1'],
 ['4', 'Charmander', 'Fire', 'Fire', '', '1', '1', 'Starter', '374', '1'],
 ['5', 'Charmeleon', 'Fire', 'Fire', '', '1', '2', 'Starter', '70', '0'],
 ['6', 'Charizard', 'Fire', 'Flying', '', '1', '3', 'Starter', '1107', '1'],
 ['7', 'Squirtle', 'Water', 'Water', '', '1', '1', 'Starter', '523', '1'],
 ['8', 'Wartortle', 'Water', 'Water', '', '1', '2', 'Starter', '133', '0'],
 ['9', 'Blastoise', 'Water', 'Water', '', '1', '3', 'Starter', '410', '1']]

In [5]:
#remove columns that will not be used. name isn't needed and Ndex is a primary key (more or less)
pokemonData = pokemonData.drop(['Pokémon'], axis=1) 
pokemonData = pokemonData.drop(['Ndex'], axis=1) 

#change data in each column to include what column they are in
#this will be VERY useful later on
pokemonData['Type1'] = 'Type1_' + pokemonData['Type1']
pokemonData['Type2'] = 'Type2_' + pokemonData['Type2']
pokemonData['Variance'] = 'Variance_' + pokemonData['Variance']
pokemonData['Generation'] = 'Generation_' + pokemonData['Generation'].astype(str)
pokemonData['Evolution'] = 'Evolution_' + pokemonData['Evolution'].astype(str)
pokemonData['Rarity'] = 'Rarity_' + pokemonData['Rarity']

#change the NewSnap column to 0/1, only include initially included in game (no DLC)
def newSnapCheck(data):
  if data['NewSnap'] == 1:
    return 1
  else:
    return 0

#fill in nan values in popularity because decision tree needs values on all spots. going with zero. 
def popularityCheck(data):
  if math.isnan(data['Popularity']):
    return 0
  else:
    return data['Popularity']

pokemonData['NewSnap'] = pokemonData.apply(newSnapCheck, axis=1)
pokemonData['Popularity'] = pokemonData.apply(popularityCheck, axis=1)

In [6]:
#do one hot encoding (aka dummy variables in pandas) on categorical columns

type1_occurance = pokemonData['Type1']
type1_occurance = pd.Series(type1_occurance)
type1_dummy = pd.get_dummies(type1_occurance)

type2_occurance = pokemonData['Type2']
type2_occurance = pd.Series(type2_occurance)
type2_dummy = pd.get_dummies(type2_occurance)

variance_occurance = pokemonData['Variance']
variance_occurance = pd.Series(variance_occurance)
variance_dummy = pd.get_dummies(variance_occurance)

generation_occurance = pokemonData['Generation']
generation_occurance = pd.Series(generation_occurance)
generation_dummy = pd.get_dummies(generation_occurance)

evolution_occurance = pokemonData['Evolution']
evolution_occurance = pd.Series(evolution_occurance)
evolution_dummy = pd.get_dummies(evolution_occurance)

rarity_occurance = pokemonData['Rarity']
rarity_occurance = pd.Series(rarity_occurance)
rarity_dummy = pd.get_dummies(rarity_occurance)

In [7]:
#attach the dummy variables
pokemonData = pd.concat([pokemonData, type1_dummy], axis=1)
pokemonData = pd.concat([pokemonData, type2_dummy], axis=1)
pokemonData = pd.concat([pokemonData, variance_dummy], axis=1)
pokemonData = pd.concat([pokemonData, generation_dummy], axis=1)
pokemonData = pd.concat([pokemonData, evolution_dummy], axis=1)
pokemonData = pd.concat([pokemonData, rarity_dummy], axis=1)

#remove the categorical columns
pokemonData = pokemonData.drop(['Type1'], axis=1) 
pokemonData = pokemonData.drop(['Type2'], axis=1) 
pokemonData = pokemonData.drop(['Variance'], axis=1) 
pokemonData = pokemonData.drop(['Generation'], axis=1) 
pokemonData = pokemonData.drop(['Evolution'], axis=1) 
pokemonData = pokemonData.drop(['Rarity'], axis=1) 

In [8]:
pokemonData.head(9)

Unnamed: 0,Popularity,NewSnap,Type1_Bug,Type1_Dark,Type1_Dragon,Type1_Electric,Type1_Fairy,Type1_Fighting,Type1_Fire,Type1_Flying,...,Evolution_0,Evolution_1,Evolution_2,Evolution_3,Rarity_Ancient,Rarity_Legendary,Rarity_Mythical,Rarity_Standard,Rarity_Starter,Rarity_Sub-Legendary
0,710.0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,83.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,127.0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,374.0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,70.0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
5,1107.0,1,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
6,523.0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
7,133.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
8,410.0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [9]:
#separate the NewSnap column to test for it
y = pokemonData['NewSnap']
X = pokemonData.drop(['NewSnap'], axis=1)

In [10]:
#split the data for training/testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [11]:
#check to make sure the columns look clean in training
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 670 entries, 887 to 132
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Popularity            670 non-null    float64
 1   Type1_Bug             670 non-null    uint8  
 2   Type1_Dark            670 non-null    uint8  
 3   Type1_Dragon          670 non-null    uint8  
 4   Type1_Electric        670 non-null    uint8  
 5   Type1_Fairy           670 non-null    uint8  
 6   Type1_Fighting        670 non-null    uint8  
 7   Type1_Fire            670 non-null    uint8  
 8   Type1_Flying          670 non-null    uint8  
 9   Type1_Ghost           670 non-null    uint8  
 10  Type1_Grass           670 non-null    uint8  
 11  Type1_Ground          670 non-null    uint8  
 12  Type1_Ice             670 non-null    uint8  
 13  Type1_Normal          670 non-null    uint8  
 14  Type1_Poison          670 non-null    uint8  
 15  Type1_Psychic        

In [12]:
#check to make sure the columns look clean in testing
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288 entries, 256 to 646
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Popularity            288 non-null    float64
 1   Type1_Bug             288 non-null    uint8  
 2   Type1_Dark            288 non-null    uint8  
 3   Type1_Dragon          288 non-null    uint8  
 4   Type1_Electric        288 non-null    uint8  
 5   Type1_Fairy           288 non-null    uint8  
 6   Type1_Fighting        288 non-null    uint8  
 7   Type1_Fire            288 non-null    uint8  
 8   Type1_Flying          288 non-null    uint8  
 9   Type1_Ghost           288 non-null    uint8  
 10  Type1_Grass           288 non-null    uint8  
 11  Type1_Ground          288 non-null    uint8  
 12  Type1_Ice             288 non-null    uint8  
 13  Type1_Normal          288 non-null    uint8  
 14  Type1_Poison          288 non-null    uint8  
 15  Type1_Psychic        

There were 27 instances where popularity could not be found. This is because the Reddit poll was taken when generation 7 was take in June 2019 while generation 8 was released in November 2019. The Pokemon snap remake included generation 8. Luckily this only makes up for 32 of the 898 Pokemon categories, roughly 3.6% of the overall data.

In [13]:
#do a decision tree classifier based on the training sets
classifier=DecisionTreeClassifier()
classifier=classifier.fit(X_train,y_train)

In [14]:
#use the created prediction model on the testing set
predictions = classifier.predict(X_test)

In [15]:
#check results of the decision tree
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80       226
           1       0.28      0.27      0.28        62

    accuracy                           0.69       288
   macro avg       0.54      0.54      0.54       288
weighted avg       0.69      0.69      0.69       288



In [16]:
#get confusion matrix for counts
print(confusion_matrix(y_test,predictions))

[[182  44]
 [ 45  17]]


In [17]:
#find importance based off classifier
dt_fi = pd.DataFrame(classifier.feature_importances_)
names = pd.DataFrame(list(X.columns))
df_feat_imp = pd.concat([dt_fi, names], axis = 1)

df_feat_imp.columns = ['Importance', 'Features']
df_feat_imp.sort_values('Importance', ascending = False)

Unnamed: 0,Importance,Features
0,0.346158,Popularity
48,0.062252,Evolution_1
50,0.040029,Evolution_3
26,0.030516,Type2_Flying
12,0.029559,Type1_Ice
41,0.028878,Generation_3
54,0.025531,Rarity_Standard
43,0.024432,Generation_5
16,0.023241,Type1_Rock
45,0.021293,Generation_7


Adding the popularity of the pokemon results with a model that is roughly the same as doing a decision tree without it. The Importance figure above shows the how much more it influenced the chance of a Pokemon being in Pokemon snap remake. This is with the 3.6% of Pokemon not included in the poll. The results of this are probably skewed due to some Pokemon having various regional forms. To further improve this model, I could try showing distinct popularity of each Pokemon, but that would require getting rid of unique type combinations.