In [2]:
%matplotlib inline

import csv
import math
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

plt.style.use('fivethirtyeight')
#pytrends request client, built with hl='en-US' and tz=360
#NOTE(review): presumably hl is the host language and tz the timezone offset in minutes -- confirm against the pytrends docs
#NOTE(review): pytrend is not referenced in the cells visible here -- presumably used later in the notebook; confirm
pytrend = TrendReq(hl='en-US', tz=360)

In [3]:
#import PokemonData using pandas
#including popularity of each pokemon found in a reddit thread
POKEMON_CSV_PATH = 'SourceData/PokemonDataV2 - DistinctCheck.csv'

pokemonData = pd.read_csv(POKEMON_CSV_PATH, header=0)

#also keep the raw rows (header row included) as lists of strings.
#the encoding is spelled out because open() otherwise falls back to the platform
#locale, which can mis-decode the accented header (e.g. 'Pokémon' read as 'PokÃ©mon'),
#while pandas decodes the same file as UTF-8 by default.
#csv.reader is used instead of str.split(",") so quoted fields that contain
#commas stay in one piece; newline="" is what the csv module expects from open().
with open(POKEMON_CSV_PATH, "r", encoding="utf-8", newline="") as data_file:
    pokemonDataUnmodified = list(csv.reader(data_file))

In [4]:
#peek at the first nine raw rows (the header row followed by eight data rows)
pokemonDataUnmodified[0:9]

[['Ndex',
  'PokÃ©mon',
  'Variance',
  'Generation',
  'Evolution',
  'Rarity',
  'Popularity',
  'NewSnap'],
 ['1', 'Bulbasaur', '', '1', '1', 'Starter', '710', '1'],
 ['2', 'Ivysaur', '', '1', '2', 'Starter', '83', '0'],
 ['3', 'Venusaur', '', '1', '3', 'Starter', '127', '1'],
 ['4', 'Charmander', '', '1', '1', 'Starter', '374', '1'],
 ['5', 'Charmeleon', '', '1', '2', 'Starter', '70', '0'],
 ['6', 'Charizard', '', '1', '3', 'Starter', '1107', '1'],
 ['7', 'Squirtle', '', '1', '1', 'Starter', '523', '1'],
 ['8', 'Wartortle', '', '1', '2', 'Starter', '133', '0']]

In [5]:
#delete row with headers
#only delete while the first row is still the header ('Ndex' in its first field),
#so re-running this cell cannot silently remove a real data row

if pokemonDataUnmodified and pokemonDataUnmodified[0][:1] == ['Ndex']:
    del pokemonDataUnmodified[0]
pokemonDataUnmodified[0:9]

[['1', 'Bulbasaur', '', '1', '1', 'Starter', '710', '1'],
 ['2', 'Ivysaur', '', '1', '2', 'Starter', '83', '0'],
 ['3', 'Venusaur', '', '1', '3', 'Starter', '127', '1'],
 ['4', 'Charmander', '', '1', '1', 'Starter', '374', '1'],
 ['5', 'Charmeleon', '', '1', '2', 'Starter', '70', '0'],
 ['6', 'Charizard', '', '1', '3', 'Starter', '1107', '1'],
 ['7', 'Squirtle', '', '1', '1', 'Starter', '523', '1'],
 ['8', 'Wartortle', '', '1', '2', 'Starter', '133', '0'],
 ['9', 'Blastoise', '', '1', '3', 'Starter', '410', '1']]

In [6]:
#remove columns that will not be used. name isn't needed and Ndex is a primary key (more or less)
pokemonData = pokemonData.drop(columns=['Pokémon', 'Ndex'])

#prefix every categorical value with the name of its column
#this will be VERY useful later on (the dummy columns inherit these labels)
#Generation and Evolution are cast to str first; Variance and Rarity are concatenated as they are
pokemonData = pokemonData.assign(
    Variance='Variance_' + pokemonData['Variance'],
    Generation='Generation_' + pokemonData['Generation'].astype(str),
    Evolution='Evolution_' + pokemonData['Evolution'].astype(str),
    Rarity='Rarity_' + pokemonData['Rarity'],
)

#change the NewSnap column to 0/1, only include initially included in game (no DLC)
def newSnapCheck(data):
    """Return 1 when the row's 'NewSnap' value equals 1, otherwise 0."""
    return 1 if data['NewSnap'] == 1 else 0

#fill in nan values in popularity because decision tree needs values on all spots. going with zero. 
def popularityCheck(data):
    """Return the row's 'Popularity' value, substituting 0 when it is NaN."""
    popularity = data['Popularity']
    if not math.isnan(popularity):
        return popularity
    return 0

#run both row-wise checks and write their results back over the original columns
pokemonData = pokemonData.assign(
    NewSnap=pokemonData.apply(newSnapCheck, axis='columns'),
    Popularity=pokemonData.apply(popularityCheck, axis='columns'),
)

In [7]:
#do one hot encoding (aka dummy variables in pandas) on categorical columns

#pull each categorical column out as its own Series
variance_occurance = pd.Series(pokemonData['Variance'])
generation_occurance = pd.Series(pokemonData['Generation'])
evolution_occurance = pd.Series(pokemonData['Evolution'])
rarity_occurance = pd.Series(pokemonData['Rarity'])

#one indicator column per distinct value in each Series
variance_dummy = pd.get_dummies(variance_occurance)
generation_dummy = pd.get_dummies(generation_occurance)
evolution_dummy = pd.get_dummies(evolution_occurance)
rarity_dummy = pd.get_dummies(rarity_occurance)

In [8]:
#attach the dummy variables (side by side, in the same order as before)
pokemonData = pd.concat(
    [pokemonData, variance_dummy, generation_dummy, evolution_dummy, rarity_dummy],
    axis=1,
)

#remove the categorical columns now that their dummy columns are in place
pokemonData = pokemonData.drop(columns=['Variance', 'Generation', 'Evolution', 'Rarity'])

In [9]:
#preview the first nine rows of the encoded frame
pokemonData.head(9)

Unnamed: 0,Popularity,NewSnap,Variance_Alohan,Variance_Galarian,Variance_Multiple,Generation_1,Generation_2,Generation_3,Generation_4,Generation_5,...,Evolution_0,Evolution_1,Evolution_2,Evolution_3,Rarity_Ancient,Rarity_Legendary,Rarity_Mythical,Rarity_Standard,Rarity_Starter,Rarity_Sub-Legendary
0,710.0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,83.0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,127.0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,374.0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,70.0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,1107.0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
6,523.0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
7,133.0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
8,410.0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [10]:
#separate the NewSnap column to test for it:
#X keeps every other column as features, y is the NewSnap target
X = pokemonData.drop(columns=['NewSnap'])
y = pokemonData['NewSnap']

In [11]:
#split the data for training/testing (30% held out for testing, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [12]:
#check to make sure the columns look clean in training
#(info() lists each column with its non-null count and dtype)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 628 entries, 246 to 132
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Popularity            628 non-null    float64
 1   Variance_Alohan       628 non-null    uint8  
 2   Variance_Galarian     628 non-null    uint8  
 3   Variance_Multiple     628 non-null    uint8  
 4   Generation_1          628 non-null    uint8  
 5   Generation_2          628 non-null    uint8  
 6   Generation_3          628 non-null    uint8  
 7   Generation_4          628 non-null    uint8  
 8   Generation_5          628 non-null    uint8  
 9   Generation_6          628 non-null    uint8  
 10  Generation_7          628 non-null    uint8  
 11  Generation_8          628 non-null    uint8  
 12  Evolution_0           628 non-null    uint8  
 13  Evolution_1           628 non-null    uint8  
 14  Evolution_2           628 non-null    uint8  
 15  Evolution_3          

In [13]:
#check to make sure the columns look clean in testing
#(info() lists each column with its non-null count and dtype)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 186 to 311
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Popularity            270 non-null    float64
 1   Variance_Alohan       270 non-null    uint8  
 2   Variance_Galarian     270 non-null    uint8  
 3   Variance_Multiple     270 non-null    uint8  
 4   Generation_1          270 non-null    uint8  
 5   Generation_2          270 non-null    uint8  
 6   Generation_3          270 non-null    uint8  
 7   Generation_4          270 non-null    uint8  
 8   Generation_5          270 non-null    uint8  
 9   Generation_6          270 non-null    uint8  
 10  Generation_7          270 non-null    uint8  
 11  Generation_8          270 non-null    uint8  
 12  Evolution_0           270 non-null    uint8  
 13  Evolution_1           270 non-null    uint8  
 14  Evolution_2           270 non-null    uint8  
 15  Evolution_3          

In [14]:
#do a decision tree classifier based on the training sets
#random_state is pinned so a fresh Restart & Run All rebuilds the same tree;
#without it scikit-learn may break ties between equally good splits differently
#on each run, which changes the metrics and feature importances reported below
classifier = DecisionTreeClassifier(random_state=25).fit(X_train, y_train)

In [15]:
#use the fitted decision tree to predict the target for every row of the testing set
predictions = classifier.predict(X_test)

In [16]:
#check results of the decision tree: compare the predictions against the true test labels
#(the report gives precision, recall, f1-score and support per class, plus overall accuracy)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       201
           1       0.42      0.32      0.36        69

    accuracy                           0.71       270
   macro avg       0.60      0.58      0.59       270
weighted avg       0.69      0.71      0.70       270



In [17]:
#get confusion matrix for counts of correct and incorrect predictions per class
#NOTE(review): rows look like the true class and columns the predicted class -- confirm against the scikit-learn docs
print(confusion_matrix(y_test,predictions))

[[170  31]
 [ 47  22]]


In [18]:
#find importance based off classifier
#one single-column frame for the feature names, one for the fitted importances
names = pd.DataFrame(list(X.columns))
dt_fi = pd.DataFrame(classifier.feature_importances_)

#place them side by side (importance first), then label the two columns
df_feat_imp = pd.concat([dt_fi, names], axis='columns')
df_feat_imp.columns = ['Importance', 'Features']

#most important features at the top
df_feat_imp.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance,Features
0,0.497335,Popularity
13,0.078698,Evolution_1
14,0.05962,Evolution_2
8,0.047413,Generation_5
6,0.037072,Generation_3
7,0.032649,Generation_4
20,0.031113,Rarity_Starter
5,0.030331,Generation_2
12,0.02785,Evolution_0
15,0.023546,Evolution_3


Removing the variant typings to get distinct Pokemon improved the accuracy by 2%. To improve on this further, the model may contain additional features that it does not need. Given that the importance of Variance_Multiple was 0.00, we can assume it can be dropped. To find everything that can be dropped, I'm going to use Lasso regression to identify which features have statistical significance.