In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the Pokedex dataset from the Kaggle input directory into a DataFrame.
pokedex_csv_path = '/kaggle/input/complete-pokemon-dataset-updated-090420/pokedex_(Update_05.20).csv'
df_pokemon = pd.read_csv(pokedex_csv_path)

In [None]:
# Inspect the column labels of the loaded dataset.
print(df_pokemon.columns)

In [None]:
# Count the missing (null) values in each column and print the per-column totals.
missing_per_column = df_pokemon.isna().sum()
print(missing_per_column)

In [None]:
# Fill missing values in the columns that may be needed later:
# text columns get an empty string, numeric columns get 0.
text_fill_columns = ['type_2', 'ability_1', 'ability_2', 'ability_hidden',
                     'growth_rate', 'egg_type_1', 'egg_type_2']
numeric_fill_columns = ['weight_kg', 'catch_rate', 'base_friendship',
                        'base_experience', 'percentage_male', 'egg_cycles']

for fill_value, fill_columns in (('', text_fill_columns), (0, numeric_fill_columns)):
    for column in fill_columns:
        df_pokemon[column] = df_pokemon[column].fillna(fill_value)

In [None]:
# One-hot encode type_1 (pd.get_dummies yields one indicator column per distinct value).
df_onehot_type_1 = pd.get_dummies(df_pokemon['type_1'])

In [None]:
# Display the one-hot encoded type_1 frame to inspect the encoding.
df_onehot_type_1

In [None]:
# Select the hand-picked numeric columns that will be correlated against type_1.
numeric_columns = [
    'height_m', 'weight_kg',
    'total_points', 'hp', 'attack', 'defense',
    'base_friendship', 'base_experience',
    'sp_attack', 'sp_defense', 'speed',
    'catch_rate', 'percentage_male', 'egg_cycles',
]
df_corr_column = df_pokemon[numeric_columns]

In [None]:
# Display the selected numeric columns.
df_corr_column

In [None]:
# Append the one-hot encoded type_1 columns to the right of the numeric columns.
# NOTE(review): this rebinds df_corr_column from its own previous value, so
# re-running the cell appends the type_1 columns a second time.
df_corr_column = pd.concat(objs=[df_corr_column, df_onehot_type_1], axis='columns')

In [None]:
# Display the combined frame (numeric columns followed by the type_1 indicators).
df_corr_column

In [None]:
# List the column labels of the correlation matrix as a NumPy array.
df_corr_column.corr().columns.to_numpy()

In [None]:
# For each of the 14 numeric columns ('height_m', 'weight_kg', 'total_points',
# 'hp', 'attack', 'defense', 'base_friendship', 'base_experience', 'sp_attack',
# 'sp_defense', 'speed', 'catch_rate', 'percentage_male', 'egg_cycles'),
# sum its correlation coefficients against every one-hot encoded type_1 column.
# e.g. for 'height_m': corr(height_m, Bug) + corr(height_m, Dark) + ... + corr(height_m, Water)
# The result is one aggregate "correlation with type_1" value per numeric column.

# Compute the correlation matrix once; it does not change between iterations.
corr_matrix = df_corr_column.corr()

# df_corr_column is laid out as [14 numeric columns | type_1 indicator columns],
# so the type_1 block starts at column index 14 and runs through the LAST column.
# (The previous slice [15:-1] silently dropped the first and the last type.)
n_numeric_columns = 14
type_corr_block = corr_matrix.values[:n_numeric_columns, n_numeric_columns:]

# One summed value per numeric column, kept as a plain list for later cells.
sum_of_corr = [np.sum(type_corr_row) for type_corr_row in type_corr_block]

In [None]:
# Display the summed correlation of each numeric column with the type_1 indicators.
sum_of_corr

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Show each numeric column's summed correlation with type_1
# as an annotated single-column heatmap.

column_name = [
    'height_m', 'weight_kg',
    'total_points', 'hp', 'attack', 'defense',
    'base_friendship', 'base_experience',
    'sp_attack', 'sp_defense', 'speed',
    'catch_rate', 'percentage_male', 'egg_cycles',
]

# One row per numeric column, one value column labelled 'type_1'.
df_cm = pd.DataFrame(data=sum_of_corr, index=column_name, columns=['type_1'])

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(data=df_cm, annot=True, ax=ax)



In [None]:

# One-hot encode the generation column.
df_generation = pd.get_dummies(df_pokemon['generation'])

# Feature set: the numeric columns judged to correlate strongly with type_1,
# together with the categorical columns 'status', 'species', 'ability_1',
# 'growth_rate', 'egg_type_1' and 'egg_type_2'.
feature_columns = [
    'status', 'species',
    'weight_kg', 'ability_1', 'total_points', 'hp', 'attack', 'defense',
    'sp_attack', 'sp_defense', 'growth_rate', 'egg_type_1',
    'egg_type_2', 'egg_cycles',
]

# One-hot encode the selected features, then append the generation
# indicator columns on the right.
df_features_encoded = pd.get_dummies(df_pokemon[feature_columns])
df_train = pd.concat([df_features_encoded, df_generation], axis=1)


In [None]:
# Display the assembled feature frame.
df_train

In [None]:
# One-hot encode type_1 and keep the result as a NumPy array; it is used as the label.
target = pd.get_dummies(df_pokemon['type_1']).to_numpy()

In [None]:
# Display the one-hot label array.
target

In [None]:
# Check whether the feature matrix contains any NaN.
# The frame is cast to float first: when df_train mixes numeric columns with
# boolean indicator columns, `.values` is an object array and np.isnan raises
# TypeError on it.
np.isnan(df_train.to_numpy(dtype=float)).any()

In [None]:
# Check that EVERY value in the feature matrix is finite (no NaN and no +/-inf).
# The frame is cast to float first: when df_train mixes numeric columns with
# boolean indicator columns, `.values` is an object array and np.isfinite
# raises TypeError on it.
np.isfinite(df_train.to_numpy(dtype=float)).all()

In [None]:
# Hold out 30% of the samples as the test set, shuffling before the split.
X_train, X_test, y_train, y_test = train_test_split(df_train.values, target, test_size=0.3, random_state=3000, shuffle=True)

# Decision tree limited to a maximum depth of 20.
# random_state is fixed so the fitted tree (and every score printed below) is
# reproducible across runs; without it, ties between equally good splits are
# broken randomly and the results change from run to run.
clf = DecisionTreeClassifier(max_depth=20, random_state=3000)

# 5-fold cross-validation with shuffling (seeded for reproducibility).
kfold = KFold(n_splits=5, random_state=3000, shuffle=True)

# Fit directly on the training split.
clf_fit = clf.fit(X_train, y_train)

# Cross-validate on the full dataset, also recording the training-fold scores.
cv_result = cross_validate(clf, df_train.values, target, cv=kfold, scoring='accuracy', return_train_score=True)


print('fit score : ' + str(clf_fit.score(X_test, y_test)))
print('cross_validate train score : ' + str(cv_result['train_score'].mean()))
print('cross_validate test score : ' + str(cv_result['test_score'].mean()))

In [None]:
# Visualize the decision paths of the fitted tree as a graph.

import graphviz 
from sklearn import tree

feature_names = df_train.columns

# Class names must be listed in ascending class order. Sorting the distinct
# type_1 values gives a deterministic order that lines up with the columns
# produced by pd.get_dummies; list(set(...)) yields an arbitrary order that can
# attach the wrong name to a class. NaN is dropped so sorting never mixes types.
# NOTE(review): the scikit-learn docs say class_names is not supported for
# multi-output trees, and clf_fit is trained on the 2-D one-hot target, so the
# names may not be rendered at all -- confirm.
target_names = sorted(df_pokemon['type_1'].dropna().unique())

dot_data = tree.export_graphviz(clf_fit, out_file=None, 
                                feature_names=feature_names,
                                class_names=target_names,  
                                filled=True, rounded=True)

graph = graphviz.Source(dot_data)  
graph.render("pokemon") # save as PDF