# Classification of Legendary Pokémon with a Random Forest

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.model_selection import FeatureImportances


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input
# directory, in the order os.walk visits them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/pokemon/Pokemon.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# Exploratory Analysis

In [None]:
# Show summary statistics of the DataFrame's columns.
df.describe()

In [None]:
# Count of rows per value of the 'Type 1' column.
# The column is passed as ``x=``: seaborn 0.12+ treats the first positional
# argument as ``data`` rather than as the variable to count.
sns.countplot(x=df['Type 1'])


In [None]:
# Count of rows per value of the 'Type 2' column.
# The column is passed as ``x=``: seaborn 0.12+ treats the first positional
# argument as ``data`` rather than as the variable to count.
sns.countplot(x=df['Type 2'])


In [None]:
# Count of rows per value of the 'Generation' column.
# The column is passed as ``x=``: seaborn 0.12+ treats the first positional
# argument as ``data`` rather than as the variable to count.
sns.countplot(x=df['Generation'])


In [None]:
# Count of rows per value of the 'Legendary' column (the classification target).
# The column is passed as ``x=``: seaborn 0.12+ treats the first positional
# argument as ``data`` rather than as the variable to count.
sns.countplot(x=df['Legendary'])

In [None]:
# Encode every boolean column (presumably just the 'Legendary' target) as
# 0/1 integers. An explicit dtype cast is used instead of a frame-wide
# ``replace({False: 0, True: 1})``: the latter depends on pandas silently
# downcasting the replaced column back to an integer dtype, which is
# deprecated behaviour in recent pandas releases.
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype('int64')

In [None]:
# Show the column labels of the DataFrame.
df.columns

In [None]:
# Remove, in place, the columns that are not kept for the rest of the notebook.
unused_columns = ['#', 'Name', 'Type 1', 'Type 2', 'Total', 'Generation']
df.drop(columns=unused_columns, inplace=True)


# Methods to evaluate the model

In [None]:
def matriz_confusao(real, predito):
    """Plot the confusion matrix of true versus predicted labels as a heat map.

    Each cell is annotated with its count. The plot is drawn on a new
    matplotlib figure; nothing is returned.

    Args:
        real: True class labels (1-D array-like).
        predito: Predicted class labels, aligned element-wise with ``real``.
    """
    # Derive the label set from the arguments themselves (np.unique returns
    # them sorted) and pass it to confusion_matrix explicitly, so the tick
    # labels always line up with the matrix rows/columns. Reading the labels
    # from a global DataFrame in order of appearance could misorder them or
    # yield a different number of labels than the matrix has rows.
    classes = np.unique(np.concatenate((np.ravel(real), np.ravel(predito))))
    matriz = metrics.confusion_matrix(real, predito, labels=classes)

    fig, ax = plt.subplots()
    im = ax.imshow(matriz, cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(matriz.shape[1]),
           yticks=np.arange(matriz.shape[0]),
           xticklabels=classes, yticklabels=classes,
           # Title literal repaired from mis-decoded UTF-8 ("confusÃ£o").
           title='Matriz de confusão',
           ylabel='True label',
           xlabel='Predicted label')

    # Cells above half of the largest count are drawn in the dark end of the
    # colour map, so they get white text. The previous threshold was the
    # maximum itself, which no cell can exceed, leaving every label black.
    thresh = matriz.max() / 2
    for i, j in np.ndindex(matriz.shape):
        ax.text(j, i, format(matriz[i, j]),
                ha="center", va="center",
                color="white" if matriz[i, j] > thresh else "black")
    fig.tight_layout()
    

In [None]:
def features_importances(modelo, X, y):
    """Fit and display a yellowbrick ``FeatureImportances`` chart.

    Args:
        modelo: Estimator wrapped by the ``FeatureImportances`` visualizer.
        X: Feature data passed to the visualizer's ``fit``.
        y: Target values passed to the visualizer's ``fit``.
    """
    visualizer = FeatureImportances(modelo)
    visualizer.fit(X, y)
    visualizer.show()

# Train Model

In [None]:
# Features: the HP, Attack, Defense and Speed columns; target: 'Legendary'.
X = df[['HP', 'Attack', 'Defense', 'Speed']]
y = df.Legendary

# Hold out 20% of the rows for testing. The split is stratified on the target
# so that the train and test sets keep the same class proportions as the full
# data; without it the share of each class in the test set is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2811, stratify=y)

print ('Set de Treino:', X_train.shape,  y_train.shape)
print ('Set de Teste:', X_test.shape,  y_test.shape)

In [None]:
# Seed the forest so its randomised training (bootstrap samples, feature
# sub-sampling) gives the same model on every run, consistent with the seeded
# train/test split.
rfc = RandomForestClassifier(random_state=2811)
rfc.fit(X_train, y_train)


In [None]:
# Predict the class of every row of the held-out test features.
rfc_predict = rfc.predict(X_test)


In [None]:
# Per-class recall and F1 (average=None keeps one score per class),
# reported as percentages.
recall_per_class = metrics.recall_score(y_test, rfc_predict, average=None)
f1_per_class = metrics.f1_score(y_test, rfc_predict, average=None)

print("Model recall: ", recall_per_class * 100)
print('F1 score: ', f1_per_class * 100)



In [None]:
# Plot the confusion matrix of the test labels against the forest's predictions.
matriz_confusao(y_test,rfc_predict)

In [None]:
# Print the label on its own line: the report is a multi-line, column-aligned
# table, and text in front of its first row would push the header out of
# alignment with the rows below it.
print('Classification report: ',
      metrics.classification_report(y_test, rfc_predict),
      sep='\n')


In [None]:
# Plot the forest's feature importances, passing the training split to the visualizer.
features_importances(rfc,X_train,y_train)