In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt


import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Importing Data

In [None]:
# Lift pandas' display truncation so every column and every row is shown.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# Load the Pokémon dataset from the Kaggle input directory and preview it.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/pokemon/pokemon.csv')
data.head(n=5)

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
data.shape

In [None]:
# Summarise each column's dtype and non-null count
data.info()

We have 4 columns that contain nulls; let's handle them and decide how to impute the missing values.

In [None]:
# Count missing values per column and keep only the columns that have any
null_counts = data.isna().sum()
null_counts[null_counts != 0]

In [None]:
# Distribution of height (histogram with a KDE overlay)
sns.displot(data=data, x='height_m', kde=True)

In [None]:
# Distribution of the male percentage (histogram with a KDE overlay)
sns.displot(data=data, x='percentage_male', kde=True)

In [None]:
# Distribution of weight (histogram with a KDE overlay)
sns.displot(data=data, x='weight_kg', kde=True)

Our three numerical variables are skewed, so the median is a more robust fill value than the mean.

#### Impute the three numerical variables with median

In [None]:
# Impute each skewed numeric column with its own median.
# The result is assigned back to the column instead of calling
# ``fillna(..., inplace=True)`` on a column selection: that form is chained
# assignment, and under pandas Copy-on-Write it operates on a temporary object
# and leaves ``data`` unchanged.
for column in ('height_m', 'percentage_male', 'weight_kg'):
    data[column] = data[column].fillna(data[column].median())

In [None]:
# Re-count missing values per column after imputation; show only non-zero counts
remaining_nulls = data.isna().sum()
remaining_nulls[remaining_nulls != 0]

In [None]:
# Share of rows that are null, for the columns that still contain nulls
null_counts = data.isna().sum()
null_counts[null_counts != 0] / len(data)

The `type2` column has a large share of null values, so we drop it.

In [None]:
# Remove the secondary-type column from the frame in place
data.drop(columns=['type2'], inplace=True)

# Target Variable Analysis

In [None]:
# Histogram of the target column to show the class balance
sns.histplot(data=data, x='is_legendary')

## Content-Based vs Collaborative Filtering

#### Content-Based

* Content-based filtering involves recommending items based on the attributes of the items themselves. 
* Recommendations made by content-based filters use an individual’s historical information to inform choices displayed. 
* Such recommenders look for similarities between the items to recommend options in the future.  


**Disadvantages**
* The model does not learn from transactions. It will recommend items similar to those already consumed
* There isn’t much improvement in the performance of content-based systems over time

#### Collaborative Filtering

Collaborative filtering uses the combined power of ratings provided by many users/customers to present recommendations.
1. Memory-based methods 
    - User-based collaborative filtering
    - Item-based collaborative filtering
2. Model-based methods 
    - machine learning methods to extract predictions for rating data by treating the problem as a normal machine learning problem. (NN, PCA and Clustering).

**Disadvantages**
* Cold-start for new users
* New-item problem

* In our problem, we want to know which Pokémon are legendary. We can infer this by comparing the features of every Pokémon: those most similar to known legendaries are likely to be legendary themselves.
* To achieve this, we will use a content-based approach on the numerical variables.

# Label Encoding

In [None]:
# Convert every non-numeric column to pandas' categorical dtype.
non_numeric_cols = [
    col for col in data.columns
    if not pd.api.types.is_numeric_dtype(data[col])
]
for col in non_numeric_cols:
    data[col] = data[col].astype('category')

In [None]:
# Replace every categorical column with its integer codes.
# ``isinstance(dtype, pd.CategoricalDtype)`` is the supported replacement for
# the deprecated ``pd.api.types.is_categorical_dtype``.
# Codes are shifted by +1 so that missing values (code -1) become 0 and real
# categories start at 1.
for label, content in data.items():
    if isinstance(content.dtype, pd.CategoricalDtype):
        data[label] = pd.Categorical(content).codes + 1

In [None]:
# Preview the first rows after label encoding
data.head()

*To do*
- similar names may indicate something related to legendary

In [None]:
# Show the first 20 rows flagged as legendary
data[data['is_legendary'] == 1].head(20)

# Split the data

In [None]:
# Separate the feature matrix from the target column.
target_column = 'is_legendary'
X = data.drop(columns=target_column)
y = data[target_column]  # target variable values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# ``random_state`` makes the split reproducible between runs, and
# ``stratify=y`` keeps the proportion of legendary Pokémon the same in the
# training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Modeling

## Content-Based

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel  # linear_kernel kept importable for other cells

# Pairwise cosine similarity between all Pokémon (one row per Pokémon).
# ``linear_kernel`` returns raw dot products, which equal cosine similarity
# only when the rows are already L2-normalised; these feature rows are not,
# so ``cosine_similarity`` is used to normalise them.
# The target column ``is_legendary`` is left out so that similarity is driven
# by the features only.
cosine_similarities = cosine_similarity(data.drop(columns='is_legendary'))

In [None]:
# Display the pairwise similarity matrix
cosine_similarities

Now let's create a function that takes a Pokémon's name and returns the most similar Pokémon, so we can check whether its nearest neighbours are legendary.

In [None]:
# Map each Pokémon name to its row index in `data`.
# NOTE(review): presumably `name` was label-encoded by the encoding cell
# earlier, so the keys here would be integer codes rather than strings — confirm.
mapping = pd.Series(data.index,index = data['name'])

In [None]:
def recommend_pokemon(name, top_n=10):
    """Return the names of the Pokémon most similar to the given one.

    Parameters
    ----------
    name
        Key of the query Pokémon in ``mapping`` (a value of ``data['name']``).
    top_n : int, default 10
        Number of similar Pokémon to return.

    Returns
    -------
    pandas.Series
        ``data['name']`` for the ``top_n`` most similar rows, ordered from
        most to least similar. The query Pokémon itself is never included.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``mapping``.
    """
    # Row of the query Pokémon in the similarity matrix.
    # NOTE(review): this uses the index label from ``mapping`` as a row
    # position, which assumes ``data`` has a default 0..n-1 index — confirm.
    poke_index = mapping[name]

    # (row, similarity) pairs for every other Pokémon. The query row is
    # excluded explicitly rather than by assuming it sorts first.
    similarity_score = [
        (row, score)
        for row, score in enumerate(cosine_similarities[poke_index])
        if row != poke_index
    ]

    # Highest similarity first.
    similarity_score.sort(key=lambda pair: pair[1], reverse=True)

    # Translate the best-scoring rows back to Pokémon names.
    pokemon_indices = [row for row, _ in similarity_score[:top_n]]
    return data['name'].iloc[pokemon_indices]

In [None]:
# List the Pokémon most similar to the one keyed 793 in `mapping`
# NOTE(review): 793 is presumably a label-encoded name — confirm.
recommend_pokemon(793)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier. A fixed ``random_state`` makes the fitted forest, and
# therefore the reported score, reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
rf.score(X_test, y_test)

In [None]:
## TO DO
# check the classification score of the RF Model
# may use f1 score
# look for better tree parameters using RandomizedSearchCV or GridSearchCV