## Predicting Pokemon Type Given Their Name with GLOVE

# Exploring the Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the Pokemon dataset from the Kaggle input directory and list its columns.
df = pd.read_csv('/kaggle/input/pokemon/Pokemon.csv')
df.columns

In [None]:
# Keep only the two columns used below: the name (input) and the primary type (target).
df = df[['Name','Type 1']]

In [None]:
# Preview the first rows of the reduced frame.
df.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Count each primary type once and reuse the result for the plot and the table.
type_counts = df['Type 1'].value_counts()

# Label every bar with its type name: bar positions alone do not say which
# type is which. value_counts() orders by frequency, so bars are descending.
x = np.arange(len(type_counts))
plt.bar(x, type_counts)
plt.xticks(x, type_counts.index, rotation=90)
plt.show()

type_counts

## Making Naïve Predictions

If we always predicted the most common type ("water"), our accuracy would be:

In [None]:
# value_counts() sorts by frequency, so the first row is the majority class.
# Use .iloc for positional access: plain [0] on a label-indexed Series relies
# on a deprecated positional fallback in recent pandas.
print(df['Type 1'].value_counts().iloc[0]/df.shape[0])

I compute TF-IDF features over the letters of each name.

In [None]:
# The 26 lowercase letters whose occurrences are counted in each name.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

# Lowercase every name so the letter counts are case-insensitive.
df['Name'] = df['Name'].map(str.lower)

def apparition(s,c):
    """Return how many items of ``s`` compare equal to ``c``."""
    return sum(1 for item in s if item == c)

# Per-letter occurrence counts for every name (column str(i)), plus the
# corpus-wide total for each letter.
apparitions = []
for i,c in enumerate(alphabet):
    # `args` must be a tuple; the bare string only worked because a
    # one-character string happens to unpack into a single argument.
    df[str(i)] = df['Name'].apply(apparition, args=(c,))
    apparitions.append(sum(df[str(i)]))

letters = sum(apparitions)

# Name lengths do not depend on the letter, so compute them once.
name_lengths = df['Name'].apply(len)

for i,c in enumerate(alphabet):
    # Term frequency: share of the name's characters equal to letter `c`.
    # (Named `term_freq` rather than `tf`, the alias tensorflow is imported under later.)
    term_freq = df[str(i)]/name_lengths
    # Inverse frequency: letters that are rarer across all names weigh more.
    idf = letters/apparitions[i]
    df['tfidf'+str(i)] = term_freq*idf
df.head()

Prediction using logistic regression

In [None]:
# Features: one TF-IDF column per letter. Target: the primary type.
X = df[['tfidf' + str(position) for position in range(len(alphabet))]]
y = df['Type 1']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the names for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Logistic-regression classifier on the TF-IDF letter features. C is the inverse
# regularisation strength, so C=0.1 regularises fairly strongly; max_iter is
# raised to give the solver room to converge.
reg = LogisticRegression(max_iter = 10000, C=0.1)

In [None]:
# Fit on the training split only.
reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the training split first, then on the held-out validation split.
y_pred_train = reg.predict(X_train)
y_pred = reg.predict(X_val)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_val, y_pred))

That's not much better than always guessing "water", and it seems hard to reduce the overfitting. Let's try to do better by using a corpus of text.

# Using GLOVE to make predictions using a LSTM trained to recognize synonyms of the types (Unsuccessful)

In [None]:
import re

In [None]:
import gensim.downloader as api
# Load the pretrained "glove-wiki-gigaword-100" word vectors through gensim's downloader.
glove = api.load("glove-wiki-gigaword-100")
# Sanity check: the five nearest neighbours of "fire" as (word, similarity) pairs.
bests = glove.most_similar("fire", topn= 5)
print(bests)

In [None]:

# Same query as above, printed as one "word similarity" pair per line.
bests = glove.most_similar("fire", topn=5)
print(' '.join(str(word) + ' ' + str(similarity) + '\n'
               for word, similarity in bests))

In [None]:
# A much wider neighbourhood: the 500 words closest to "flying", words only.
bests = glove.most_similar("flying", topn=500)
print(' '.join(str(word) for word, _similarity in bests))

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, LSTM, Input

In [None]:
# Width of a one-hot character vector: 26 letters plus the '#' padding symbol
# (must match the length of the padded `alphabet` defined further down).
vocab_size = 27
# Number of distinct primary types, used as the width of the softmax output.
pk_types = len(df['Type 1'].unique())
# Longest name in the frame, used as the fixed sequence length of the LSTM input.
max_name_length = df['Name'].map(len).max()

In [None]:
import tensorflow as tf

In [None]:
# NOTE: the triple-quoted string below is disabled TPU set-up code kept as a bare
# string expression; it is never executed.
'''tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():'''
# Input: one one-hot vector of width vocab_size per character position.
inputs = Input(shape=(max_name_length,vocab_size))
# Two stacked LSTMs; the first returns the whole sequence so the second can consume it.
# NOTE(review): `X` is rebound here and no longer refers to the TF-IDF feature frame.
X = LSTM(32, return_sequences=True, recurrent_dropout = 0.3 , dropout = 0.3)(inputs)
X = LSTM(32, recurrent_dropout = 0.2 , dropout = 0.2)(X)
# Softmax over the Pokemon types.
X = Dense(pk_types, activation='softmax')(X)
model = Model(inputs=inputs, outputs=X)

# Categorical cross-entropy expects one-hot targets, which is how the label arrays are built below.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics = ['accuracy'])

In [None]:
# Ordered list of type names; a type's position in this list is its class index.
types = df['Type 1'].unique().tolist()
print(types)

In [None]:
# Character vocabulary for the one-hot encoding: index 0 is the '#' padding symbol,
# followed by the 26 letters. This rebinds the 26-letter `alphabet` used for TF-IDF.
alphabet = '#abcdefghijklmnopqrstuvwxyz'

In [None]:
def clean(s):
    """Lowercase ``s`` and drop every character outside ``a``-``z``."""
    lowered = s.lower()
    return re.sub(r'[^a-z]', '', lowered)

def left_pad_and_clean(s):
    """Clean ``s`` and force it to exactly ``max_name_length`` characters.

    Longer strings are truncated on the right; shorter ones are padded on the
    left with '#'.
    """
    trimmed = clean(s)[:max_name_length]
    return trimmed.rjust(max_name_length, '#')

In [None]:
# Quick manual check of the two helpers on strings with spaces, accents and punctuation.
print(left_pad_and_clean('  heyé--'))
print(clean('dej JF,'))

In [None]:
def str_to_vec(s):
    """One-hot encode ``s`` as a ``(max_name_length, vocab_size)`` array.

    The string is cleaned and left-padded first; row ``i`` gets a 1 in the
    column given by the position of character ``i`` in ``alphabet``.
    """
    padded = left_pad_and_clean(s)
    one_hot = np.zeros(shape=(max_name_length, vocab_size))
    for position, char in enumerate(padded):
        # Debug aid: show any character that is missing from the alphabet
        # just before the lookup below fails on it.
        if char not in alphabet:
            print(char)
        one_hot[position, alphabet.index(char)] = 1
    return one_hot

In [None]:
# Shape check: should be (max_name_length, vocab_size).
str_to_vec('azerty').shape

In [None]:
# Build the auxiliary "synonym" dataset: for every type, its n-1 nearest GloVe
# neighbours plus the type word itself, one-hot encoded and labelled with the type.
n = 500
N = n*len(types)
stX = np.zeros(shape=(N,max_name_length,vocab_size))
stXweight = np.zeros(shape=(N,1))
sty = np.zeros(shape=(N,len(types)))
act_i = 0  # next free row in the three arrays
for j,t in enumerate(types):
    ms = glove.most_similar(clean(t), topn=n-1)
    # Neighbours are weighted by their similarity; the type word itself gets weight 1.
    close_words = [clean(str(word)) for word, _ in ms] + [clean(t)]
    close_words_w = [similarity for _, similarity in ms] + [1]
    for w, weight in zip(close_words, close_words_w):
        stX[act_i,:,:] = str_to_vec(w)
        stXweight[act_i,0] = weight
        sty[act_i,j] = 1
        act_i += 1
    


In [None]:
# Shape check of the synonym inputs and their one-hot labels.
print(stX.shape, sty.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Shuffle and hold out 10% of the synonym words (inputs, weights, labels) for validation.
(stX_train, stX_val,
 stXweight_train, stXweight_val,
 sty_train, sty_val) = train_test_split(
    stX, stXweight, sty,
    shuffle=True, test_size=0.1, random_state=1,
)

In [None]:
# Train the LSTM on the synonym words, weighting each word by its similarity to its type.
model.fit(
    stX_train,
    sty_train,
    sample_weight=stXweight_train,
    batch_size=128,
    epochs=20,
    validation_data=(stX_val, sty_val, stXweight_val),
    verbose=1,
)

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so r[1] is the accuracy.
# Validation split first...
r = model.evaluate(stX_val, sty_val,sample_weight=stXweight_val)
print(r[1])
# ...then the training split, to gauge overfitting.
r = model.evaluate(stX_train, sty_train,sample_weight=stXweight_train)
print(r[1])

A lot of overfitting, but this might not be a problem. I want the model to "remember" the words associated with water. But I still want the model to generalize to imaginary words close to the original concept. Let's see if that's the case.

## Direct prediction

In [None]:
# One-hot encode every Pokemon name and its primary type.
N = df.shape[0]
diX = np.zeros(shape=(N,max_name_length,vocab_size))
diy = np.zeros(shape=(N,len(types)))
# Enumerate positions explicitly instead of reusing the DataFrame index label:
# the label only equals the row position while df keeps its default 0..N-1 index.
for position, (pokemon_name, pokemon_type) in enumerate(zip(df['Name'], df['Type 1'])):
    diX[position,:,:] = str_to_vec(pokemon_name)
    diy[position,types.index(pokemon_type)] = 1

In [None]:
# Evaluate the synonym-trained LSTM directly on the Pokemon names; r[1] is the accuracy.
print(diX.shape)
r = model.evaluate(diX, diy)
print(r[1])

Wut :'(

In [None]:
# Predicted type probabilities for the Pokemon names and for the held-out synonym words.
ypred = model.predict(diX)
ypred_words = model.predict(stX_val)

In [None]:
def amax(array):
    """Return the index of the largest element, via ``array.argmax()``."""
    winner = array.argmax()
    return winner

# Overlay the predicted class indices for synonym words (b) and Pokemon names (a).
b = ypred_words.argmax(axis=1)
plt.hist(b, alpha = 0.5)
a = ypred.argmax(axis=1)
plt.hist(a, alpha = 0.5)

Pokemons have only psychic names...

## Indirect Prediction

I fit a second classifier that maps the LSTM's predicted type probabilities for each Pokémon name to the Pokémon's actual type.

In [None]:
# Features: the LSTM's type probabilities per name. Target: the true class index,
# recovered from the one-hot labels.
X_of_predictor = model.predict(diX)
y_of_predictor = diy.argmax(axis=1)

In [None]:
# Distribution of the true class indices, one bin per type.
plt.hist(y_of_predictor, bins = len(types))

In [None]:
# Shuffle and hold out a third of the Pokemon for validation.
idiX_train, idiX_val, idiy_train, idiy_val = train_test_split(
    X_of_predictor, y_of_predictor,
    test_size=0.33, shuffle=True, random_state=1,
)

In [None]:
# Second-stage classifier with default hyperparameters.
predictor = LogisticRegression()

In [None]:
# Fit on the training part of the LSTM-probability features.
predictor.fit(idiX_train, idiy_train)

In [None]:
# NOTE: rebinds `ypred`; from here on it holds class indices, not LSTM probabilities.
ypred = predictor.predict(idiX_val)

In [None]:
# Display the true class indices of the validation split.
idiy_val

In [None]:
# Display the predicted class indices for comparison.
ypred

In [None]:
# Side-by-side histogram of predicted vs. actual class indices on the validation split.
plt.hist([ypred,idiy_val], bins=len(types))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Validation accuracy of the second-stage classifier.
print(accuracy_score(ypred, idiy_val))

worse than random :'(((((

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(idiy_val, ypred)
# Log scale keeps rare confusions visible. log1p is used instead of log because
# empty cells (count 0) then map to 0 rather than -inf with a divide-by-zero warning.
plt.imshow(np.log1p(cm))

# Using GLOVE to make predictions by counting the number of subwords of synonyms of the types inside Pokemon names

I try using the well-known fact that fire-related words are used in the names of Pokémon (for example "Typhlosion" contains a part of "explosion", which is in the set of words close to "fire"). So I look for matches.

In [None]:
# Substring lengths k: every k-character substring (k-gram) of a synonym is
# looked up in the Pokemon names.
char_range = [4,5,6,7]

## Preprocessing

In [None]:
# One feature per (type, k) pair: the summed similarity of every k-gram shared
# between a Pokemon name and the words close to that type.
scores = np.zeros(shape=(df.shape[0], len(types)*len(char_range)))

# Inverted index: k-gram -> row positions of the Pokemon whose cleaned name contains it.
name_s = {}

# Use explicit positions rather than DataFrame index labels, so the rows of
# `scores` stay aligned even if df does not have the default 0..N-1 index.
for index, raw_name in enumerate(df['Name']):
    name = clean(raw_name)

    for k in char_range:
        # +1 so that the k-gram ending on the last character is included; without
        # it a name of exactly k letters would contribute nothing.
        for i in range(len(name)-k+1):
            sub_name = name[i:i+k]
            name_s.setdefault(sub_name, []).append(index)

for j,t in enumerate(types):
    # n-1 nearest GloVe neighbours of the type, plus the type word itself with weight 1.
    ms = glove.most_similar(clean(t), topn=n-1)
    close_words = [(clean(str(x[0])),x[1]) for x in ms]
    close_words.append((clean(t),1))

    for ind,k in enumerate(char_range):
        for w,s in close_words:
            # Same +1 as above: e.g. the 4-letter word "fire" must yield its own 4-gram.
            for i in range(len(w)-k+1):
                sub_word = w[i:i+k]
                for index in name_s.get(sub_word, ()):
                    scores[index,j*len(char_range)+ind] += s
            


    

In [None]:

# Show the score grid (one row per type, one column per k) of a single Pokemon, row 171.
plt.imshow(scores[171,:].reshape((len(types),len(char_range))))

In [None]:
def inv_avg(x):
    """Return the reciprocal of the mean of ``x``, or 0 when that mean is 0."""
    average = x.mean()
    if average == 0:
        return 0
    return 1/average
# One scaling factor per feature column (axis 0 runs over the Pokemon).
inverse_averages = np.apply_along_axis(inv_avg, 0, scores)

In [None]:
# Show the per-feature scaling factors as a (type, k) grid.
plt.imshow(inverse_averages.reshape((len(types),len(char_range))))

## Fitting a simple model

In [None]:
# Rescale every feature column by the inverse of its mean, then hold out a third
# of the Pokemon (features, labels and names are split together).
freqX = scores*inverse_averages
freqy = y_of_predictor
names = df['Name'].tolist()

(freqX_train, freqX_val,
 freqy_train, freqy_val,
 names_train, names_val) = train_test_split(
    freqX, freqy, names,
    test_size=0.33, shuffle=True, random_state=1,
)

In [None]:
# Baseline classifier on the k-gram match features.
freq_reg = LogisticRegression(max_iter = 10000)

In [None]:
# Fit on the training split only.
freq_reg.fit(freqX_train, freqy_train)

In [None]:
# Validation accuracy of the baseline.
print(accuracy_score(freqy_val, freq_reg.predict(freqX_val)))

## Finding the best model and hyperparameters

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Candidate classifiers, compared on the same train/validation split.
freq_models = [
    LogisticRegression(C=1, max_iter=10000),
    LogisticRegression(C=100, max_iter=10000),
    SVC(),
    SVC(kernel='linear'),
    DecisionTreeClassifier(max_depth=6),
]

# Prints one validation accuracy per candidate, in list order.
for m in freq_models:
    m.fit(freqX_train, freqy_train)
    val_predictions = m.predict(freqX_val)
    print(accuracy_score(freqy_val, val_predictions))

In [None]:
# Grid search over p (exponent applied to each similarity) and n (neighbourhood size).

# The k-gram index depends only on the names, so build it once instead of once
# per (p, n) combination. Positions come from enumerate so they always match
# the rows of `scores`.
name_s = {}
for index, raw_name in enumerate(df['Name']):
    name = clean(raw_name)
    for k in char_range:
        # +1 so that the k-gram ending on the last character is included.
        for i in range(len(name)-k+1):
            name_s.setdefault(name[i:i+k], []).append(index)

names = list(df['Name'])

for p in [0.5,1,2,3,4]:
    print('using p =',p)
    for n in [200,250,300,350,400]:
        print('using n =',n)

        scores = np.zeros(shape=(df.shape[0], len(types)*len(char_range)))

        for j,t in enumerate(types):
            ms = glove.most_similar(clean(t), topn=n-1)
            close_words = [(clean(str(x[0])),x[1]) for x in ms]
            close_words.append((clean(t),1))

            for ind,k in enumerate(char_range):
                for w,s in close_words:
                    # Same +1: a synonym of exactly k letters must yield its own k-gram.
                    for i in range(len(w)-k+1):
                        sub_word = w[i:i+k]
                        for index in name_s.get(sub_word, ()):
                            scores[index,j*len(char_range)+ind] += s**p

        inverse_averages = np.apply_along_axis(inv_avg, 0, scores)

        freqX = inverse_averages*scores
        freqy = y_of_predictor

        freqX_train, freqX_val, freqy_train, freqy_val, names_train, names_val =\
            train_test_split(freqX,freqy,names,test_size = 0.33,\
                             shuffle=True, random_state=1)

        # penalty=None is the supported spelling for an unregularised model; the
        # string 'none' was deprecated and then removed in recent scikit-learn releases.
        freq_model = LogisticRegression(penalty=None, max_iter = 10000)

        freq_model.fit(freqX_train, freqy_train)
        print(accuracy_score(freqy_val, freq_model.predict(freqX_val)))

Fitting the best model

In [None]:
# Refit with the configuration kept from the search: n = 300 neighbours and
# similarities squared (p = 2).
n = 300

scores = np.zeros(shape=(df.shape[0], len(types)*len(char_range)))

# Inverted index: k-gram -> row positions of the Pokemon whose cleaned name contains it.
name_s = {}

# Positions come from enumerate so they always match the rows of `scores`.
for index, raw_name in enumerate(df['Name']):
    name = clean(raw_name)

    for k in char_range:
        # +1 so that the k-gram ending on the last character is included.
        for i in range(len(name)-k+1):
            sub_name = name[i:i+k]
            name_s.setdefault(sub_name, []).append(index)

for j,t in enumerate(types):
    ms = glove.most_similar(clean(t), topn=n-1)
    close_words = [(clean(str(x[0])),x[1]) for x in ms]
    close_words.append((clean(t),1))

    for ind,k in enumerate(char_range):
        for w,s in close_words:
            # Same +1: a synonym of exactly k letters must yield its own k-gram.
            for i in range(len(w)-k+1):
                sub_word = w[i:i+k]
                for index in name_s.get(sub_word, ()):
                    scores[index,j*len(char_range)+ind] += s**2

inverse_averages = np.apply_along_axis(inv_avg, 0, scores)

freqX = inverse_averages*scores
freqy = y_of_predictor
names = list(df['Name'])

freqX_train, freqX_val, freqy_train, freqy_val, names_train, names_val =\
    train_test_split(freqX,freqy,names,test_size = 0.33,\
                     shuffle=True, random_state=1)

# penalty=None is the supported spelling for an unregularised model; the string
# 'none' was deprecated and then removed in recent scikit-learn releases.
freq_model = LogisticRegression(penalty=None, max_iter = 10000)

freq_model.fit(freqX_train, freqy_train)
print('Accuracy : ',accuracy_score(freqy_val, freq_model.predict(freqX_val)))

So our best accuracy is achieved using this method, and is around 22%.

## Error Analysis

In [None]:
# Training accuracy of the final model, to compare against the validation accuracy.
print('Accuracy over the training set :', accuracy_score(freqy_train, freq_model.predict(freqX_train)))

There is quite a lot of overfitting, but all my attempts at reducing it only resulted in less accuracy on the validation set.

In [None]:
# Confusion matrix of the final model on the validation split
# (rows: actual class index, columns: predicted class index).
preds = freq_model.predict(freqX_val)
cm = confusion_matrix(freqy_val, preds)
plt.imshow(cm)

In [None]:
# Predicted vs. actual class counts; the printed list maps class index -> type name.
plt.hist([preds, freqy_val], bins=len(types))
print(types)

It seems to love the type "Normal".

# Final prediction on the validation set

The prediction made by the algorithm :

In [None]:
# Build the comparison table in a single construction. DataFrame.append was
# removed in pandas 2.0, and appending row by row copied the frame each time.
# names_val, preds and freqy_val come from the same split, so they line up by position.
pred_df = pd.DataFrame(
    {
        'Name': list(names_val),
        'Predicted type': [types[predicted] for predicted in preds],
        'Actual type': [types[actual] for actual in freqy_val],
    },
    columns=['Name', 'Predicted type', 'Actual type'],
)

# Lift the row-display limit so the whole validation set is printed.
pd.set_option('display.max_rows', None)
print(pred_df)