In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder of the training data. It is listed with os.listdir and is expected
# to contain one sub-folder per Pokemon name. The trailing slash matters wherever
# a sub-folder name is appended to it by plain string concatenation.
NORMALIZED_DATA = '../../Training-baseline/'

In [3]:
# Load the Pokemon metadata table and keep only the rows whose name matches an
# entry found in the training-data folder.
metadata = pd.read_csv('metadata/pokemon.csv')

pokemon_names = os.listdir(NORMALIZED_DATA)

# Select rows and columns in a single .loc and take an explicit copy, so the
# column assignments below modify an independent frame rather than something
# derived from `metadata` (the pattern that triggers SettingWithCopyWarning).
filtered_metadata = metadata.loc[
    metadata['name'].isin(pokemon_names), ['name', 'type1']
].copy()

filtered_list = filtered_metadata['name'].tolist()

# Encode the primary type as an integer category code. With pd.Categorical the
# code is the position of the value among the (sorted) categories present in
# the filtered rows; a missing type1 is encoded as -1.
filtered_metadata['type1'] = pd.Categorical(filtered_metadata['type1'])
filtered_metadata['code'] = filtered_metadata['type1'].cat.codes

In [4]:
def compile_training_data_to_list():
    """Collect every training file and return it annotated with its type label.

    Walks NORMALIZED_DATA, which is expected to hold one sub-folder per Pokemon,
    and builds a relative path of the form '<pokemon>/<file>' for each file.

    Returns:
        Whatever create_annotated_dataframe returns for those paths, i.e. a
        (DataFrame, one-hot label array) tuple -- despite the "_to_list" in
        this function's name.
    """
    all_data = []
    # Sorted so the row order does not depend on the file system's listing order.
    for pokemon in sorted(os.listdir(NORMALIZED_DATA)):
        pokemon_dir = os.path.join(NORMALIZED_DATA, pokemon)
        # Stray files next to the per-Pokemon folders (e.g. .DS_Store) would
        # make the inner os.listdir raise NotADirectoryError; skip them.
        if not os.path.isdir(pokemon_dir):
            continue
        # Keep '/' as the separator: create_annotated_dataframe splits on it.
        all_data += [pokemon + '/' + x for x in sorted(os.listdir(pokemon_dir))]

    return create_annotated_dataframe(all_data)

def create_annotated_dataframe(all_data):
    """Attach the type code from the metadata to every training file.

    Args:
        all_data: iterable of relative paths of the form '<pokemon>/<file>'.

    Returns:
        The (DataFrame, one-hot array) tuple produced by
        create_encoded_dataframe for the files whose Pokemon name has a row in
        filtered_metadata. Files of Pokemon without a metadata row are left out.
    """
    # Build the name -> code lookup once instead of scanning the metadata frame
    # for every file. The match is exact: a substring/regex test (str.contains)
    # would accept a folder name that is merely part of another Pokemon's name
    # and would misread names containing regex metacharacters.
    # setdefault keeps the first code when a name occurs more than once.
    code_by_name = {}
    for name, code in zip(filtered_metadata['name'].tolist(),
                          filtered_metadata['code'].tolist()):
        code_by_name.setdefault(name, code)

    base_data = {'file_name': [], 'name': [], 'label': []}
    for item in all_data:
        # The part before the first '/' is the Pokemon (folder) name.
        pokemon = item.split('/')[0]
        if pokemon in code_by_name:
            base_data['file_name'].append(item)
            base_data['name'].append(pokemon)
            base_data['label'].append(code_by_name[pokemon])

    return create_encoded_dataframe(base_data)

def create_encoded_dataframe(base_data):
    """Turn the collected records into a DataFrame plus one-hot labels.

    Args:
        base_data: dict with the keys 'file_name', 'name' and 'label', each
            mapping to a list of equal length; 'label' holds integer codes.

    Returns:
        A tuple (frame, one_hot): `frame` has the columns file_name, name and
        label; `one_hot` is a float array with one row per record and
        (largest label + 1) columns, with a 1.0 at the record's label.
    """
    frame = pd.DataFrame(base_data, columns=['file_name', 'name', 'label'])
    label_values = frame["label"].to_numpy()
    # The width comes from the largest label actually present in the data.
    width = label_values.max() + 1
    # Row k of the identity matrix is the one-hot vector for label k.
    one_hot = np.eye(width)[label_values]
    return frame, one_hot

In [5]:
# Build the annotated file table and the matching one-hot label matrix.
encoded_data, y_train = compile_training_data_to_list()

In [6]:
# Persist the file table together with its one-hot labels in a single CSV.
row_count, class_count = y_train.shape
# One "y_train<k>" column per one-hot position.
one_hot_columns = {"y_train" + str(k): y_train[:, k] for k in range(class_count)}
# The one-hot width is repeated on every row so it can be recovered on load.
df = encoded_data.assign(
    **one_hot_columns,
    y_train_columns=np.full(row_count, class_count),
)
df.to_csv("./metadata/training-list.csv")

In [8]:
# Reload the table and split it back into the file table and the label matrix.
csv_data = pd.read_csv("./metadata/training-list.csv", index_col=0)

# The one-hot width is read from the row labelled 0 of "y_train_columns".
stored_class_count = csv_data["y_train_columns"][0]
one_hot_names = ["y_train" + str(k) for k in range(stored_class_count)]
y_train = np.ascontiguousarray(csv_data[one_hot_names].to_numpy(dtype=np.float64))

# The last (width + 1) columns are the one-hot columns followed by the width
# column; dropping them leaves the annotated file table.
encoded_data = csv_data.drop(columns=csv_data.columns[-(stored_class_count + 1):])


In [9]:
# Inspect the frame rebuilt from the CSV: file path, Pokemon name and integer label.
print(encoded_data)

                            file_name       name  label
0      Abomasnow/dcedzyqfojskcahp.npy  Abomasnow      9
1      Abomasnow/gqfpsmqasdqiknur.npy  Abomasnow      9
2      Abomasnow/imzcvkkckbdchpro.npy  Abomasnow      9
3      Abomasnow/kzibfmivzksykiwy.npy  Abomasnow      9
4      Abomasnow/mjtasvyoonxyilqt.npy  Abomasnow      9
...                               ...        ...    ...
12074   Zweilous/nrpzbrzmxehydoqj.npy   Zweilous      1
12075   Zweilous/qjoppeepmpyujyao.npy   Zweilous      1
12076   Zweilous/sihxufnlbmephyeq.npy   Zweilous      1
12077   Zweilous/vshewhewmkutsdlp.npy   Zweilous      1
12078   Zweilous/ytpdigaymlnyrpbd.npy   Zweilous      1

[12079 rows x 3 columns]


In [14]:
# Inspect the full table as read from the CSV, including the y_train* one-hot columns.
print(csv_data)

                            file_name       name  label  y_train0  y_train1  \
0      Abomasnow/dcedzyqfojskcahp.npy  Abomasnow      9       0.0       0.0   
1      Abomasnow/gqfpsmqasdqiknur.npy  Abomasnow      9       0.0       0.0   
2      Abomasnow/imzcvkkckbdchpro.npy  Abomasnow      9       0.0       0.0   
3      Abomasnow/kzibfmivzksykiwy.npy  Abomasnow      9       0.0       0.0   
4      Abomasnow/mjtasvyoonxyilqt.npy  Abomasnow      9       0.0       0.0   
...                               ...        ...    ...       ...       ...   
12074   Zweilous/nrpzbrzmxehydoqj.npy   Zweilous      1       0.0       1.0   
12075   Zweilous/qjoppeepmpyujyao.npy   Zweilous      1       0.0       1.0   
12076   Zweilous/sihxufnlbmephyeq.npy   Zweilous      1       0.0       1.0   
12077   Zweilous/vshewhewmkutsdlp.npy   Zweilous      1       0.0       1.0   
12078   Zweilous/ytpdigaymlnyrpbd.npy   Zweilous      1       0.0       1.0   

       y_train2  y_train3  y_train4  y_train5  y_tr