In [53]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

# Load Dataset

In [30]:
# Location of the jutsu records, relative to this notebook.
data_path = '../data/jutsus.jsonlines'
# lines=True makes pandas parse the file as JSON Lines: one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Barrier Method Formation,"Ninjutsu, Barrier Ninjutsu","This is a type of trap ninjutsu (トラップ忍術, Torap..."
2,Banshō Kokuin,"Kekkei Genkai, Ninjutsu, Dōjutsu",Madara creates a black sphere that attracts al...
3,Barrage of Gentle Fists,"Kekkei Genkai, Hiden, Taijutsu",The user delivers a series of quick blows of t...
4,Banshō Ten'in,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the technique. For chapt...


In [31]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value into one of three coarse classes.

    The raw value may list several types (e.g. ``"Kekkei Genkai, Ninjutsu,
    Dōjutsu"``). Classes are checked in a fixed priority order, so a value
    mentioning more than one of them resolves to the first match:
    Genjutsu, then Ninjutsu, then Taijutsu.

    Args:
        jutsu: The raw type value, normally a string. Matching uses ``in``,
            so for strings it is a substring test (``"Barrier Ninjutsu"``
            matches ``"Ninjutsu"``).

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when the
        value is missing or mentions none of the three.
    """
    # Missing values show up as None or float NaN; neither supports `in`,
    # so treat them as "no class" instead of raising TypeError.
    if jutsu is None or isinstance(jutsu, float):
        return None

    # Order matters: earlier entries win when several classes are present.
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if category in jutsu:
            return category

    return None
    

In [32]:
# Reduce every raw jutsu_type value to a single class label via simplify_jutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [33]:
# Check that the new jutsu_type_simplified column looks right on the first rows.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Barrier Method Formation,"Ninjutsu, Barrier Ninjutsu","This is a type of trap ninjutsu (トラップ忍術, Torap...",Ninjutsu
2,Banshō Kokuin,"Kekkei Genkai, Ninjutsu, Dōjutsu",Madara creates a black sphere that attracts al...,Ninjutsu
3,Barrage of Gentle Fists,"Kekkei Genkai, Hiden, Taijutsu",The user delivers a series of quick blows of t...,Taijutsu
4,Banshō Ten'in,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the technique. For chapt...,Ninjutsu


In [34]:
# Class distribution of the simplified labels (value_counts skips missing values by default).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2271
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [35]:
# Model input is the jutsu name followed by its description; the target is the
# simplified type. Keep only those two columns and drop rows missing either one.
df = pd.DataFrame(
    {
        'text': df['jutsu_name'] + ' ' + df['jutsu_description'],
        'jutsu': df['jutsu_type_simplified'],
    }
).dropna()

In [36]:
# Preview the reduced frame (text + jutsu columns only).
df.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu
1,Barrier Method Formation This is a type of tra...,Ninjutsu
2,Banshō Kokuin Madara creates a black sphere th...,Ninjutsu
3,Barrage of Gentle Fists The user delivers a se...,Taijutsu
4,Banshō Ten'in This article is about the techni...,Ninjutsu


In [37]:
from bs4 import BeautifulSoup

class Cleaner():
    """Turn HTML-flavoured text into plain text.

    Paragraph boundaries are preserved as newlines before the markup is
    stripped, so separate paragraphs do not run together in the result.
    """

    def __init__(self):
        """The cleaner keeps no state; nothing to initialise."""
        pass

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every ``</p>`` tag.

        The closing tag is written with a forward slash; a backslash form
        (``<\\p>``) is an invalid escape sequence in a normal string literal
        and never matches real HTML.
        """
        return text.replace('</p>', '</p>\n')

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup (lxml parser) and return only its text content."""
        clean_text = BeautifulSoup(text, 'lxml').text
        return clean_text

    def clean(self, text):
        """Run the full pipeline: add paragraph breaks, strip tags, trim surrounding whitespace."""
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [38]:
# Names of the input-text and target-label columns in df, used by the cells below.
text_column_name = 'text'
label_column_name = 'jutsu'

In [39]:
# Clean Text
# Run every row's text through Cleaner.clean and store the result in a new
# text_cleaned column, leaving the original text column untouched.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

In [40]:
# Compare raw and cleaned text on the first two rows.
df.head(2)

Unnamed: 0,text,jutsu,text_cleaned
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu,10 Hit Combo Lars punches the opponent before ...
1,Barrier Method Formation This is a type of tra...,Ninjutsu,Barrier Method Formation This is a type of tra...


In [41]:
# Encode labels
# Fit a LabelEncoder on the string class names so they can be mapped to integer ids.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [42]:
# Integer id -> class name lookup. The fitted encoder exposes its class names
# through the public `classes_` attribute, in the same order it uses for encoding.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [43]:
# Add the integer-encoded class id for each row as the `label` column.
df['label'] = le.transform(df[label_column_name].tolist())

In [44]:
# Confirm the integer label column lines up with the jutsu names.
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu,10 Hit Combo Lars punches the opponent before ...,2
1,Barrier Method Formation This is a type of tra...,Ninjutsu,Barrier Method Formation This is a type of tra...,1
2,Banshō Kokuin Madara creates a black sphere th...,Ninjutsu,Banshō Kokuin Madara creates a black sphere th...,1
3,Barrage of Gentle Fists The user delivers a se...,Taijutsu,Barrage of Gentle Fists The user delivers a se...,2
4,Banshō Ten'in This article is about the techni...,Ninjutsu,Banshō Ten'in This article is about the techni...,1


In [46]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, and a fixed random_state makes the
# split identical on every run.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

In [48]:
# Class distribution inside the training split.
df_train['jutsu'].value_counts()

jutsu
Ninjutsu    1817
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [49]:
# Model identifier passed to AutoTokenizer.from_pretrained below.
model_name = 'distilbert/distilbert-base-uncased'


In [51]:
# Load the tokenizer that corresponds to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [52]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` entry of ``examples`` with truncation enabled.

    Args:
        tokenizer: Callable that accepts the texts plus a ``truncation`` keyword.
        examples: Mapping that holds the texts under the ``'text_cleaned'`` key.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [55]:
# Wrap each pandas split in a Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


# Bind the tokenizer once so both splits go through the same batched step
def _tokenize_batch(examples):
    return preprocess_function(tokenizer, examples)


# tokenize both splits
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)


Map: 100%|██████████| 2216/2216 [00:00<00:00, 8857.54 examples/s]
Map: 100%|██████████| 554/554 [00:00<00:00, 9372.37 examples/s]
