In [1]:
import pandas as pd

# Path to the jutsu records, relative to the notebook's working directory.
data_path = 'data/jutsus.jsonl'
# lines=True makes pandas read the file as one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Chakra Scalpel: Sever,"Ninjutsu, Medical Ninjutsu",Kabuto creates two larger than usual chakra sc...
1,Chakra Shockwave Slash,"Kenjutsu, Chakra Flow",The user rapidly slashes the opponent with a s...
2,Chakra Shock Slash,"Kenjutsu, Chakra Flow","The user swings their blade, releasing a blade..."
3,Chakra Scalpel: Destruction,"Ninjutsu, Medical Ninjutsu",Kabuto moves towards his opponent with tremend...
4,Chakra Scalpel: Cruelty,"Ninjutsu, Medical Ninjutsu",Kabuto attacks his opponent with his chakra sc...


In [33]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [2]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value into one coarse class.

    The candidate classes are checked in a fixed priority order
    (Genjutsu, then Ninjutsu, then Taijutsu); the first one contained in
    ``jutsu`` wins, so "Genjutsu, Ninjutsu" maps to "Genjutsu".

    Args:
        jutsu: Raw jutsu type, e.g. "Ninjutsu, Medical Ninjutsu".

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu"; ``None`` when none of them is
        present or when ``jutsu`` is a missing value (None / NaN).
    """
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        try:
            found = category in jutsu
        except TypeError:
            # Missing values (None, float NaN) do not support `in`;
            # treat them as "no class" instead of crashing the apply.
            return None
        if found:
            return category
    # Explicit: anything that matches no class is left unlabeled.
    return None

In [3]:
# Derive one coarse class per row from the raw jutsu_type column;
# rows for which simplify_jutsu returns nothing are left empty here.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)
# Preview the new column next to the raw type.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Chakra Scalpel: Sever,"Ninjutsu, Medical Ninjutsu",Kabuto creates two larger than usual chakra sc...,Ninjutsu
1,Chakra Shockwave Slash,"Kenjutsu, Chakra Flow",The user rapidly slashes the opponent with a s...,
2,Chakra Shock Slash,"Kenjutsu, Chakra Flow","The user swings their blade, releasing a blade...",
3,Chakra Scalpel: Destruction,"Ninjutsu, Medical Ninjutsu",Kabuto moves towards his opponent with tremend...,Ninjutsu
4,Chakra Scalpel: Cruelty,"Ninjutsu, Medical Ninjutsu",Kabuto attacks his opponent with his chakra sc...,Ninjutsu


In [4]:
# Class balance of the simplified labels (rows without a class are not counted).
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    4128
Taijutsu     726
Genjutsu     188
Name: count, dtype: int64

In [5]:
# Build the model input ("<name>.<description>") and the target column,
# keep only those two columns, and drop rows where either one is missing.
df = (
    df.assign(
        text=df['jutsu_name'] + '.' + df['jutsu_description'],
        jutsu=df['jutsu_type_simplified'],
    )[['text', 'jutsu']]
    .dropna()
)

In [6]:
# Preview the reduced frame (text + jutsu) after dropping incomplete rows.
df.head()

Unnamed: 0,text,jutsu
0,Chakra Scalpel: Sever.Kabuto creates two large...,Ninjutsu
3,Chakra Scalpel: Destruction.Kabuto moves towar...,Ninjutsu
4,Chakra Scalpel: Cruelty.Kabuto attacks his opp...,Ninjutsu
5,Chakra Scalpel Snake Crush.Kabuto burrows unde...,Ninjutsu
6,Cat God Possession: Monster Cat Beckoning Tech...,Ninjutsu


In [7]:
from bs4 import BeautifulSoup
class Cleaner():
    """Turn HTML-bearing text into stripped plain text.

    ``clean`` runs the steps in order: add a newline after each closing
    paragraph tag, strip the HTML markup, then trim surrounding whitespace.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every closing paragraph tag."""
        # The closing tag uses a forward slash. The previous pattern used a
        # backslash, which never matches real markup (and is an invalid
        # escape sequence), so paragraphs were glued together after stripping.
        return text.replace('</p>', '</p>\n')

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup's lxml parser and return its text content."""
        return BeautifulSoup(text, 'lxml').text

    def clean(self, text):
        """Return ``text`` with paragraph breaks kept, markup removed and ends stripped."""
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [9]:
# Names of the input-text and target columns used by the cells below.
text_column_name = 'text'
label_column_name = 'jutsu'

In [10]:
# Run every raw text through Cleaner.clean and store the result in a new column.
cleaner = Cleaner()
df['text_cleaned'] = [cleaner.clean(raw_text) for raw_text in df[text_column_name]]

  return BeautifulSoup(text, 'lxml').text


In [11]:
# Compare the raw and cleaned text side by side.
df.head()

Unnamed: 0,text,jutsu,text_cleaned
0,Chakra Scalpel: Sever.Kabuto creates two large...,Ninjutsu,Chakra Scalpel: Sever.Kabuto creates two large...
3,Chakra Scalpel: Destruction.Kabuto moves towar...,Ninjutsu,Chakra Scalpel: Destruction.Kabuto moves towar...
4,Chakra Scalpel: Cruelty.Kabuto attacks his opp...,Ninjutsu,Chakra Scalpel: Cruelty.Kabuto attacks his opp...
5,Chakra Scalpel Snake Crush.Kabuto burrows unde...,Ninjutsu,Chakra Scalpel Snake Crush.Kabuto burrows unde...
6,Cat God Possession: Monster Cat Beckoning Tech...,Ninjutsu,Cat God Possession: Monster Cat Beckoning Tech...


In [13]:
# Encode Labels
# Fit on the column named by `label_column_name` (not a hard-coded 'jutsu')
# so fitting and the transform in the later cell always read the same column.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [14]:
# Quick look at the frame; fitting the encoder does not add or change columns.
df.head(2)

Unnamed: 0,text,jutsu,text_cleaned
0,Chakra Scalpel: Sever.Kabuto creates two large...,Ninjutsu,Chakra Scalpel: Sever.Kabuto creates two large...
3,Chakra Scalpel: Destruction.Kabuto moves towar...,Ninjutsu,Chakra Scalpel: Destruction.Kabuto moves towar...


In [16]:
# Map each encoded integer id back to its class name. Uses the encoder's
# public `classes_` attribute rather than reaching into `__dict__`.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [17]:
# Encode the string classes in `label_column_name` as integer ids with the fitted encoder.
df['label'] = le.transform(df[label_column_name].tolist())

In [18]:
# Confirm the integer `label` column sits alongside the string class.
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,Chakra Scalpel: Sever.Kabuto creates two large...,Ninjutsu,Chakra Scalpel: Sever.Kabuto creates two large...,1
3,Chakra Scalpel: Destruction.Kabuto moves towar...,Ninjutsu,Chakra Scalpel: Destruction.Kabuto moves towar...,1
4,Chakra Scalpel: Cruelty.Kabuto attacks his opp...,Ninjutsu,Chakra Scalpel: Cruelty.Kabuto attacks his opp...,1
5,Chakra Scalpel Snake Crush.Kabuto burrows unde...,Ninjutsu,Chakra Scalpel Snake Crush.Kabuto burrows unde...,1
6,Cat God Possession: Monster Cat Beckoning Tech...,Ninjutsu,Cat God Possession: Monster Cat Beckoning Tech...,1


In [21]:
# Hold out 20% of the rows for evaluation, stratified on the encoded label
# so each class keeps the same proportion in both splits.
test_size = 0.2
# Fixed seed: without it the split changes on every kernel restart and
# downstream results cannot be reproduced.
split_seed = 42
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],
    random_state=split_seed,
)


In [22]:
# Per-class row counts in the training split.
df_train['jutsu'].value_counts()

jutsu
Ninjutsu    3302
Taijutsu     581
Genjutsu     150
Name: count, dtype: int64

In [23]:
# Per-class row counts in the held-out test split.
df_test['jutsu'].value_counts()

jutsu
Ninjutsu    826
Taijutsu    145
Genjutsu     38
Name: count, dtype: int64

In [26]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained below.
model_name = 'distilbert/distilbert-base-uncased'

In [27]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [28]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` entries of ``examples``.

    Args:
        tokenizer: Callable accepting the texts and a ``truncation`` keyword.
        examples: Mapping with a ``'text_cleaned'`` key holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [34]:
# Convert the pandas train/test splits into Hugging Face `Dataset` objects.
# NOTE(review): df_train/df_test still carry the original, non-contiguous row
# index; confirm whether it should be dropped (e.g. preserve_index=False).
trained_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [35]:
# Tokenize both splits in batches with the shared tokenizer.
def _tokenize_batch(examples):
    """Bind the notebook-level tokenizer to preprocess_function for Dataset.map."""
    return preprocess_function(tokenizer, examples)

tokenized_train = trained_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)


Map: 100%|██████████| 4033/4033 [00:00<00:00, 5985.61 examples/s]
Map: 100%|██████████| 1009/1009 [00:00<00:00, 6298.87 examples/s]
