In [54]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

# Load Datasetm

In [29]:
data_path = "../data/jutsu.jsonl"
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Asura Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about information on the abili...
1,10 Hit Combo,Taijutsu,Lars strikes the foe 5 times before striking t...
2,Assassination Technique,"Kenjutsu, Fighting Style",A sword technique used by Root members. Using ...
3,Art (jutsu),"Kekkei Genkai, Hiden, Ninjutsu",This article is about the jutsu from Naruto Sh...
4,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,..."


In [30]:
def simplify_jutsu(jutsu):
    if "Genjutsu" in jutsu:
        return "Genjutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Taijutsu" in jutsu:
        return "Taijutsu"
    

In [31]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [32]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Asura Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about information on the abili...,Ninjutsu
1,10 Hit Combo,Taijutsu,Lars strikes the foe 5 times before striking t...,Taijutsu
2,Assassination Technique,"Kenjutsu, Fighting Style",A sword technique used by Root members. Using ...,
3,Art (jutsu),"Kekkei Genkai, Hiden, Ninjutsu",This article is about the jutsu from Naruto Sh...,Ninjutsu
4,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,...",Ninjutsu


In [33]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2272
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [34]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']]
df = df.dropna()

In [35]:
df.head()

Unnamed: 0,text,jutsus
0,Asura Path. This article is about information ...,Ninjutsu
1,10 Hit Combo. Lars strikes the foe 5 times bef...,Taijutsu
3,Art (jutsu). This article is about the jutsu f...,Ninjutsu
4,Asura Attack. With the body modifications of t...,Ninjutsu
5,Ascension and Fall. Obito slashes the opponent...,Ninjutsu


In [36]:
from bs4 import BeautifulSoup

class Cleaner():
    def __init__(self):
        pass
    
    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")

    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

  return text.replace("<\p>", "<\p>\n")
  return text.replace("<\p>", "<\p>\n")


In [37]:
text_column_name = 'text'
label_column_name = 'jutsus'

In [38]:
# Clean the text
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

In [39]:
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,Asura Path. This article is about information ...,Ninjutsu,Asura Path. This article is about information ...
1,10 Hit Combo. Lars strikes the foe 5 times bef...,Taijutsu,10 Hit Combo. Lars strikes the foe 5 times bef...


In [40]:
# Encode the labels
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())


In [42]:
label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [43]:
df['label'] = le.transform(df[label_column_name].tolist())

In [44]:
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,Asura Path. This article is about information ...,Ninjutsu,Asura Path. This article is about information ...,1
1,10 Hit Combo. Lars strikes the foe 5 times bef...,Taijutsu,10 Hit Combo. Lars strikes the foe 5 times bef...,2
3,Art (jutsu). This article is about the jutsu f...,Ninjutsu,Art (jutsu). This article is about the jutsu f...,1
4,Asura Attack. With the body modifications of t...,Ninjutsu,Asura Attack. With the body modifications of t...,1
5,Ascension and Fall. Obito slashes the opponent...,Ninjutsu,Ascension and Fall. Obito slashes the opponent...,1


In [46]:
test_size = 0.2
df_train, df_test = train_test_split(df, test_size=test_size, stratify=df['label'],)

In [47]:
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1817
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [48]:
model_name = "distilbert/distilbert-base-uncased"

In [59]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [60]:
def preprocess_function(tokenizer, examples):
    return tokenizer(examples['text_cleaned'], truncation=True)

In [61]:
# Convert Pandas to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Tokenize the dataset
tokenize_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),batched=True)
tokenize_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),batched=True)


Map: 100%|██████████| 2216/2216 [00:00<00:00, 5048.22 examples/s]
Map: 100%|██████████| 555/555 [00:00<00:00, 6488.62 examples/s]
