In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


# Load Dataset

In [2]:
# Relative path to the JSON Lines file that holds the jutsu records.
data_path = "../data/jutsus.jsonl"
# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Arm Growth Technique,Ninjutsu,"From the shoulders, the user is able to create..."
2,Armour-Eater,Ninjutsu,"The user melts pieces of their armour, creatin..."
3,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc..."
4,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...


In [3]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu type value into one of three base categories.

    The categories are checked in a fixed order (Genjutsu, Ninjutsu,
    Taijutsu) and the first one contained in ``jutsu`` wins, so a value that
    mentions several categories maps to the earliest one in that order.

    Args:
        jutsu: Raw ``jutsu_type`` value, normally a string.

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu"; ``None`` when none of the three
        is present or when the value does not support ``in`` at all (for
        example a missing value such as ``None`` or NaN).
    """
    try:
        for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
            if category in jutsu:
                return category
    except TypeError:
        # Missing values (None / float NaN) are not containers; treat them as
        # "no category" instead of aborting the whole DataFrame.apply call.
        return None
    return None

In [4]:
# Map every raw jutsu_type value to the category returned by simplify_jutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [5]:
# Preview the frame to confirm the new jutsu_type_simplified column.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Arm Growth Technique,Ninjutsu,"From the shoulders, the user is able to create...",Ninjutsu
2,Armour-Eater,Ninjutsu,"The user melts pieces of their armour, creatin...",Ninjutsu
3,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc...",Ninjutsu
4,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...,Ninjutsu


In [6]:
# Count rows per simplified category to see how balanced the classes are.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2265
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [7]:
# Build the two modelling columns in a single chain: `text` joins the jutsu
# name and its description, `jutsus` carries the simplified category. Only
# those two columns are kept, and rows with a missing value in either one
# are dropped.
df = (
    df.assign(
        text=df['jutsu_name'] + ". " + df['jutsu_description'],
        jutsus=df['jutsu_type_simplified'],
    )
    .loc[:, ['text', 'jutsus']]
    .dropna()
)

In [8]:
# Preview the reduced frame, which now holds only the text and jutsus columns.
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,"Arm Growth Technique. From the shoulders, the ...",Ninjutsu
2,Armour-Eater. The user melts pieces of their a...,Ninjutsu
3,Armageddon Countdown Clock. After performing t...,Ninjutsu
4,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu


In [9]:
from bs4 import BeautifulSoup
class Cleaner():
    """Turn HTML-flavoured text into plain text.

    ``clean`` is the entry point: it inserts a newline after each closing
    paragraph tag, strips all markup, and trims surrounding whitespace.
    """

    def __init__(self):
        # The cleaner keeps no configuration or state.
        pass

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every ``</p>`` tag.

        This keeps paragraphs on separate lines once the tags are removed.
        """
        # The closing paragraph tag uses a forward slash. A backslash form
        # ("<\\p>") never occurs in HTML and is an invalid escape sequence in
        # a Python string literal.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup (lxml parser) and return only its text content."""
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        """Run the full pipeline on ``text``: line breaks, tag removal, then ``strip()``.

        Args:
            text: Raw string, possibly containing HTML markup.

        Returns:
            The text content with leading and trailing whitespace removed.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [10]:
# Name of the column that holds the raw input text.
text_column_name = 'text'
# Name of the column that holds the string class label.
label_column_name = "jutsus"

In [11]:
# Clean Text
# Run Cleaner.clean over every row of the raw text column and store the result
# in a new text_cleaned column; the raw column itself is left unchanged.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [12]:
# Preview two rows to compare the raw text with the cleaned text.
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
1,"Arm Growth Technique. From the shoulders, the ...",Ninjutsu,"Arm Growth Technique. From the shoulders, the ..."


In [13]:
# Encode Labels
# Fit a LabelEncoder on the string labels so that each distinct class gets an
# integer id.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [14]:
# Lookup from encoded integer id to class name, following the order of the
# fitted encoder's classes_ array.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [15]:
# Store the integer id of each row's class label in a new `label` column.
df['label'] = le.transform(df[label_column_name].tolist())

In [16]:
# Preview the frame to confirm the new integer label column.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,"Arm Growth Technique. From the shoulders, the ...",Ninjutsu,"Arm Growth Technique. From the shoulders, the ...",1
2,Armour-Eater. The user melts pieces of their a...,Ninjutsu,Armour-Eater. The user melts pieces of their a...,1
3,Armageddon Countdown Clock. After performing t...,Ninjutsu,Armageddon Countdown Clock. After performing t...,1
4,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu,Arm of Shukaku. Gaara engulfs himself in a san...,1


In [17]:
# Hold out 20% of the rows as a test set. Stratifying on the encoded label
# keeps the class proportions the same in both splits.
test_size = 0.2
# Fixed seed so the split is identical on every Restart & Run All.
random_seed = 42
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],
    random_state=random_seed,
)

In [18]:
# Check the per-class row counts in the training split.
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1812
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [19]:
# Checkpoint identifier that is passed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [20]:
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` entry of ``examples``.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the texts and
            ``truncation=True``.
        examples: Mapping that contains a ``'text_cleaned'`` entry.

    Returns:
        Whatever the tokenizer call returns.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [22]:
# Convert the pandas splits into Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


# Tokenize the datasets: both splits go through the same batched mapper, which
# binds the shared tokenizer to preprocess_function.
def tokenize_batch(examples):
    """Tokenize one batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


tokenized_train = train_dataset.map(tokenize_batch, batched=True)
tokenized_test = test_dataset.map(tokenize_batch, batched=True)

Map: 100%|██████████| 2211/2211 [00:00<00:00, 11925.41 examples/s]
Map: 100%|██████████| 553/553 [00:00<00:00, 8906.75 examples/s]
