In [48]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

# Load Dataset

In [20]:
# Path to the jutsu dataset, relative to the notebook's working directory.
data_path = "../data/jutsus.jsonl"
# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows of the loaded frame.
df.head()

In [21]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value into one of three broad categories.

    The categories are checked in a fixed priority order (Genjutsu, then
    Ninjutsu, then Taijutsu), so a value that mentions several of them maps
    to the first one in that order.

    Args:
        jutsu: Raw jutsu type. Anything supporting the ``in`` operator is
            accepted (for example a string such as
            ``"Ninjutsu, Kekkei Genkai"`` or a list of type names).
            ``None`` and float values (such as the NaN pandas uses for
            missing entries) are treated as missing.

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when the
        value is missing or matches none of the three.
    """
    # A missing value does not support `in` and would raise TypeError below.
    if jutsu is None or isinstance(jutsu, float):
        return None

    # Order matters: the first matching category wins.
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if category in jutsu:
            return category

    # No known category found; make the "unmatched" result explicit.
    return None

In [22]:
# Add a simplified label column by passing each raw `jutsu_type` value through simplify_jutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [23]:
# Preview the frame; `jutsu_type_simplified` should now appear as a column.
df.head()

In [24]:
# Count rows per simplified jutsu type (value_counts leaves out missing values by default).
df['jutsu_type_simplified'].value_counts()

In [25]:
# Build the modelling frame: `text` is the jutsu name and its description
# joined by a single space, `jutsu` is the simplified type. Only those two
# columns are kept, and rows with a missing value in either one are discarded.
df = (
    df.assign(
        text=df['jutsu_name'] + " " + df['jutsu_description'],
        jutsu=df['jutsu_type_simplified'],
    )[['text', 'jutsu']]
    .dropna()
)

In [26]:
# Preview the reduced frame (only the `text` and `jutsu` columns are kept by the cell above).
df.head()

In [30]:
class Cleaner():
    """Turns HTML-flavoured text into plain text.

    The steps are exposed individually; `clean` runs them in order.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Insert a newline after every paragraph closing tag.

        Without this, stripping the tags afterwards would glue the last word
        of one paragraph to the first word of the next.

        Args:
            text: Raw text, possibly containing HTML markup.

        Returns:
            The text with ``"\\n"`` appended after each closing tag.
        """
        # "</p>" is the real HTML closing tag. The backslash spelling "<\p>"
        # is what this method originally looked for; it is still handled so
        # any input that relied on it keeps producing the same result.
        for closing_tag in ("</p>", "<\\p>"):
            text = text.replace(closing_tag, closing_tag + "\n")
        return text

    def remove_html_tags(self, text):
        """Parse `text` with BeautifulSoup (lxml parser) and return its text content.

        Args:
            text: Text that may contain HTML markup.

        Returns:
            The `.text` of the parsed document.
        """
        return BeautifulSoup(text, "lxml").text

    def clean(self, text):
        """Run the full cleaning pipeline on one string.

        Steps, in order: add line breaks after paragraph closing tags, drop
        HTML tags, strip leading and trailing whitespace.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text.strip()

In [31]:
# Column names used by the cleaning and label-encoding cells below.
# Input text column:
text_column = 'text'
# Target label column:
label_column_name = 'jutsu'


In [33]:
# Clean Text: run every value of the text column through Cleaner.clean
# (HTML tags removed, surrounding whitespace stripped) and store the result
# in a new `text_cleaned` column.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column].apply(cleaner.clean)
df.head()

In [35]:
# Encode Labels: fit a LabelEncoder on the values of the label column.
le = preprocessing.LabelEncoder()
# The fit call is the cell's last expression, so its return value is displayed.
le.fit((df[label_column_name]).tolist())


In [36]:
# Map each encoded integer label back to its class name.
# `classes_` is the fitted encoder's public attribute, so read it directly
# instead of going through the instance `__dict__`.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

In [37]:
# Store the integer-encoded label for each row in a new `label` column.
df['label'] = le.transform(df[label_column_name].tolist())
df.head()

In [39]:
# Hold out 20% of the rows for evaluation.
test_size = 0.2
# Fixed seed so the split (and everything derived from it) is the same after
# every kernel restart.
random_state = 42
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],  # keep the class proportions equal in both splits
    random_state=random_state,
)

In [40]:
# Class counts in the training split (the split above is stratified on `label`).
df_train['jutsu'].value_counts()

In [44]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained below.
model_name = "distilbert/distilbert-base-uncased"

In [45]:
# Load the tokenizer for the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [50]:
def preprocess_function(tokenizer, examples):
    """Tokenize the cleaned text of a batch of examples.

    Args:
        tokenizer: Callable tokenizer; invoked with the texts plus the
            padding and truncation options below.
        examples: Mapping holding the texts under the ``'text_cleaned'`` key.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    cleaned_texts = examples['text_cleaned']
    # Pad every sequence to the maximum length and truncate longer ones.
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(cleaned_texts, **tokenizer_options)

In [51]:
# Convert the pandas splits into Hugging Face datasets.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


def _tokenize_batch(batch):
    """Tokenize one batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, batch)


# Tokenize both splits, one batch of rows at a time.
tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)
