## imports

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


## (1) LOAD DATASET

In [2]:
# Use forward slashes: they resolve on every OS, whereas a backslash-separated
# path only works on Windows and "\d" / "\j" are invalid escape sequences in a
# normal (non-raw) string literal.
data_path = "../data/jutsus.jsonl"
# lines=True: the file is read as JSON Lines (one JSON object per line)
df = pd.read_json(data_path, lines=True)
df.head(10)

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ..."
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u..."
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ..."
5,Accelerated Armed Revolving Heaven,"Kekkei Genkai, Hiden, Ninjutsu, Fūinjutsu, Tai...",Tenten unseals several weapons from her scroll...
6,Absorption Sphere,Ninjutsu,"Using the Jutsu Absorption Arm, the user creat..."
7,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...
8,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...
9,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic..."


In [3]:
# Collapse the raw jutsu types of one row into a single coarse class label
def simplify_jutsu(jutsu):
    """Map a raw ``jutsu_type`` value to one of three coarse classes.

    The classes are checked in priority order (Genjutsu, then Ninjutsu, then
    Taijutsu), so a value mentioning several of them yields the first match.

    Args:
        jutsu: Raw type value, e.g. ``"Kekkei Genkai, Ninjutsu, Taijutsu"``.

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when the
        value is missing (``None``/NaN) or mentions none of the three.
    """
    # Missing values arrive as None or float NaN; `in` on those raises TypeError
    if jutsu is None or isinstance(jutsu, float):
        return None

    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if category in jutsu:
            return category

    # No known category: return None explicitly so such rows can be dropped with dropna()
    return None
     

In [4]:
# Derive one coarse label per row by running simplify_jutsu() over the raw type column
simplified_types = df['jutsu_type'].apply(simplify_jutsu)
df['jutsu_type_simplified'] = simplified_types

In [5]:
# Preview the first rows to confirm the jutsu_type_simplified column was added
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...,Ninjutsu
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...,Ninjutsu
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ...",Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u...",Ninjutsu
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...",Taijutsu


In [6]:
# Count the rows in each simplified class to check how balanced the dataset is
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2255
Taijutsu     397
Genjutsu     101
Name: count, dtype: int64

The dataset above is skewed (imbalanced): Ninjutsu has 2255 rows (data points / samples), while Taijutsu has 397 and Genjutsu has 101, so one class (Ninjutsu) has significantly more samples than the other classes. An unbalanced dataset like this causes problems for the model, because it can lead to output that is biased in favour of Ninjutsu. So we need to solve this and make the dataset balanced.

In [7]:
# Build the model input by joining name and description, keep it next to the
# simplified label, and discard rows where either value is missing
df = (
    df.assign(
        text=df['jutsu_name'] + " " + df['jutsu_description'],
        jutsu=df['jutsu_type_simplified'],
    )
    .loc[:, ['text', 'jutsu']]
    .dropna()
)


In [8]:
# Inspect the reduced frame, which now holds only the text and jutsu columns
df.head()

Unnamed: 0,text,jutsu
0,Adamantine Sealing Chains: Spiral Formation Ku...,Ninjutsu
1,Adamantine Power: Acala Hashirama kicks the op...,Ninjutsu
2,Adamantine Prison Wall After using Transformat...,Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression After...,Ninjutsu
4,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k...",Taijutsu


In [9]:
from bs4 import BeautifulSoup
# Keeping the cleaning steps in one small class keeps the text preprocessing reusable
class Cleaner():
    """Remove HTML tags and surrounding whitespace from a text string."""

    def put_line_breaks(self, text):
        """Insert a newline after every closing ``</p>`` tag.

        This keeps paragraphs on separate lines once the tags are stripped.
        """
        # The closing paragraph tag is "</p>"; searching for "<\p>" (backslash)
        # never matches real HTML, so no line break would ever be inserted.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with all HTML tags removed."""
        return BeautifulSoup(text, "lxml").text

    def clean(self, text):
        """Run the full pipeline: paragraph breaks, tag removal, then strip."""
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text.strip()

In [10]:
# Column names used by the cleaning and label-encoding steps
text_column_name, label_column_name = 'text', 'jutsu'

# Strip HTML from every row and store the result next to the raw text
cleaner = Cleaner()
cleaned_texts = df[text_column_name].apply(cleaner.clean)
df['text_cleaned'] = cleaned_texts


  clean_text = BeautifulSoup(text, "lxml").text


In [11]:
df.head(5)

Unnamed: 0,text,jutsu,text_cleaned
0,Adamantine Sealing Chains: Spiral Formation Ku...,Ninjutsu,Adamantine Sealing Chains: Spiral Formation Ku...
1,Adamantine Power: Acala Hashirama kicks the op...,Ninjutsu,Adamantine Power: Acala Hashirama kicks the op...
2,Adamantine Prison Wall After using Transformat...,Ninjutsu,Adamantine Prison Wall After using Transformat...
3,Adamantine Seal: Monkey Yang Suppression After...,Ninjutsu,Adamantine Seal: Monkey Yang Suppression After...
4,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k...",Taijutsu,"Acrobat The Acrobat (荒繰鷺伐刀, Akurobatto) is a k..."


In [12]:
# Encode the string class labels as integer ids so the targets are numeric.
# LabelEncoder numbers the classes from 0 in sorted order
# (here: Genjutsu=0, Ninjutsu=1, Taijutsu=2).
label_values = df[label_column_name].tolist()
le = preprocessing.LabelEncoder()
le.fit(label_values)

In [13]:
# Build an id -> class-name lookup from the fitted encoder.
# classes_ is the encoder's public fitted attribute, so read it directly
# instead of reaching into le.__dict__.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [14]:
# Add the integer class id for every row using the fitted encoder
encoded_labels = le.transform(df[label_column_name].tolist())
df['label'] = encoded_labels

In [None]:
# Preview the frame, which now includes the encoded label column
df.head()

In [15]:
test_size = 0.2
random_state = 42  # fixed seed so the split is identical on every run

# stratify= ensures every class keeps the same proportion in both splits,
# i.e. each of the three classes is divided 80% train / 20% test
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],
    random_state=random_state,
)

This keeps 80% of the data of ALL THREE CLASSES in the train set:
Train (80%) --> 80% of each class (Ninjutsu, Taijutsu and Genjutsu)
Test (20%) --> the remaining 20% of each class


In [16]:
# Class counts in the training split; with stratify they follow the full dataset's proportions
df_train['jutsu'].value_counts()

jutsu
Ninjutsu    1804
Taijutsu     317
Genjutsu      81
Name: count, dtype: int64

In [17]:
# Name of the pretrained checkpoint passed to AutoTokenizer.from_pretrained below
model_name = "distilbert/distilbert-base-uncased"

In [18]:
# Load the tokenizer that belongs to the chosen pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [19]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``text_cleaned`` field of a batch of examples.

    Args:
        tokenizer: Callable tokenizer; invoked with the texts and ``truncation=True``.
        examples: Mapping that holds the cleaned texts under ``'text_cleaned'``.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    cleaned_texts = examples['text_cleaned']
    # truncation=True cuts over-long texts down to the tokenizer's maximum
    # length (512 tokens for DistilBERT) so they do not break the model
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [20]:
def tokenize_batch(examples):
    """Tokenize one batch of rows with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


# Convert the pandas splits into Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Tokenize both splits batch-wise with the same helper
tokenized_train = train_dataset.map(tokenize_batch, batched=True)
tokenized_test = test_dataset.map(tokenize_batch, batched=True)

Map: 100%|██████████| 2202/2202 [00:00<00:00, 15073.95 examples/s]
Map: 100%|██████████| 551/551 [00:00<00:00, 18349.62 examples/s]
