In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition input folder and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input/nlp-data'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-data/train.json
/kaggle/input/nlp-data/test.json


In [3]:
import json
# Locations of the two JSON splits read by the loading step.
train_file_path, test_file_path = (
    "/kaggle/input/nlp-data/train.json",
    "/kaggle/input/nlp-data/test.json",
)
def load_data(file_path):
    """Read a JSON document from disk.

    Args:
        file_path: Path of the file to open; it is read as UTF-8 text.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
 
# Parse both splits up front; each variable holds the decoded JSON content of its file.
train_data = load_data(file_path=train_file_path)
test_data = load_data(file_path=test_file_path)

In [5]:
import pandas as pd
from datasets import Dataset, load_metric

def json_to_dataframe(json_data):
    """Wrap decoded JSON records in a Hugging Face ``Dataset``.

    The data is first loaded into a pandas ``DataFrame`` and then handed to
    ``Dataset.from_pandas``. Despite the function's name, the value returned
    is the ``Dataset``, not the intermediate ``DataFrame``.

    Args:
        json_data: Any input accepted by the ``pd.DataFrame`` constructor.

    Returns:
        The ``Dataset`` created from the intermediate frame.
    """
    return Dataset.from_pandas(pd.DataFrame(json_data))

# Convert each decoded split into a Dataset object for the later map/Trainer steps.
train_dataset = json_to_dataframe(json_data=train_data)
test_dataset = json_to_dataframe(json_data=test_data)

In [None]:
from transformers import DebertaTokenizerFast

# The tokenizer is later called with is_split_into_words=True (pre-split word lists).
# DebertaTokenizerFast only accepts pre-tokenized input when it was created with
# add_prefix_space=True; without it the tokenizer call fails.
tokenizer = DebertaTokenizerFast.from_pretrained(
    "microsoft/deberta-base",
    add_prefix_space=True,
)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    ``examples['tokens']`` is passed to the module-level ``tokenizer`` as
    already-split words, truncated and padded to 512 positions. For every
    sentence a label sequence of the same length as the tokenized input is
    built:

    * positions whose word id is ``None`` get ``-100``;
    * the first sub-token of each word gets that word's entry from
      ``examples['labels']``;
    * any further sub-token of the same word gets ``-100``.

    Args:
        examples: Batch mapping with a ``'tokens'`` entry (one word list per
            sentence) and a ``'labels'`` entry (one per-word label list per
            sentence).

    Returns:
        The tokenizer output with its ``'labels'`` entry set to the aligned
        label sequences.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=512
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples['labels']):
        word_ids = list(encoded.word_ids(batch_index=batch_idx))
        # Pair each position with the word id of the position before it
        # (None for the very first position) to detect word boundaries.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            word_labels[current]
            if current is not None and current != before
            else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    encoded['labels'] = aligned_labels
    return encoded

# Run the tokenize/align step over both splits in batched mode.
# Each name is rebound to the mapped result.
batched_map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize_and_align_labels, **batched_map_options)
test_dataset = test_dataset.map(tokenize_and_align_labels, **batched_map_options)

In [None]:
from transformers import DebertaForTokenClassification, TrainingArguments, Trainer

# Size the classification head to the number of distinct labels.
# NOTE(review): `unique_labels` is not defined in any cell visible in this notebook;
# it must exist in the kernel before this cell runs — confirm where it comes from.
num_label_classes = len(unique_labels)
model = DebertaForTokenClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=num_label_classes,
)

# Metric object whose compute() is called by compute_metrics.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric("seqeval")

def compute_metrics(eval_pred):
    """Score token-classification predictions with the module-level ``metric``.

    Positions whose reference label is ``-100`` are dropped. The remaining
    predicted ids and reference ids are both translated to tag names through
    ``label_list`` before being handed to the metric, so that predictions and
    references are expressed in the same vocabulary.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must
            support ``argmax(axis=2)``; ``labels`` holds the reference ids
            with ``-100`` marking positions to ignore.

    Returns:
        Whatever ``metric.compute`` returns for the decoded sequences.
    """
    scores, label_ids = eval_pred
    predicted_ids = scores.argmax(axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions that carry a real label.
        kept = [
            (pred, label)
            for pred, label in zip(predicted_row, label_row)
            if label != -100
        ]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        # Fix: references were previously passed as raw integer ids while
        # predictions were tag names; decode them through label_list as well.
        true_labels.append([label_list[label] for _, label in kept])

    return metric.compute(predictions=true_predictions, references=true_labels)

# Hyperparameters for the fine-tuning run, collected in one place before being
# handed to TrainingArguments.
training_options = {
    "output_dir": "./results",          # where the Trainer writes its outputs
    "evaluation_strategy": "epoch",     # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_options)

# Assemble the Trainer: the model and its arguments, the tokenized train split for
# fitting, the tokenized test split for evaluation, and the seqeval-based scorer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start fine-tuning.
trainer.train()

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer in an NER pipeline using the "simple"
# aggregation strategy.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Smoke-test the pipeline on a single hand-written sentence and show the raw output.
sentence = "Steve Jobs founded Apple in Cupertino."
predictions = ner_pipeline(sentence)
print(predictions)