# Import a fine-tuned NER model from Hugging Face


In [1]:
# High-level route: let `pipeline` bundle tokenizer + model for token classification
from transformers import pipeline

pipe = pipeline(
    task="token-classification",
    model="yuridrcosta/gpt2-finetuned-ner",
)


Device set to use cuda:0


In [2]:
# Lower-level route: load the tokenizer and the token-classification model as
# separate objects. The same checkpoint is loaded again further down via `model_name`.
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(
    "yuridrcosta/gpt2-finetuned-ner"
)
model = AutoModelForTokenClassification.from_pretrained(
    "yuridrcosta/gpt2-finetuned-ner"
)


In [4]:
# Install the Hugging Face libraries used by this notebook into the kernel's environment.
# NOTE(review): this cell sits after cells that already import `transformers`, so on a
# fresh environment it has to be run first; package versions are not pinned.
%pip install transformers datasets


Note: you may need to restart the kernel to use updated packages.


In [7]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

def load_data_from_txt(file_path, encoding="utf-8"):
    """Read a whitespace-separated, one-token-per-line NER file.

    Each non-blank line is split on whitespace; the first field is taken as the
    token and the last field as its label (a line with a single field therefore
    uses that field for both). Blank lines separate sentences.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale.

    Returns:
        tuple: ``(sentences, labels)`` where ``sentences`` is a list of token
        lists and ``labels`` is the parallel list of label lists.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split()
                current_sentence.append(parts[0])
                current_labels.append(parts[-1])  # last field is the label
            elif current_sentence:
                # Blank line closes the sentence; consecutive blanks are ignored
                sentences.append(current_sentence)
                labels.append(current_labels)
                current_sentence = []
                current_labels = []

    # The file may not end with a blank line: flush the sentence still being
    # built, otherwise the final sentence would be silently dropped.
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

def sentences_to_dataframe(sentences, labels):
    """Pack parallel token/label lists into a two-column DataFrame.

    Args:
        sentences: List of token lists, one per sentence.
        labels: List of label lists, parallel to ``sentences``.

    Returns:
        pandas.DataFrame: one row per sentence with columns ``tokens`` and
        ``ner_tags``.
    """
    return pd.DataFrame(dict(tokens=sentences, ner_tags=labels))

def preprocess_data(file_path, test_size=0.2, random_state=42):
    """Load an NER text file and split it into train/validation datasets.

    Args:
        file_path: Path of the file passed to ``load_data_from_txt``.
        test_size: Fraction of sentences held out for validation (default 0.2).
        random_state: Seed for the shuffle performed by ``train_test_split``
            (default 42), so the split is reproducible.

    Returns:
        tuple: ``(train_dataset, valid_dataset)`` as Hugging Face ``Dataset``
        objects with columns ``tokens`` and ``ner_tags``.
    """
    sentences, labels = load_data_from_txt(file_path)
    df = sentences_to_dataframe(sentences, labels)

    # Split the DataFrame into train and validation sets
    train_df, valid_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # After the shuffle the frames carry a non-default index; without
    # preserve_index=False it would be stored as an extra
    # `__index_level_0__` column in each Dataset.
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    valid_dataset = Dataset.from_pandas(valid_df, preserve_index=False)

    return train_dataset, valid_dataset

# Build the train/validation datasets from the annotated text file
file_path = './ner_conll_format.txt'  # Update with your path
train_dataset, valid_dataset = preprocess_data(file_path=file_path)


In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint used for both the tokenizer and the token-classification model
model_name = "yuridrcosta/gpt2-finetuned-ner"  # Replace with your selected model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# The tokenizing step below pads every example to a fixed length, which raises
# if the tokenizer defines no padding token (the stock GPT-2 tokenizer does not).
# Fall back to the end-of-sequence token only when nothing is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): GPT-2 fast tokenizers presumably also need `add_prefix_space=True`
# to accept pre-split words (`is_split_into_words=True`) — confirm against this
# checkpoint's tokenizer config before training.


In [11]:
def tokenize_and_align_labels(examples, label2id=None, max_length=128):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Uses the notebook-level ``tokenizer``. Every sub-word token receives the tag
    of the word it came from; positions that belong to no word (``word_id`` is
    ``None``, e.g. padding) receive ``-100``, the index PyTorch's cross-entropy
    loss ignores by default.

    Args:
        examples: Batch with ``tokens`` (list of word lists) and ``ner_tags``
            (parallel list of tag lists), as passed by ``Dataset.map`` with
            ``batched=True``.
        label2id: Optional mapping from tag string to integer id. When omitted,
            string tags are looked up in the notebook-level
            ``model.config.label2id``. Integer tags are always kept as they are.
        max_length: Length every example is padded/truncated to (default 128).

    Returns:
        The tokenizer output with an added ``labels`` entry holding one list of
        integer label ids per example.

    Raises:
        KeyError: If a string tag is neither in the mapping nor an integer
            written as text.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=max_length,
    )

    def _to_label_id(tag):
        # Integer tags need no conversion; the mapping is resolved lazily so
        # integer-only data never touches the global model.
        if not isinstance(tag, str):
            return tag
        mapping = label2id if label2id is not None else model.config.label2id
        if tag in mapping:
            return mapping[tag]
        try:
            return int(tag)  # tags read from a text file may be ids stored as text
        except ValueError:
            raise KeyError(
                f"Unknown NER tag {tag!r}: not present in the label2id mapping"
            ) from None

    labels = []
    for i, word_tags in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(i)
        labels.append([
            -100 if word_id is None else _to_label_id(word_tags[word_id])
            for word_id in word_ids
        ])

    # Add labels to tokenized input
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [None]:
from transformers import TrainingArguments, Trainer

# Tokenize both splits and attach the aligned label ids. Without this step the
# `*_tokenized` datasets handed to the Trainer below are never defined.
train_dataset_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
valid_dataset_tokenized = valid_dataset.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir='./results',
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old keyword is deprecated and rejected by recent releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,                  # The initialized model
    args=training_args,           # Training arguments
    train_dataset=train_dataset_tokenized,  # Training dataset
    eval_dataset=valid_dataset_tokenized    # Evaluation dataset
)


In [None]:
# Start training: fine-tunes `model` using the Trainer configured in the previous cell.
# As the last expression of the cell, the value returned by `train()` is shown as output.
trainer.train()


In [None]:
# Evaluate the model with the same Trainer and keep the returned results in
# `eval_results` so later cells can inspect them
eval_results = trainer.evaluate()

# Print evaluation results
print(f"Evaluation results: {eval_results}")


In [None]:
def predict(model, tokenizer, text):
    """Predict an NER label for every token position of ``text``.

    The text is split on whitespace and encoded as pre-split words, so the
    result has one entry per position of the encoded sequence (sub-word tokens
    and any special tokens), not one per whitespace-separated word.

    Args:
        model: Token-classification model exposing ``config.id2label`` and
            returning an object with ``logits`` of shape (batch, seq, labels).
        tokenizer: Tokenizer matching ``model``.
        text: Raw sentence to tag.

    Returns:
        list: Predicted label names, looked up in ``model.config.id2label``.
        Empty when ``text`` contains no words.
    """
    import torch  # local import keeps this cell self-contained

    words = text.split()
    if not words:
        return []

    # A single sequence needs no padding, so none is requested; this also keeps
    # tokenizers without a pad token (e.g. stock GPT-2) working here.
    inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True, truncation=True)

    # After training the model may live on the GPU while the encoded inputs are
    # created on the CPU; move them to wherever the model is.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run in eval mode (dropout off) without building a graph, then restore the
    # caller's train/eval mode.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        if was_training:
            model.train()

    predictions = logits.argmax(dim=-1)  # predicted class index per position

    # id2label is a dict {class index: label name}; index it directly
    # (enumerating it would yield its keys, i.e. the indices themselves).
    id2label = model.config.id2label
    return [id2label[class_id] for class_id in predictions[0].tolist()]

# Demo: tag a placeholder sentence and show the label predicted for each token
sample_text = "Your sentence goes here."
predicted_labels = predict(model=model, tokenizer=tokenizer, text=sample_text)
print(predicted_labels)
