In [2]:
import pandas as pd
import re
import ast
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import classification_report

df_mountains = pd.read_csv('C:/ME/Coding/Data/Mountain/mountain_dataset_with_markers.csv')

#Function to remove hashtags
df_mountains['text'] = df_mountains['text'].apply(lambda x: re.sub(r'#\w+', '', x))

#Function to split words and included symbols
def custom_tokenize(text):
    return re.findall(r"\w+|[.,!?;]", text)

def generate_bio_labels(sentence, marker):
    words = custom_tokenize(sentence)
    labels = ["O"] * len(words)
    markers = ast.literal_eval(marker)
    
    for start, end in markers:
        char_index = 0
        for i, word in enumerate(words):
            word_start = char_index
            word_end = char_index + len(word)
            char_index = word_end + 1
            
            if word_start >= start and word_end <= end:
                if word_start == start:
                    labels[i] = "B-MOUNTAIN"
                else:
                    labels[i] = "I-MOUNTAIN"

    tokenized_text = "['" + "', '".join(words) + "']"
    tokenized_annotation = "['" + "', '".join(labels) + "']"
    return tokenized_text, tokenized_annotation

#Generating BIO labels and formatted tokens
df_mountains[['tokens', 'tags']] = df_mountains.apply(
    lambda row: generate_bio_labels(row['text'], row['markers']),
    axis=1, result_type='expand'
)
df_mountains.drop(columns=['text', 'markers'], inplace=True)

output_csv_filename = "C:/ME/Coding/Data/Mountain/data/mountain_new.csv"
df_mountains.to_csv(output_csv_filename, index=False, encoding="utf-8")

print(f"Tokenized dataset with BIO labels saved to {output_csv_filename}")


Tokenized dataset with BIO labels saved to C:/ME/Coding/Data/Mountain/data/mountain_new.csv


In [3]:
bio_df = pd.read_csv('C:/ME/Coding/Data/Mountain/data/mountain_new.csv')
annotated_df = pd.read_csv('C:/ME/Coding/Data/Mountain/annotated_sentences.csv')

bio_df = bio_df.rename(columns={'sentences': 'tokens', 'annotation': 'annotation'})
merged_df = pd.concat([bio_df, annotated_df], ignore_index=True)

output_csv_filename = "C:/ME/Coding/Data/Mountain/new_merged_mountain_dataset.csv"
merged_df.to_csv(output_csv_filename, index=False, encoding="utf-8")

print(f"Files successfully merged and saved to {output_csv_filename}")

Files successfully merged and saved to C:/ME/Coding/Data/Mountain/new_merged_mountain_dataset.csv


In [17]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch

#Loading tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)

df = pd.read_csv("C:/ME/Coding/Data/Mountain/new_merged_mountain_dataset.csv")
df['tokens'] = df['tokens'].apply(ast.literal_eval)
df['tags'] = df['tags'].apply(ast.literal_eval)

#Splitting Dataset for train and valitaion sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

#Assigning tags to integers
tag2id = {'O': 0, 'B-MOUNTAIN': 1, 'I-MOUNTAIN': 2}

#Align labels with tokens
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    label_index = 0
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        else:
            if label_index < len(labels):
                new_labels.append(tag2id[labels[label_index]])
            if len(new_labels) > 1 and word_id != word_ids[len(new_labels) - 2]:
                label_index += 1
    return new_labels

#Tokenize and align(adjust) labels
def tokenize_and_align_labels(df):
    tokenized_inputs = tokenizer(
        df['tokens'].tolist(),
        is_split_into_words=True,
        padding=True,
        truncation=True,
        return_offsets_mapping=True,
        max_length=512
    )
    labels = [
        align_labels_with_tokens(df['tags'].iloc[i], tokenized_inputs.word_ids(batch_index=i))
        for i in range(len(df))
    ]
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

train_tokenized_inputs = tokenize_and_align_labels(train_df)
val_tokenized_inputs = tokenize_and_align_labels(val_df)

#Creating Dataset Class, so PyTorch's DataLoader efficiently load in batches during training and correct formating
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):     #input_ids - tokenized words(token ID)
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = NERDataset(train_tokenized_inputs)
val_dataset = NERDataset(val_tokenized_inputs)

#DataCollator to make sure all sequences in a batch are the same length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

#Setting up the training arguments
training_args = TrainingArguments(
    output_dir='C:/ME/Coding/Data/Mountain/results_new',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='C:/ME/Coding/Data/Mountain/logs_new',
    learning_rate=2e-4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

#Trainer setup and begin the training proccess
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

trainer.train()

#Evaluation
outputs = trainer.predict(val_dataset)
predictions = np.argmax(outputs.predictions, axis=2)

#Prepares the model's predictions and true labels
true_labels_flat, predicted_labels_flat = [], []

for i in range(len(val_tokenized_inputs['labels'])):
    true_labels = val_tokenized_inputs['labels'][i]
    predicted_labels_seq = predictions[i]

    for j in range(len(true_labels)):
        if true_labels[j] != -100:  # Ignore special tokens
            true_labels_flat.append(true_labels[j])
            predicted_labels_flat.append(predicted_labels_seq[j])

#Print metrics (Precision, Recall, F1-score)
target_names = ['O', 'B-MOUNTAIN', 'I-MOUNTAIN']
print(classification_report(true_labels_flat, predicted_labels_flat, target_names=target_names))

#Save the model
output_dir = "C:/ME/Coding/Data/Mountain/model_new"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model and tokenizer saved successfully.")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.032408
2,No log,0.010002
3,No log,0.052338


              precision    recall  f1-score   support

           O       1.00      1.00      1.00      7854
  B-MOUNTAIN       0.98      0.93      0.95       138
  I-MOUNTAIN       0.98      0.97      0.97       160

    accuracy                           1.00      8152
   macro avg       0.99      0.97      0.98      8152
weighted avg       1.00      1.00      1.00      8152

Model and tokenizer saved successfully.


In [23]:
model_dir = "C:/ME/Coding/Data/Mountain/model"
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model = BertForTokenClassification.from_pretrained(model_dir)

#Assigning tags to integers
id2tag = {0: 'O', 1: 'B-MOUNTAIN', 2: 'I-MOUNTAIN'}

def predict(text):
    #Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    model.eval()    #Set model to evaluation mode and get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())    #Convert token IDs to actual tokens

    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()   #Get predicted labels
    predicted_labels = [id2tag[pred] for pred in predictions]
    token_label_pairs = list(zip(tokens, predicted_labels))      #Combine tokens and labels

    #Handling subword tokens
    combined_tokens_labels = []
    current_token = None
    current_label = "O"

    for token, label in token_label_pairs:
        if token.startswith("##"):         #Check if the token is a subword(##)
            if current_token:
                current_token += token[2:]
            else:
                current_token = token[2:]
        else:
            if current_token is not None:       #If it's existing token, append it with his label
                combined_tokens_labels.append((current_token, current_label))
            current_token = token
            current_label = label
    if current_token is not None:               #Last token-label pair if it exists
        combined_tokens_labels.append((current_token, current_label))
    return combined_tokens_labels

def extract_mountain_names(token_label_pairs):
    mountain_names = []
    current_mountain = []

    #mountain names with B-MOUNTAIN and I-MOUNTAIN labels
    for token, label in token_label_pairs:
        if label == "B-MOUNTAIN":
            if current_mountain:
                mountain_names.append(" ".join(current_mountain))
            current_mountain = [token]
        elif label == "I-MOUNTAIN" and current_mountain:
            current_mountain.append(token)
        else:
            if current_mountain:
                mountain_names.append(" ".join(current_mountain))
                current_mountain = []
    if current_mountain:
        mountain_names.append(" ".join(current_mountain))
    return mountain_names

text = input("Enter the text to analyze for mountain name: ")

token_label_pairs = predict(text)

mountain_names = extract_mountain_names(token_label_pairs)

print("\nNamed Entities Recognized as mountains name:")
for mountain_name in mountain_names:
    print(mountain_name)

print("\nTokens and labels predictions:")
for token, label in token_label_pairs:
    print(f'{token}: {label}')

Enter the text to analyze for mountain name:  Could you imagine, in the next month I will try to climb the Denali (Mount McKinley).



Named Entities Recognized as mountains name:
denali
mount mckinley

Tokens and labels predictions:
[CLS]: O
could: O
you: O
imagine: O
,: O
in: O
the: O
next: O
month: O
i: O
will: O
try: O
to: O
climb: O
the: O
denali: B-MOUNTAIN
(: O
mount: B-MOUNTAIN
mckinley: I-MOUNTAIN
): O
.: O
[SEP]: O
