In [None]:
# Mount Google Drive so the dataset archives under /content/drive are readable
# (Colab-only: this cell requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile
import os

# Source: https://www.kaggle.com/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/
# Repo copy: NLP/datasets/partial_datasets/ner.csv.zip
ner_extract_dir = "/content/various_sentences/"
ner_archive_path = "/content/drive/MyDrive/Datasets/NLP/ner.csv.zip"

# Unpack the NER corpus from Drive into the local Colab filesystem.
os.makedirs(ner_extract_dir, exist_ok=True)
with zipfile.ZipFile(ner_archive_path, "r") as zf:
    zf.extractall(ner_extract_dir)


In [None]:
import pandas as pd
import ast

# read csv to df 
def load_csv_data(data_path):
    """Read a CSV file into a DataFrame, dropping every row that contains a NaN.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame without incomplete rows. The original index
        labels are kept, so the index can have gaps where rows were dropped.
    """
    df = pd.read_csv(data_path).dropna()
    n_rows, n_cols = df.shape
    print("Number of rows : ",n_rows," and the number of columns : ",n_cols)
    return df

# Load the NER corpus extracted above; rows with missing values are dropped by load_csv_data.
ner_df = load_csv_data("/content/various_sentences/ner.csv")
# necessary to convert to usable format for df
def preprocess_ner_df(df):
    """Convert the stringified 'POS' and 'Tag' columns into real Python lists.

    In the CSV both columns hold the repr of a list (e.g. "['O', 'B-geo']").
    Each cell is parsed with ``ast.literal_eval``; POS entries are cast to
    ``str`` and Tag entries are additionally upper-cased.

    The conversion is column-wise, so it does not depend on the index being
    a contiguous 0..n-1 range (the index has gaps after ``dropna``). Cells
    that already hold a list are passed through, which keeps the notebook
    cell re-runnable.

    Args:
        df: DataFrame with string (or already parsed) 'POS' and 'Tag' columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    def _as_list(cell):
        # Only strings need parsing; a second run sees lists already.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    df['POS'] = df['POS'].map(lambda cell: [str(word) for word in _as_list(cell)])
    df['Tag'] = df['Tag'].map(lambda cell: [str(word.upper()) for word in _as_list(cell)])
    return df

# Parse the stringified list columns, then keep only the sentence text and its NER tags.
ner_df = preprocess_ner_df(ner_df)[['Sentence', 'Tag']]

ner_df.head()

Number of rows :  47959  and the number of columns :  4


Unnamed: 0,Sentence,Tag
0,Thousands of demonstrators have marched throug...,"[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,Families of soldiers killed in the conflict jo...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,They marched from the Houses of Parliament to ...,"[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO..."
3,"Police put the number of marchers at 10,000 wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,The protest comes on the eve of the annual con...,"[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,..."


In [4]:
# Pull both columns out as plain Python lists; the frame itself is not needed afterwards.
tag_values = [*ner_df["Tag"].values]
sentences = [*ner_df["Sentence"].values]
del ner_df
print(tag_values[:5])
print(sentences[:5])

[['O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'I-GEO', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'B-GEO', 'O']]
['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "', 'They marched from the Houses of Parliament to a rally in Hyde Park .', 'Police put the

In [5]:
# Collect every distinct tag that occurs anywhere in the corpus.
unique_tags = {tag for sentence_tags in tag_values for tag in sentence_tags}
print(unique_tags)

{'I-TIM', 'B-GPE', 'B-GEO', 'I-GEO', 'I-PER', 'B-EVE', 'I-EVE', 'B-ORG', 'B-TIM', 'O', 'B-PER', 'I-ORG', 'I-ART', 'B-NAT', 'I-NAT', 'I-GPE', 'B-ART'}



    geo = Geographical Entity

    org = Organization

    per = Person

    gpe = Geopolitical Entity

    tim = Time indicator

    art = Artifact

    eve = Event

    nat = Natural Phenomenon

In [None]:
# Keep only sentences without any geographical (GEO) entity: these serve as
# negative examples, so they must not mention places that could be mountains.
non_geo_sentences = [
    sentence
    for tag_l, sentence in zip(tag_values, sentences)
    if 'I-GEO' not in tag_l and 'B-GEO' not in tag_l
]

In [7]:
# Free the full corpus lists; only the filtered sentences are used from here on.
del tag_values, sentences
print(len(non_geo_sentences))

23548


In [None]:
import random

# The number of samples needed is known in advance, so sample early to reduce resource usage.
# Seeding the global RNG makes this sample (and every later shuffle/split that
# uses `random`) reproducible on Restart & Run All.
RANDOM_SEED = 42
NON_GEO_SAMPLE_SIZE = 16000
random.seed(RANDOM_SEED)

# random.sample already draws without replacement in random order, so a prior
# shuffle is unnecessary. min() avoids a ValueError when fewer sentences are
# available than requested.
non_geo_sentences = random.sample(
    non_geo_sentences, min(NON_GEO_SAMPLE_SIZE, len(non_geo_sentences))
)

In [None]:
import re
import random
import math
from transformers import BertTokenizerFast
import json

# Shared WordPiece tokenizer; the functions below read it as a module-level global.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_label_regular(tokenizer, sentence, tokens, labels):
    """Tokenize plain text word by word and tag every sub-token as "O".

    Args:
        tokenizer: Object exposing ``tokenize(word) -> list of tokens``.
        sentence: Text to split on whitespace and tokenize.
        tokens: List extended in place with the produced tokens.
        labels: List extended in place with one "O" per produced token.

    Returns:
        None; results are appended to ``tokens`` and ``labels``.
    """
    words = sentence.strip().split()
    for pieces in map(tokenizer.tokenize, words):
        tokens.extend(pieces)
        labels.extend(["O"] * len(pieces))

def process_sentence(sentence):
    """Tokenize an unmarked sentence with the global tokenizer; every token is labelled "O".

    Returns:
        A ``(tokens, labels)`` tuple of equally long lists.
    """
    tokens = []
    labels = []
    tokenize_and_label_regular(tokenizer, sentence, tokens, labels)
    return tokens, labels

def process_entry(entry):
    """Turn one raw sentence into a ``(tokens, labels)`` training sample."""
    sentence_tokens, sentence_labels = process_sentence(entry)
    return sentence_tokens, sentence_labels

# Tokenize every non-empty sentence, then release the raw strings and shuffle.
non_mount_data = [process_entry(entry) for entry in non_geo_sentences if entry]
del non_geo_sentences
random.shuffle(non_mount_data)

In [10]:
# Preview a few (tokens, labels) pairs to sanity-check the tokenization.
print(non_mount_data[:5])

[(['Sal', '##azar', 'said', 'the', 'End', '##anger', '##ed', 'Species', 'Act', 'was', 'not', 'the', '"', 'proper', 'mechanism', '"', 'for', 'dealing', 'with', 'climate', 'change', 'and', 'said', 'a', 'more', 'comprehensive', 'strategy', 'is', 'needed', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['The', 'government', 'has', 'requested', 'assistance', 'from', 'the', 'I', '##MF', 'and', 'from', 'the', 'African', 'Development', 'Bank', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['They', 'spent', 'a', 'night', 'in', 'jail', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O']), (['U', '.', 'S', '.', 'Secretary', 'of', 'State', 'Con', '##do', '##lee', '##zza', 'Rice', 'says', 'Israeli', 'and', 'Palestinian', 'representatives', 'are', 'discussing', 'peace', 'initiatives', 'behind', 'closed', 'doors', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [None]:
# Drive archive -> local extraction directory.
# Repo copies live under NLP/datasets/partial_datasets/:
#   synthetic_batch_1.zip, confusing_synthetic.zip, synthetic_batch_2.zip
synthetic_archives = {
    "/content/drive/MyDrive/Datasets/NLP/synthetic_batch1.zip": "/content/synthetic/",
    "/content/drive/MyDrive/Datasets/NLP/confusing_synthetic.zip": "/content/confusing_synthetic/",
    "/content/drive/MyDrive/Datasets/NLP/synthetic_batch2.zip": "/content/synthetic_batch2/",
}

for archive_path, target_dir in synthetic_archives.items():
    # exist_ok=True keeps the cell re-runnable; without it a second run
    # fails with FileExistsError. The batch-2 directory is now created
    # explicitly as well instead of relying on extractall to do it.
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(archive_path, "r") as zf:
        zf.extractall(target_dir)

In [None]:
import json

# Elevations in the sentences are delimited by || on both sides, e.g.:
# It was only a week since Jack climbed ||Big Bell Mountain|| and now he is heading to the ||Japanese Alps|| for the next mountain on his list.

# For every batch of 10 sentences (output from Llama) we need to extract these marked elevations.
# If the extracted name equals the name given in the prompt, it is a mountain; otherwise it is another elevation (range, plateau).

# Distortions of the name must be handled too, e.g. the possessive 's in: Kilimanjaro's snow-capped peaks are famous worldwide.

def get_v_bar_start_indexes(string):
    """Return the start offsets of every non-overlapping "||" marker in the string."""
    positions = []
    for marker in re.finditer(r'\|\|', string):
        positions.append(marker.start())
    return positions

def check_acceptable_distortion(entity_with_markers, entity_name, is_conf):
    """Tell whether the extracted text is the expected name or an accepted variant of it.

    Accepted variants are the name itself and the name followed by "'s",
    ",", "'s," or ".". When ``is_conf`` is true and the name has more than
    one word, its second word alone is accepted as well.

    Args:
        entity_with_markers: Text extracted from between the markers.
        entity_name: The name that was given in the prompt.
        is_conf: Whether the sentence comes from the "confusing" data set.

    Returns:
        True if the text is one of the accepted variants, else False.
    """
    accepted = [entity_name + suffix for suffix in ("'s", ",", "'s,", ".", "")]
    if is_conf:
        name_words = entity_name.split()
        if len(name_words) > 1:
            accepted.append(name_words[1])
    return entity_with_markers in accepted

def tokenize_and_label_regular(tokenizer, sentence, tokens, labels):
    """Tokenize non-entity text word by word, labelling every sub-token "O".

    Note: a function with the same name and logic is already defined in an
    earlier cell of this notebook; this definition shadows it.

    Args:
        tokenizer: Object exposing ``tokenize(word) -> list of tokens``.
        sentence: Text fragment to split on whitespace and tokenize.
        tokens: List extended in place with the produced tokens.
        labels: List extended in place with one "O" per produced token.
    """
    for word in sentence.strip().split():
        word_pieces = tokenizer.tokenize(word)
        piece_count = len(word_pieces)
        tokens.extend(word_pieces)
        labels.extend("O" for _ in range(piece_count))

def tokenize_and_label_elevation(tokenizer, string, tokens, labels, entity_name, is_conf):
    """Tokenize one ``||...||`` marked span and append BIO labels for it.

    If the marked text is the expected name or an accepted distortion of it
    (see ``check_acceptable_distortion``), the name tokens are labelled
    B-MOUNTAIN / I-MOUNTAIN and the trailing distortion ("'s", ",", "'s,",
    ".") is labelled "O". Any other marked text is labelled
    B-ELEVATION / I-ELEVATION as a whole.

    The name and its suffix are tokenized separately. The previous version
    tokenized them together and looked for the literal token "'s"; a BERT
    tokenizer splits that suffix into "'" and "s", so the check never
    matched and the suffix was labelled I-MOUNTAIN. It also labelled any
    "." or "," inside a name as "O".

    Args:
        tokenizer: Object exposing ``tokenize(text) -> list of tokens``.
        string: The marked span, including the ``||`` markers.
        tokens: List extended in place with the produced tokens.
        labels: List extended in place with one label per produced token.
        entity_name: The name that was given in the prompt.
        is_conf: Whether the sentence comes from the "confusing" data set.
    """
    entity = re.sub(r"\|\|", "", string).strip()

    if check_acceptable_distortion(entity, entity_name, is_conf):
        if entity.startswith(entity_name):
            name_part = entity[:len(entity_name)]
            suffix = entity[len(entity_name):]
        else:
            # Short form accepted for confusing data (second word of the
            # name only); it carries no distortion suffix.
            name_part, suffix = entity, ""

        name_tokens = tokenizer.tokenize(name_part)
        tokens.extend(name_tokens)
        labels.extend(
            "B-MOUNTAIN" if position == 0 else "I-MOUNTAIN"
            for position in range(len(name_tokens))
        )

        if suffix:
            suffix_tokens = tokenizer.tokenize(suffix)
            tokens.extend(suffix_tokens)
            labels.extend(["O"] * len(suffix_tokens))
    else:
        tokenized_entity = tokenizer.tokenize(entity)
        tokens.extend(tokenized_entity)
        # For an empty span this still emits one label, so tokens and labels
        # differ in length and process_sentence rejects the sentence.
        labels.extend(["B-ELEVATION"] + ["I-ELEVATION"] * (len(tokenized_entity) - 1))

def process_sentence(sentence, entity_name, is_conf):
    """Tokenize a sentence with ``||...||`` marked spans and build its labels.

    Text outside the markers is labelled "O"; each marked span is handed to
    ``tokenize_and_label_elevation``. Uses the module-level ``tokenizer``.

    Args:
        sentence: Sentence whose elevations are wrapped in ``||`` markers.
        entity_name: The name given in the prompt (surrounding whitespace is stripped).
        is_conf: Whether the sentence comes from the "confusing" data set.

    Returns:
        A ``(tokens, labels)`` tuple, or None when the markers are unbalanced
        or the result is empty or inconsistent (a message is printed).
    """
    entity_name = entity_name.strip()
    bar_positions = get_v_bar_start_indexes(sentence)

    # Markers must come in opening/closing pairs.
    if len(bar_positions) % 2:
        print(f"Error: Uneven count of bars in sentence: {sentence}")
        return None

    tokens, labels = [], []
    cursor = 0
    for opening, closing in zip(bar_positions[0::2], bar_positions[1::2]):
        span_end = closing + 2  # include the closing "||"
        if opening > cursor:
            tokenize_and_label_regular(tokenizer, sentence[cursor:opening], tokens, labels)
        tokenize_and_label_elevation(
            tokenizer, sentence[opening:span_end], tokens, labels, entity_name, is_conf
        )
        cursor = span_end

    # Remaining text after the last marked span.
    if cursor < len(sentence):
        tokenize_and_label_regular(tokenizer, sentence[cursor:], tokens, labels)

    if not (tokens and labels and len(tokens) == len(labels)):
        print(f"Invalid tokens or labels generated for sentence: {sentence}")
        return None

    return tokens, labels

def process_entry(entry, is_conf):
    """Convert one synthetic entry into a list of ``(tokens, labels)`` samples.

    An entry is an iterable of dicts mapping a name to its list of marked
    sentences. Invalid objects, invalid sentence lists and sentences that
    fail to process are reported with a print and skipped.

    Returns:
        The list of processed samples, or None if nothing could be processed.
    """
    sentences = []
    for obj in entry:
        if not (obj and isinstance(obj, dict)):
            print(f"Invalid object in entry: {obj}")
            continue
        for name, sentences_list in obj.items():
            if not isinstance(sentences_list, list):
                print(f"Invalid sentences list for name '{name}': {sentences_list}")
                continue
            for sentence in sentences_list:
                processed_sentence = process_sentence(sentence, name, is_conf)
                if not processed_sentence:
                    print(f"Failed to process sentence: {sentence}")
                    continue
                sentences.append(processed_sentence)
    return sentences or None

# Load every file of synthetic batch 1; files with invalid JSON are reported and skipped.
synthetic_data_dir = "/content/synthetic/"
synthetic_data = []
if not os.path.exists(synthetic_data_dir):
    print(f"Synthetic data directory not found: {synthetic_data_dir}")
else:
    for json_file in os.listdir(synthetic_data_dir):
        filename = os.path.join(synthetic_data_dir, json_file)
        with open(filename, "r") as f:
            try:
                synthetic_data.extend(json.load(f))
            except json.JSONDecodeError as e:
                print(f"Failed to load JSON file {json_file}: {e}")

# Convert every non-empty entry into (tokens, labels) samples.
converted_synthetic_data = []
for entry in filter(None, synthetic_data):
    converted_synthetic_data.extend(process_entry(entry, False) or [])

# 15% of the shuffled samples are reserved for validation; val_len is
# used again later when the combined splits are assembled.
random.shuffle(converted_synthetic_data)
val_len = math.ceil(len(converted_synthetic_data) * 0.15)
train_len = len(converted_synthetic_data) - val_len

print(f"Train size: {train_len}, Validation size: {val_len}")


Train size: 4530, Validation size: 800


In [None]:
# Batch 2 was generated with a higher temperature, so invalid samples are
# reported and skipped instead of raising errors or being repaired.

synthetic_data_batch2_dir = "/content/synthetic_batch2/"
synthetic_data_batch2 = []

# Load JSON files from the batch 2 directory
if not os.path.exists(synthetic_data_batch2_dir):
    print(f"Synthetic batch 2 directory not found: {synthetic_data_batch2_dir}")
else:
    for json_file in os.listdir(synthetic_data_batch2_dir):
        filename = os.path.join(synthetic_data_batch2_dir, json_file)
        try:
            with open(filename, "r") as f:
                synthetic_data_batch2.extend(json.load(f))
        except json.JSONDecodeError as e:
            print(f"Failed to load JSON file {json_file}: {e}")

# Process entries from batch 2
converted_synthetic_data_batch2 = []
for entry in synthetic_data_batch2:
    if not entry:
        continue
    processed_entry = process_entry(entry, False)
    if not processed_entry:
        print(f"Failed to process entry: {entry}")
        continue
    # Keep only well-formed (tokens, labels) pairs
    valid_entries = [pe for pe in processed_entry if pe and len(pe) == 2]
    if not valid_entries:
        print(f"Invalid processed entry: {processed_entry}")
        continue
    converted_synthetic_data_batch2.extend(valid_entries)

# Shuffle and split batch 2 data (15% validation)
random.shuffle(converted_synthetic_data_batch2)
val_len_batch2 = math.ceil(len(converted_synthetic_data_batch2) * 0.15)
train_len_batch2 = len(converted_synthetic_data_batch2) - val_len_batch2

print(f"Batch 2 - Train size: {train_len_batch2}, Validation size: {val_len_batch2}")


Error: Uneven count of bars in sentence: Big Bell Mountain|| stands out due to its distinct geographical features amidst the vast landscape of the ||Japanese Alps||.
Failed to process sentence: Big Bell Mountain|| stands out due to its distinct geographical features amidst the vast landscape of the ||Japanese Alps||.
Error: Uneven count of bars in sentence: It was only a week since Jack climbed Big Bell Mountain|| and now he is heading to the ||Japanese Alps|| for the next mountain on his list.
Failed to process sentence: It was only a week since Jack climbed Big Bell Mountain|| and now he is heading to the ||Japanese Alps|| for the next mountain on his list.
Error: Uneven count of bars in sentence: Big Bell Mountain||is an unassuming giant, its beauty lying in the subtleties of its rugged yet serene environment.
Failed to process sentence: Big Bell Mountain||is an unassuming giant, its beauty lying in the subtleties of its rugged yet serene environment.
Failed to process entry: [{'Fat

In [None]:
# The intuition behind the confusing sentences is easiest to see from an example (one of the samples):
# [
#     {
#         "name": "Mount Vernon",
#         "sentences_not_about_mountain": [
#             "Vernon's art studio was filled with vibrant colors and eclectic pieces that reflected her personality.",
#             "The Vernon family has been farming this land for generations, cultivating a rich heritage.",
#             "Professor Vernon's lectures on history were always engaging, making complex topics accessible to her students.",
#             "The Vernon brothers started their own business, specializing in sustainable and eco-friendly products.",
#             "Vernon's poetry collection explored themes of love, loss, and the human condition."
#         ],
#         "sentences_about_mountain_implied_from_context": [
#             "The ||Vernon||'s grandeur was a sight to behold, its snow-capped peak glistening in the morning sun.",
#             "As the sun dipped below the horizon, the ||Vernon|| range was set ablaze with a kaleidoscope of colors.",
#             "The ||Vernon||'s rugged terrain and unpredictable weather made it a formidable challenge for even the most experienced climbers."
#         ],
#         "sentences_directly_about_mountain": [
#             "||Mount Vernon|| is a beloved destination for those seeking a mix of adventure and historical significance.",
#             "The scenic trails surrounding ||Mount Vernon|| offer breathtaking views and a chance to connect with nature."
#         ]
#     }
# ]

def process_conf_entry(entry):
    """Convert one "confusing" entry into three lists of ``(tokens, labels)`` samples.

    Each object of the entry holds a ``"name"`` plus up to three sentence
    lists, keyed ``"sentences_not_about_mountain"``,
    ``"sentences_about_mountain_implied_from_context"`` and
    ``"sentences_directly_about_mountain"``.

    Sentences that ``process_sentence`` rejects (it returns None, e.g. for
    unbalanced ``||`` markers) are reported and skipped; unpacking that
    None used to raise a TypeError. Objects that are not dicts or have no
    ``"name"`` are reported and skipped as well.

    Args:
        entry: Iterable of objects as described above.

    Returns:
        A tuple ``(not_about_mountain, implied_from_context,
        directly_about_mountain)`` of sample lists.
    """
    # One bucket per sentence category, in the order of the returned tuple.
    buckets = {
        "sentences_not_about_mountain": [],
        "sentences_about_mountain_implied_from_context": [],
        "sentences_directly_about_mountain": [],
    }

    for obj in entry:
        if not obj:
            continue
        if not isinstance(obj, dict) or "name" not in obj:
            print(f"Invalid object in entry: {obj}")
            continue
        name = obj['name']
        for key, bucket in buckets.items():
            # A missing or null category simply contributes no sentences.
            for sentence in obj.get(key) or []:
                processed = process_sentence(sentence, name, is_conf=True)
                if processed is None:
                    print(f"Failed to process sentence: {sentence}")
                    continue
                bucket.append(processed)

    return (
        buckets["sentences_not_about_mountain"],
        buckets["sentences_about_mountain_implied_from_context"],
        buckets["sentences_directly_about_mountain"],
    )

import random
import math

# Load every file of the confusing synthetic data set.
conf_synthetic_data_dir = "/content/confusing_synthetic"
conf_synthetic_data = []
for json_file in os.listdir(conf_synthetic_data_dir):
    filename = os.path.join(conf_synthetic_data_dir, json_file)
    with open(filename, "r") as f:
        conf_synthetic_data.extend(json.load(f))

# Collect the samples per sentence category across all entries.
sentences_not_mount, sentences_mount_from_context, sentences_directly_mount = [], [], []
for entry in conf_synthetic_data:
    if not entry:
        continue
    not_mount, mount_from_context, directly_mount = process_conf_entry(entry)
    sentences_not_mount.extend(not_mount)
    sentences_mount_from_context.extend(mount_from_context)
    sentences_directly_mount.extend(directly_mount)

# Shuffle each category on its own so it can be split separately later.
random.shuffle(sentences_not_mount)
random.shuffle(sentences_mount_from_context)
random.shuffle(sentences_directly_mount)

print(len(sentences_not_mount), len(sentences_mount_from_context), len(sentences_directly_mount))

240 144 96


In [None]:
# Concatenate all shuffled synthetic data into the validation and training sets.
# Every source is split on its own (first 15% -> validation, rest -> training),
# so each source is represented proportionally in both sets.

val_not_mount_len = math.ceil(len(sentences_not_mount)*0.15)
val_mount_from_context_len = math.ceil(len(sentences_mount_from_context)*0.15)
val__directly_mount_len = math.ceil(len(sentences_directly_mount)*0.15)

val_samples = (
    converted_synthetic_data[:val_len]
    + sentences_not_mount[:val_not_mount_len]
    + sentences_mount_from_context[:val_mount_from_context_len]
    + sentences_directly_mount[:val__directly_mount_len]
    + converted_synthetic_data_batch2[:val_len_batch2]
)

train_samples = (
    converted_synthetic_data[val_len:]
    + sentences_not_mount[val_not_mount_len:]
    + sentences_mount_from_context[val_mount_from_context_len:]
    + sentences_directly_mount[val__directly_mount_len:]
    + converted_synthetic_data_batch2[val_len_batch2:]
)

In [16]:
# Sizes of the combined synthetic splits. NOTE: this overwrites the batch-1
# val_len/train_len, so the split cell above must not be re-run after this
# one without first re-running the batch-1 cell.
train_len, val_len = len(train_samples), len(val_samples)
print(train_len, val_len)

10297 1820


In [None]:
# Add plain (all-"O") sentences so that they make up 50% of each split:
# as many non-mountain samples as there are synthetic ones.
non_mount_data = random.sample(non_mount_data, train_len + val_len)

# The sample holds exactly train_len + val_len items. The first val_len go to
# validation and the REMAINING train_len go to training. Slicing both from
# index 0 (as before) put the same val_len sentences into both splits and
# leaked validation data into training.
val_samples += non_mount_data[:val_len]
train_samples += non_mount_data[val_len:]

print(len(train_samples), len(val_samples))

20594 3640


In [None]:
def save_to_json(samples, output_path):
    """Write tokenized samples to a JSON file for fine-tuning.

    Args:
        samples: Iterable of ``(tokens, labels)`` pairs.
        output_path: Destination file; it is overwritten if it exists.

    The file holds a list of objects with the keys ``tokens`` and
    ``labels``, indented by 4 spaces.
    """
    records = []
    for tokens, labels in samples:
        records.append({"tokens": tokens, "labels": labels})

    with open(output_path, "w") as out_file:
        json.dump(records, out_file, indent=4)
# Persist both splits.
# In repo: NLP/datasets/training_dataset.json and NLP/datasets/val_dataset.json
for split_samples, output_path in (
    (train_samples, "/content/training_dataset.json"),
    (val_samples, "/content/val_dataset.json"),
):
    save_to_json(split_samples, output_path)
