# Dataset generation

In [7]:
import polars as pl
from pathlib import Path
from ollama import chat

In [8]:
# Number of text records to generate (one LLM call per record in the generation loop).
DATASET_LEN = 500
# Destination of the labelled dataset; written below as one JSON object per line.
DATASET_FILE = Path("./test.ndjson")

In [9]:
# Load the dataset with unique mountain names
# Reference: https://www.kaggle.com/datasets/codefantasy/list-of-mountains-in-the-world

# `unique()` does not preserve row order unless asked to; the order of `mn_lst`
# feeds the seeded random sampling later on, so it must be stable between runs
# for the generated dataset to be reproducible.
mn_df = pl.read_csv(Path("Mountain.csv"), columns=["Mountain"]).unique(maintain_order=True)
mn_lst = mn_df["Mountain"].to_list()
mn_df

Mountain
str
"""Mount Griggs"""
"""Diran"""
"""Kubi Gangri"""
"""Mount Massive"""
"""Dreiländerspitze"""
…
"""Liankang Kangri"""
"""Mount Moulton"""
"""Mount Lindsey"""
"""Ishpatina Ridge"""


## Dataset generation process

I will use Ollama to host an LLM locally (llama3.1).

To make the dataset diverse and reasonably 'realistic', I will use the following approach:

1. 40% of the data will be stories/articles/news/tweets about a mountain, __explicitly__ saying it is a mountain and mentioning its name
2. 5% of the data will be stories/articles/news/tweets about several mountains, __explicitly__ saying they are mountains and mentioning their names (2-4 different mountain names)
3. 15% of the data will be stories/articles/news/tweets mentioning the name of a mountain while __omitting__ the fact that it is a mountain, so that names like 'Twin Peaks' depend on their context (whether it is a TV show or a mountain name)
4. 40% of the data will be random stories/articles/news/tweets

Each text will be short: 1-2 sentences.

P.S. Local generation of dataset with 1500 records took approx. 15 min

In [10]:
from random import choices, randint, sample, seed
from tqdm import tqdm

# Set seed to maintain reproducible randomization
seed(42)

# Query types and their sampling weights, in the same order:
#   1 - single mountain, prompt says "mountain"
#   2 - 2-4 distinct mountains, prompt says "mountains"
#   3 - single mountain name, prompt does not say it is a mountain
#   4 - random topic, no mountain requested
QUERY_TYPE = [1, 2, 3, 4]
QUERY_WEIGHT = [0.4, 0.05, 0.15, 0.4]
QUERY_THEME = ["story", "article", "piece of news", "tweet"]

SEED_PROMPT = "You are writer, for every prompt respond with 1-2 sentences."

# Each entry is (query type, generated text, list of mountain names used in the prompt).
raw_dataset = []

for i in tqdm(range(DATASET_LEN)):
    qtype = choices(QUERY_TYPE, QUERY_WEIGHT, k=1)[0]
    qtheme = choices(QUERY_THEME, k=1)[0]

    if qtype == 1:
        mname = choices(mn_lst)
        query = f"Write a {qtheme} about mountain {mname[0]}"
    elif qtype == 2:
        # `sample` draws without replacement, so the 2-4 names are guaranteed
        # to be different (`choices` could return the same mountain twice).
        mname = sample(mn_lst, k=randint(2, 4))
        query = f"Write a {qtheme} about mountains {','.join(mname)}"
    elif qtype == 3:
        mname = choices(mn_lst)
        query = f"Write a {qtheme} about {mname[0]}"
    elif qtype == 4:
        mname = []
        query = f"Write a {qtheme} on a random topic"
    else:
        # Guards against QUERY_TYPE being extended without a matching branch,
        # which would otherwise silently reuse the previous iteration's query.
        raise ValueError(f"Unsupported query type: {qtype}")

    # The per-record seed option varies with the loop index so every request
    # uses a different, but repeatable, generation seed.
    response = chat(model="llama3.1", messages=[
        {"role": "system", "content": SEED_PROMPT},
        {"role": "user", "content": query},
    ], options={"seed": i})

    raw_dataset.append((qtype, response.message.content, mname))

100%|██████████| 500/500 [05:54<00:00,  1.41it/s]


## Transform raw dataset

1. Tokenize into generic tokens (text -> discrete words)
2. Label tokenized data

In [11]:
from nltk import RegexpTokenizer

# Entity tag names. raw_transform emits integer labels 0/1/2 — presumably
# indices into this list (0 -> "O", 1 -> "B-MNT", 2 -> "I-MNT"); confirm with the consumer.
etypes = ["O", "B-MNT", "I-MNT"]

# Tokens are maximal runs of word characters (pattern \w+), so punctuation and
# whitespace never appear as tokens.
tokenizer = RegexpTokenizer(r"\w+")

def raw_transform(record):
    """Tokenize one generated text and tag the mountain names it mentions.

    Args:
        record: Tuple ``(qtype, text, mnts)`` where ``text`` is the generated
            text and ``mnts`` is the list of mountain names used in the prompt.

    Returns:
        Tuple ``(qtype, tokens, labels)``. ``tokens`` are the lower-cased word
        tokens of ``text``; ``labels`` has one integer per token: 0 for tokens
        outside any name, 1 for the first token of a matched name and 2 for
        each following token of that name.
    """
    qtype, text, mnts = record
    tokens = tokenizer.tokenize(text.lower())
    labels = [0] * len(tokens)

    # Longest names first: a short name that also occurs inside a longer one
    # (e.g. "Massive" inside "Mount Massive") must not re-tag part of the
    # longer mention, and the result must not depend on the order of `mnts`.
    name_tokens = sorted(
        (tokenizer.tokenize(mnt.lower()) for mnt in mnts),
        key=len,
        reverse=True,
    )

    for mtokens in name_tokens:
        width = len(mtokens)
        if width == 0:
            # Name without any word characters: nothing to match.
            continue
        slabels = [1] + [2] * (width - 1)
        i = 0
        while i + width <= len(tokens):
            window_free = not any(labels[i:i + width])
            if window_free and tokens[i:i + width] == mtokens:
                labels[i:i + width] = slabels
                # Continue after the match so mentions never overlap each other.
                i += width
            else:
                i += 1
    return (qtype, tokens, labels)

# Tokenize and label every generated record, preserving the original order.
transformed_dataset = [raw_transform(raw_record) for raw_record in raw_dataset]

## Save dataset into the file

In [13]:
import json

# Mode "x" creates the file exclusively: it raises FileExistsError instead of
# overwriting a dataset that was already written.
with DATASET_FILE.open("x") as out_file:
    for query_type, token_list, label_list in transformed_dataset:
        # One JSON object per line (newline-delimited JSON).
        line = json.dumps({
            "qtype": query_type,
            "tokens": token_list,
            "labels": label_list,
        })
        out_file.write(line + "\n")