# Import necessary packages

In [1]:
import openai
import pandas as pd
from html.parser import HTMLParser
import urllib.request
from deep_translator import GoogleTranslator
import nltk
import json
import numpy as np
import glob
import os

# Fetch the 'punkt' NLTK package; when it is already present this only reports that it is up to date.
nltk.download('punkt')
# Lift the display limits so DataFrames are shown with untruncated cell text and without row truncation.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_row', None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ihorkostiuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the OpenAI key from the environment so the secret is never stored in the notebook.
# NOTE: any key that was ever committed in this cell should be treated as leaked and revoked.
API_KEY = os.environ.get('OPENAI_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')
openai.api_key = API_KEY

# Idea

Unfortunately, I have not found any suitable dataset for this task so far. To overcome this, I will generate a dataset on my own using the ChatGPT API. The main idea is to extract the names of all popular mountains and pass them to ChatGPT so that it generates sentences containing them. It also has to be said that ChatGPT has strong biases and can generate very similar results (duplicates), which can greatly affect the quality of the dataset.

To resolve this issue, I will also gather a decent amount of data from articles written by people (or at least I think they are written by people, who knows). These sentences will be included in the resulting dataset.

In the end we will have three columns in our dataset: Sentence, Tags, and Source. Source indicates whether a sentence was written by a human or generated by GPT (so there are two classes: HUMAN and GPT). The tags are B-mount, I-mount, and O.

<img src='./misc/Data Generation.jpg'/>

# Extracting mountain names

First, to generate the desired sentences we need to extract the names of known mountains around the world. To accomplish this, we will define a custom HTML parser and parse the list of mountains on Wikipedia. In the end we will have 1500 names of the most popular mountains!

## Defining useful functions

In [3]:
class CustomParser(HTMLParser):
    def __init__(self, dataset: list, **kwargs):
        super().__init__(**kwargs)

        self.in_table = False
        self.first = False
        self.dataset = dataset
    
    
    def handle_data(self, data: str) -> None:
        if self.in_table and self.first:
            if data.strip() != '':
                self.dataset.append(data)
                self.first = False


    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag == 'tbody':
            self.in_table = True
        
        if tag == 'tr' and self.in_table:
            self.first = True


    def handle_endtag(self, tag: str) -> None:
        if tag == 'tbody':
            self.in_table = False


def extract_webpage(url: str) -> str:
    """Download `url` and return its body decoded as UTF-8 text.

    The response is opened in a `with` block so the connection is closed even
    when reading fails part-way through.

    Raises whatever `urllib.request.urlopen` raises for an unreachable URL, and
    `UnicodeDecodeError` when the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url=url) as response:
        raw_page = response.read()

    return raw_page.decode('utf8')

## Extracting all names

Along the way I will also encounter names with non-ASCII symbols; these will be converted with the GoogleTranslator class from the 'deep_translator' Python package. Also, some of the mountain names on the Wikipedia page include something in '()', which can be an alternative name or a nearby location, so I will not include that part in the result.

In [None]:
# Download the Wikipedia list and collect the first text of every table row.
mountains_page = 'https://en.wikipedia.org/wiki/List_of_mountains_by_elevation'
html_page = extract_webpage(url=mountains_page)
test_data = []
parser = CustomParser(dataset=test_data)
parser.feed(html_page)

translator = GoogleTranslator(source='auto', target='english')
# Keep only the part before the first '(' (drops alternative names / locations).
test_data = [name.split('(')[0].strip() for name in test_data]
# Skip the 'Mountain' header cells and names that became empty after trimming;
# non-ASCII names are passed through the translator.
result_data = [
    name if name.isascii() else translator.translate(name)
    for name in test_data
    if name and name != 'Mountain'
]

data = pd.DataFrame(result_data, columns=['Mountain'])
# Write where the generation step reads the names from ('./data/mountain_names.csv').
os.makedirs('./data', exist_ok=True)
data.to_csv('./data/mountain_names.csv', index=False)

# Generating NER dataset using Python

Now I will use the OpenAI API to generate sentences with ChatGPT. Then I will produce the tag encoding for each of these sentences. To generate the sentences I will use from 1 to 3 names, randomly drawn from the dataset of names.

## Defining useful functions

In [3]:
# Mountain names produced by the extraction step.
data = pd.read_csv('./data/mountain_names.csv')

# Personas for the system prompt; ask_openai() picks one at random per request.
system_role = [
    'adventurer',
    'journalist',
    'from National Geographic',
    'author',
    'scientist',
    'climber',
    'naturalist',
]

# Activities appended to the persona in the system prompt ("You are <role> <action>.").
actions = [
    'travveling around the globe',
    'writing an article',
    'describing what you see',
    'expressing your feelengs',
    'embracing nature',
]


def ask_openai(prompt: str) -> str:
    '''
        Send `prompt` as the user message of a chat completion and return the text of the first choice.
    The system message is "You are <role> <action>.", with role and action drawn at random from the
    module-level `system_role` and `actions` lists. The chosen pair is printed before the request.
    '''
    persona = np.random.choice(system_role)
    activity = np.random.choice(actions)

    print(f'Role: {persona}, action: {activity}')

    chat_messages = [
        {'role': 'system', 'content': f'You are {persona} {activity}.'},
        {'role': 'user', 'content': prompt},
    ]

    reply = openai.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=chat_messages,
        max_tokens=512,  # Response length cap (kept low for cost reasons).
        temperature=0.5,
    )

    return reply.choices[0].message.content


def get_random_names(names: pd.DataFrame) -> pd.DataFrame:
    '''
        Return 1, 2 or 3 randomly chosen rows of `names` (with probabilities 0.7, 0.2 and 0.1).
    Rows are drawn without replacement, so the same mountain is never returned twice in one draw,
    and they are selected by position, so the frame does not need a 0..n-1 index.
    When `names` has fewer rows than requested, all available rows are drawn; an empty frame
    yields an empty frame.
    '''
    if len(names) == 0:
        return names.iloc[0:0]

    requested = int(np.random.choice([1, 2, 3], p=[0.7, 0.2, 0.1]))
    count = min(requested, len(names))
    positions = np.random.choice(len(names), size=count, replace=False)

    return names.iloc[positions]


def log_json(json_str, file: str='./data/logs/log.txt'):
    '''
        This function is for saving strings in case something goes wrong.
    `json_str` is appended to `file` followed by a newline. The parent directory of `file`
    is created when it does not exist yet, so the first write does not fail.
    '''
    folder = os.path.dirname(file)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(file=file, mode='a') as f:
        f.write(json_str + '\n')


def generating_loop(template: str=None, iterations: int=20, sentence_num: int=5, names_dataset: pd.DataFrame=None) -> pd.DataFrame:
    '''
        This function goes in a loop for `iterations` times. Each iteration draws `sentence_num`
    random sets of mountain names, asks the model for one sentence per set and collects the entries
    found under the 'sentences' key of the JSON answer.

    template: prompt with a `{lists}` placeholder that receives the drawn name sets; the built-in
        prompt is used when it is None. It is filled with `str.format`, so literal braces in a
        custom template must be doubled.
    names_dataset: DataFrame with a 'Mountain' column; required.

    A response that is not valid JSON, or that has no 'sentences' list, is printed and skipped
    instead of aborting the run. Usable responses are appended to the log file through log_json().
    Returns a DataFrame built from the collected entries (one row per entry).
    '''
    if names_dataset is None:
        raise ValueError('names_dataset is required: pass a DataFrame with a "Mountain" column.')

    default_template = '''Using the following mountain names: {lists}. For each set of names generate sentence with them. Answer generate in JSON format with 'sentence', 'names' keywords.
                    All mountain entities must be included in 'names'.'''
    prompt_template = default_template if template is None else template

    result = []

    for itr_index in range(iterations):
        name_sets = []
        for sentence_index in range(sentence_num):
            names = get_random_names(names_dataset)
            name_sets.append(names['Mountain'].values.tolist())

        prompt = prompt_template.format(lists=name_sets)

        response = ask_openai(prompt=prompt)
        try:
            loaded_response = json.loads(response)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a response without text content (None).
            print('Caught an exception: ', e)
            continue
        finally:
            print(response)

        entries = loaded_response.get('sentences') if isinstance(loaded_response, dict) else None
        if not isinstance(entries, list):
            print('Skipping response without a "sentences" list.')
            continue

        try:
            log_json(response)
        except OSError as e:
            # Logging is best effort; keep the parsed entries even if the log cannot be written.
            print('Could not write the response to the log: ', e)

        result.extend(entries)

    return pd.DataFrame(result)

## Functions for postprocessing responses

In [4]:
def append_tags(names: list, sentence: str, dataframe: pd.DataFrame) -> pd.DataFrame:
    '''
        Tokenize `sentence`, tag every token with B-mount / I-mount / O and append the row
    [names, sentence, tags] to `dataframe` (modified in place and returned).

    A mountain is tagged only where the complete token sequence of one of `names` occurs in the
    sentence: its first token gets B-mount and the remaining ones I-mount. Matching whole
    sequences keeps ordinary words that merely appear inside a name (e.g. "of", "the") from
    being tagged elsewhere in the sentence, and makes two adjacent mountains start with their
    own B-mount. Longer names are tried first so that they win over shorter names they contain.
    A partial mention of a name is therefore left as O unless it is listed in `names` itself.
    '''
    begin_t = 'B-mount'
    inter_t = 'I-mount'
    zero_t = 'O'

    # A bare string would otherwise be iterated character by character.
    name_list = [names] if isinstance(names, str) else list(names)
    tokenized_names = [nltk.word_tokenize(name) for name in name_list]
    tokenized_names = sorted((tokens for tokens in tokenized_names if tokens), key=len, reverse=True)

    words = nltk.word_tokenize(sentence)
    result = [zero_t] * len(words)

    position = 0
    while position < len(words):
        span = 0
        for name_tokens in tokenized_names:
            if words[position:position + len(name_tokens)] == name_tokens:
                span = len(name_tokens)
                break

        if span == 0:
            position += 1
            continue

        result[position] = begin_t
        for offset in range(1, span):
            result[position + offset] = inter_t
        position += span

    dataframe.loc[len(dataframe)] = [names, sentence, result]

    return dataframe


def add_entries(entries: str, dataframe: pd.DataFrame) -> pd.DataFrame:
    '''
        Append one [names, sentence] row to `dataframe` for every item of entries['sentences'].
    Despite the annotation, `entries` is indexed like a mapping. The frame is extended in place
    (new row label = current length) and returned.
    '''
    for item in entries['sentences']:
        new_row = [item['names'], item['sentence']]
        next_label = len(dataframe)
        dataframe.loc[next_label] = new_row

    return dataframe


def _parse_concatenated_json(content: str) -> list:
    '''
        Decode JSON values that simply follow each other in `content`, separated by
    whitespace and/or commas, and return them as a list.
    '''
    decoder = json.JSONDecoder()
    entries = []
    position = 0
    length = len(content)

    while True:
        # Skip the separators the log writers put between responses.
        while position < length and content[position] in ' \t\r\n,':
            position += 1
        if position >= length:
            break
        entry, position = decoder.raw_decode(content, position)
        entries.append(entry)

    return entries


def load_logs(filename: str) -> list:
    '''
        Read a log file and return its entries as a list.
    A file that is one valid JSON document is loaded as is; a single JSON object is wrapped
    in a list so that callers can always iterate over entries. A file made of several JSON
    documents written one after another (optionally followed by commas, which is what the
    log writers in this notebook produce) is decoded document by document.
    Raises json.JSONDecodeError when the content is not JSON at all.
    '''
    with open(filename, 'r') as f:
        content = f.read()

    try:
        loaded = json.loads(content)
    except json.JSONDecodeError:
        loaded = _parse_concatenated_json(content)

    return [loaded] if isinstance(loaded, dict) else loaded


def generate_tags(logs_filename: str) -> pd.DataFrame:
    '''
        Build the tagged dataset from a log file.
    Every log entry is flattened into (names, sentence) rows with add_entries(), then each row
    is tagged with append_tags(). Returns a DataFrame with columns names, sentence, tags.
    '''
    log_entries = load_logs(logs_filename)

    collected = pd.DataFrame(columns=['names', 'sentence'])
    for log_entry in log_entries:
        collected = add_entries(entries=log_entry, dataframe=collected)

    tagged = pd.DataFrame(columns=['names', 'sentence', 'tags'])
    # Rows are labelled 0..n-1 by add_entries, so walking the columns visits them in label order.
    for row_names, row_sentence in zip(collected['names'], collected['sentence']):
        tagged = append_tags(row_names, row_sentence, tagged)

    return tagged

In [6]:
def extract_entities_openai(sentences):
    '''
        Ask the chat model to find mountain names and ranges in `sentences` and return the raw
    text of its first choice. The prompt embeds the sentences and a JSON example of the expected
    answer ({"sentences": [{"names": [...], "sentence": "..."}]}); the request uses temperature 0.
    '''
    template = 'In the sentences list: {sentences} for each sentence find entities of mountain names, ranges and give them in JSON format like {example}, where names - is the entities, and sentence - is the sentence containing those entities.'

    # Answer format shown to the model.
    example ='''{
                    "sentences": [
                        {
                        "names": ["Shiprock"],
                        "sentence": "Shiprock rises majestically from the desert landscape, a striking solitary peak."
                        },
                        {
                        "names": ["Tödi"],
                        "sentence": "Tödi, also known as Piz Russein, is the highest mountain in the Glarus Alps of Switzerland."
                        }
                    ]
                }'''

    user_prompt = template.format(sentences=sentences, example=example)
    user_message = {'role': 'user', 'content': user_prompt}

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[user_message],
        temperature=0,
    )

    first_choice = response.choices[0]
    return first_choice.message.content
    

def extraction_log(response: str, filename: str='./data/logs/extraction_log_4.txt'):
    '''
        Append `response` followed by ',' and a newline to `filename`.
    The parent directory of `filename` is created when it does not exist yet, so the first
    write does not fail.
    '''
    folder = os.path.dirname(filename)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(filename, mode='a') as f:
        f.write(response + ',\n')


def generate_names_from_natural_sentence(filename: str, per_ask: int=10) -> list:
    '''
        Read one sentence per line from `filename`, send the sentences to the model in batches
    of `per_ask` and write every response to the extraction log.

    Blank lines are ignored. Responses are collected separately from the input sentences, so a
    response is never sent back to the model as if it were a sentence, and the last (possibly
    shorter) batch is handled by the same loop even when the file has fewer than `per_ask` lines.

    Returns a list holding the input sentences followed by the raw responses.
    Raises ValueError when `per_ask` is smaller than 1.
    '''
    if per_ask < 1:
        raise ValueError('per_ask must be at least 1.')

    with open(filename, mode='r') as f:
        sentences = [line.strip() for line in f]
    sentences = [sentence for sentence in sentences if sentence]

    responses = []
    for start in range(0, len(sentences), per_ask):
        sentences_to_send = sentences[start : start + per_ask]
        response = extract_entities_openai(sentences=sentences_to_send)
        print(response)
        extraction_log(response=response)

        responses.append(response)

    return sentences + responses

## Generating data

### Generating from the names

First we generate the 'log.txt' file, which will contain the generated sentences. We save all responses to the file because of problems with the ChatGPT API.

In [None]:
# 40 requests to the model, each asking for sentences for 5 randomly drawn sets of mountain names.
synthetic = generating_loop(iterations=40, sentence_num=5, names_dataset=data)

Then we extract all generated data and tokenize it. Then we tag all sentences and construct a dataframe from it.

In [9]:
# Rebuild the synthetic dataset from the logged responses (replaces the frame returned by generating_loop).
synthetic = generate_tags('./data/logs/log.txt')

We save our final data to the '.csv' file.

In [10]:
# NOTE(review): merge_datasets() is later pointed at './data/data_with_tags/synthetic/', but this file is
# written one level above that folder — confirm whether it is meant to be moved there before merging.
synthetic.to_csv('./data/data_with_tags/synthetic_data.csv', index=False)

### Extracting names from the natural sentences

In [None]:
natural = generate_names_from_natural_sentence('./data/natural_samples_2.txt', per_ask=1) # Sends requests to the OpenAI API, one sentence per request

In [32]:
# Build the tagged human-written dataset from the extraction log (replaces the list returned above).
natural = generate_tags('./data/logs/extraction_log_4.txt')

In [33]:
# Saved inside the 'natural' folder that merge_datasets() is later called with.
natural.to_csv('./data/data_with_tags/natural/natural_data_4.csv', index=False)

### Merging data from both sources

In [34]:
def _load_labelled_folder(folder: str, source: str):
    '''
        Concatenate every '*.csv' file of `folder` (in sorted path order) and add a 'source'
    column holding `source`. Returns None when the folder contains no CSV file.
    '''
    paths = sorted(glob.glob(os.path.join(folder, '*.csv')))
    if not paths:
        return None

    frame = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)
    frame['source'] = source

    return frame


def merge_datasets(natural_folder: str, synthetic_folder: str):
    '''
        Merge the tagged CSV files of both folders into one DataFrame.
    Rows read from `natural_folder` get source 'HUMAN', rows read from `synthetic_folder` get
    source 'GPT'; the natural rows come first. Files are read in sorted path order so the row
    order is reproducible. A folder without CSV files contributes no rows instead of raising;
    when both folders are empty an empty frame with only the 'source' column is returned.
    '''
    natural = _load_labelled_folder(natural_folder, 'HUMAN')
    synthetic = _load_labelled_folder(synthetic_folder, 'GPT')

    available = [frame for frame in (natural, synthetic) if frame is not None]
    if not available:
        return pd.DataFrame(columns=['source'])

    return pd.concat(available, ignore_index=True)

In [35]:
# Combine the human-written and GPT-generated CSV files into one frame with a 'source' column.
result = merge_datasets(natural_folder='./data/data_with_tags/natural/', synthetic_folder='./data/data_with_tags/synthetic/')

In [36]:
# Write the merged dataset next to the per-source folders.
result.to_csv('./data/data_with_tags/fine-tune-test-data-test-2.csv', index=False)