In [34]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Utilities to simplify the canonical NQ data.

The canonical NQ data contains the HTML of each Wikipedia page along with a
sequence of tokens on that page, each of which is indexed into the HTML.

Many users will not want to use the HTML at all, and this file provides
utilities to extract only the text into a new record of the form:

  {
    "example_id": 3902,
    "document_url": "http://wikipedia.org/en/strings",
    "question_text": "what is a string",
    "document_text": "<P> A string is a list of characters in order . </P>",
    "annotations": [{
      "long_answer": { "start_token": 0, "end_token": 12 },
      "short_answers": [{ "start_token": 5, "end_token": 8 }],
      "yes_no_answer": "NONE",
    }],
    "long_answer_candidates": [
      {"start_token": 0, "end_token": 12, "top_level": True}
    ]
  }

which leads to a much smaller training set (4.4Gb instead of 41Gb).

In this representation, the [start, end) indices are into the blank separated
sequence of tokens. So, answer spans can be extracted using the following
snippet:

  " ".join(example["document_text"].split(" ")[`start_token`:`end_token`]).

WARNING: Use `split(" ")` instead of `split()` to avoid complications from
  characters such as `\u180e` which may or may not be recognized as a whitespace
  character depending on your python version.

To avoid complications at test time, we do not provide a simplified version
of the development data, and there is no simplified version of the hidden test
set. If you rely on the simplified data, then you must call the
`simplify_nq_example` function below on every example that is passed in at test
time.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re


def get_nq_tokens(simplified_nq_example):
  """Returns list of blank separated tokens.

  The text is split on the single blank character only (`split(" ")`), never
  on arbitrary whitespace, so the token count matches the one used by the
  [start_token, end_token) annotations.

  Args:
    simplified_nq_example: Dictionary holding a `document_text` string.

  Returns:
    List of token strings.

  Raises:
    ValueError: If the `document_text` field is missing.
  """

  if "document_text" not in simplified_nq_example:
    # Adjacent literals are concatenated verbatim, so the first one has to end
    # with a blank to keep "NQ" and "example" apart in the message.
    raise ValueError("`get_nq_tokens` should be called on a simplified NQ "
                     "example that contains the `document_text` field.")

  return simplified_nq_example["document_text"].split(" ")


def simplify_nq_example(nq_example):
  r"""Returns dictionary with blank separated tokens in `document_text` field.

  Removes byte offsets from annotations, and removes `document_html` and
  `document_tokens` fields. All annotations in the output are represented as
  [start_token, end_token) offsets into the blank separated tokens in the
  `document_text` field.

  The input is not modified: spans and annotations are copied before their
  byte offsets are dropped, so `nq_example` keeps its `start_byte` and
  `end_byte` entries after the call.

  WARNING: Tokens are separated by a single blank character. Do not split on
    arbitrary whitespace since different implementations have different
    treatments of some unicode characters such as \u180e.

  Args:
    nq_example: Dictionary containing original NQ example fields.

  Returns:
    Dictionary containing `document_text` field, not containing
    `document_tokens` or `document_html`, and with all annotations represented
    as [`start_token`, `end_token`) offsets into the space separated sequence.

  Raises:
    ValueError: If splitting `document_text` on blanks does not give exactly
      one token per entry of `document_tokens`.
  """

  def _clean_token(token):
    """Returns token in which blanks are replaced with underscores.

    HTML table cell openers may contain blanks if they span multiple columns.
    There are also a very few unicode characters that are prepended with blanks.

    Args:
      token: Dictionary representation of token in original NQ format.

    Returns:
      String token.
    """
    # A literal blank needs no regular expression.
    return token["token"].replace(" ", "_")

  text = " ".join(_clean_token(t) for t in nq_example["document_tokens"])

  def _remove_html_byte_offsets(span):
    """Returns a copy of `span` without its `start_byte`/`end_byte` keys."""
    return {
        key: value
        for key, value in span.items()
        if key not in ("start_byte", "end_byte")
    }

  def _clean_annotation(annotation):
    """Returns a copy of `annotation` whose answer spans have no byte offsets."""
    # Shallow copy: only the span entries are rebuilt, the rest is reused.
    cleaned = dict(annotation)
    cleaned["long_answer"] = _remove_html_byte_offsets(
        annotation["long_answer"])
    cleaned["short_answers"] = [
        _remove_html_byte_offsets(sa) for sa in annotation["short_answers"]
    ]
    return cleaned

  simplified_nq_example = {
      "question_text": nq_example["question_text"],
      "example_id": nq_example["example_id"],
      "document_url": nq_example["document_url"],
      "document_text": text,
      "long_answer_candidates": [
          _remove_html_byte_offsets(c)
          for c in nq_example["long_answer_candidates"]
      ],
      "annotations": [_clean_annotation(a) for a in nq_example["annotations"]]
  }

  # Sanity check: underscores replaced every blank inside a token, so the
  # blank separated text must split back into the original number of tokens.
  if len(get_nq_tokens(simplified_nq_example)) != len(
      nq_example["document_tokens"]):
    raise ValueError("Incorrect number of tokens.")

  return simplified_nq_example

In [85]:
from datasets import load_dataset
from pathlib import Path
from pprint import pprint
from datasets import load_dataset
import pandas as pd
import os
from dotenv import load_dotenv
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas
import openai
from openai import APIError
import os
import json
import re
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint
from pathlib import Path
import tiktoken


# Load the .env file so the variables it defines become visible through
# os.getenv below.
load_dotenv()

# Access the variables.
# os.getenv returns None when OPENAI_API_KEY is not set, in which case
# openai.api_key is assigned None as well.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY



In [4]:


# Cache directory for the dataset files ("~" is expanded to the home directory).
# NOTE(review): the name `dir` shadows the Python builtin `dir()` for the rest
# of the notebook session.
dir = Path("~/Datasets/SRBendding").expanduser()
# Load the "dev" configuration of Natural Questions, caching files under `dir`.
ds = load_dataset("google-research-datasets/natural_questions", "dev", cache_dir=dir)

Downloading readme:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/197M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/191M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/191M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/7830 [00:00<?, ? examples/s]

In [5]:
# Display the loaded dataset object (splits, feature names, row counts).
ds

DatasetDict({
    validation: Dataset({
        features: ['id', 'document', 'question', 'long_answer_candidates', 'annotations'],
        num_rows: 7830
    })
})

In [14]:
# Keep a handle on the validation split; the later cells iterate over `vali`.
vali = ds['validation']
vali

Dataset({
    features: ['id', 'document', 'question', 'long_answer_candidates', 'annotations'],
    num_rows: 7830
})

In [65]:
# vali[0]['document']['tokens']['start_byte']

In [66]:
# vali[0]['long_answer_candidates']["start_byte"]

In [70]:
# Inspect the long-answer annotations of the first validation record.
# NOTE(review): entries whose fields are all -1 presumably mean the annotator
# gave no long answer — confirm against the dataset card.
vali[0]['annotations']['long_answer']

[{'start_token': 932,
  'end_token': 1016,
  'start_byte': 87119,
  'end_byte': 88080,
  'candidate_index': 66},
 {'start_token': -1,
  'end_token': -1,
  'start_byte': -1,
  'end_byte': -1,
  'candidate_index': -1},
 {'start_token': -1,
  'end_token': -1,
  'start_byte': -1,
  'end_byte': -1,
  'candidate_index': -1},
 {'start_token': 3301,
  'end_token': 3669,
  'start_byte': 118430,
  'end_byte': 121038,
  'candidate_index': 80},
 {'start_token': -1,
  'end_token': -1,
  'start_byte': -1,
  'end_byte': -1,
  'candidate_index': -1}]

In [67]:
# simplify_nq_example(vali[0])

In [76]:
# Build one {query_id, query, passage_text} record per validation example.
# The passage is the plain (non-HTML) text of the first long answer found in
# the annotations; when no annotation has one the passage is the empty string.
result = []
for i in range(len(vali)):
    info = vali[i]
    # Named `query_id` (not `id`) so the builtin `id()` is not shadowed.
    query_id = info['id']

    # Byte span of the first annotation whose start_byte is not -1.
    start_byte = -1
    end_byte = -1
    for long_answer in info['annotations']['long_answer']:
        if long_answer["start_byte"] != -1:
            start_byte = long_answer["start_byte"]
            end_byte = long_answer["end_byte"]
            break

    # `tokens` holds parallel lists, one entry per document token.
    token_element = info['document']['tokens']
    # `end_byte` is treated as exclusive: a token that starts exactly at the
    # end offset lies behind the answer span and must not be included.
    words = [
        token.strip()
        for token, is_html, token_start in zip(
            token_element['token'],
            token_element['is_html'],
            token_element['start_byte'],
        )
        if not is_html and start_byte <= token_start < end_byte
    ]
    context = " ".join(words).strip()

    question = info['question']['text']
    current = {
        "query_id": query_id,
        "query": question,
        "passage_text": context,
    }
    result.append(current)

pprint(len(result))



7830


In [80]:
# Pretty-print the first five records as a spot check.
pprint(result[:5])

[{'passage_text': 'Through the work of Max Planck , Albert Einstein , Louis de '
                  'Broglie , Arthur Compton , Niels Bohr and many others , '
                  'current scientific theory holds that all particles also '
                  'have a wave nature ( and vice versa ) . This phenomenon has '
                  'been verified not only for elementary particles , but also '
                  'for compound particles like atoms and even molecules . For '
                  'macroscopic particles , because of their extremely short '
                  'wavelengths , wave properties usually can not be detected .',
  'query': 'who proposed that electrons behave like waves and particles',
  'query_id': '-5501481664893105662'},
 {'passage_text': 'The United States Senate consists of 100 members , two from '
                  'each of the 50 states . Below is a list of the current U.S. '
                  'Senators , sitting in the 115th United States Congress .',
  'query': '

In [81]:
def load_data_natural(dataset_name: str = "google-research-datasets/natural_questions",
                      max_records: "int | None" = 10):
    """Load the Natural Questions dev split and turn it into query/passage records.

    For each processed example the passage is the plain text of the first
    annotated long answer (see `get_start_and_end_byte` and `make_context`).

    Args:
        dataset_name: Hugging Face dataset identifier passed to `load_dataset`.
        max_records: Maximum number of examples to convert, counted from the
            start of the validation split. The default of 10 matches what the
            former hard-coded `len(validation_dataset) - 7820` produced for the
            7830-row dev split. Pass `None` to convert every example.

    Returns:
        List of dicts with the keys `query_id`, `query` and `passage_text`,
        where `passage_text` is a one-element list holding the passage string.
    """
    # Local names avoid shadowing the builtins `dir` and `id`.
    cache_dir = Path("~/Datasets/SRBendding").expanduser()
    data = load_dataset(dataset_name, "dev", cache_dir=cache_dir)
    validation_dataset = data['validation']

    total = len(validation_dataset)
    if max_records is None:
        n_records = total
    else:
        # Clamp so a limit above the split size (or below zero) is harmless.
        n_records = max(0, min(max_records, total))

    result = []
    for i in range(n_records):
        record = validation_dataset[i]
        start_byte, end_byte = get_start_and_end_byte(record)
        context = make_context(record, start_byte, end_byte)
        result.append({
            "query_id": record['id'],
            "query": record['question']['text'],
            "passage_text": [context],
        })

    return result

def make_context(info, start_byte, end_byte):
    """Return the plain text of the document tokens that fall inside a byte span.

    Args:
        info: Record whose `info['document']['tokens']` holds the parallel
            lists `token`, `is_html` and `start_byte` (one entry per token).
        start_byte: First byte offset of the span (inclusive).
        end_byte: Byte offset at which the span ends (exclusive). A token that
            starts exactly at this offset is not part of the span.

    Returns:
        The non-HTML tokens whose `start_byte` lies in the span, each stripped
        of surrounding whitespace and joined with single blanks. Empty string
        when no token qualifies (e.g. for the `(-1, -1)` "no answer" span).
    """
    token_element = info['document']['tokens']
    words = [
        token.strip()
        for token, is_html, token_start in zip(
            token_element['token'],
            token_element['is_html'],
            token_element['start_byte'],
        )
        if not is_html and start_byte <= token_start < end_byte
    ]
    # The final strip drops blanks left at the edges by tokens that were
    # whitespace-only before stripping.
    return " ".join(words).strip()

def get_start_and_end_byte(info):
    """Return the byte span of the first long answer found in the annotations.

    Walks `info['annotations']['long_answer']` in order and stops at the first
    entry whose `start_byte` differs from -1.

    Returns:
        Tuple `(start_byte, end_byte)` of that entry, or `(-1, -1)` when every
        entry has `start_byte == -1` or there are no entries at all.
    """
    for answer in info['annotations']['long_answer']:
        first_byte = answer["start_byte"]
        if first_byte != -1:
            return first_byte, answer["end_byte"]
    return -1, -1

In [82]:
# Run the function-based pipeline with its default dataset name.
res = load_data_natural()


Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

In [84]:
# Display the records returned by load_data_natural().
res

[{'query_id': '-5501481664893105662',
  'query': 'who proposed that electrons behave like waves and particles',
  'passage_text': ['Through the work of Max Planck , Albert Einstein , Louis de Broglie , Arthur Compton , Niels Bohr and many others , current scientific theory holds that all particles also have a wave nature ( and vice versa ) . This phenomenon has been verified not only for elementary particles , but also for compound particles like atoms and even molecules . For macroscopic particles , because of their extremely short wavelengths , wave properties usually can not be detected .']},
 {'query_id': '8594030243394572667',
  'query': 'how many senators are there in the us senate',
  'passage_text': ['The United States Senate consists of 100 members , two from each of the 50 states . Below is a list of the current U.S. Senators , sitting in the 115th United States Congress .']},
 {'query_id': '-716176363900512091',
  'query': 'phase change from gas to solid is called',
  'passa

In [78]:
# Count (and print) the passages shorter than 10 characters, which includes
# the empty passages of examples for which no long answer was found.
count = 0
# The loop variable is `record`, not `res`, so it does not overwrite the
# notebook-level `res` that holds the output of load_data_natural().
for record in result:
    if len(record['passage_text']) < 10:
        count += 1
        print(record['passage_text'])
print(count)











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [87]:
# Read the MS MARCO parquet file with pyarrow and convert it to a pandas
# DataFrame for inspection.
loaded_table = pq.read_table('datasets/msmarco.parquet')
df = loaded_table.to_pandas()

In [113]:
# Look at the `id` column of the loaded frame.
df['id']

0    7
1    8
2    0
3    4
4    3
5    6
6    2
7    5
8    1
9    9
Name: id, dtype: int64

In [114]:
# Read the Natural Questions parquet file into a pandas DataFrame.
# Note: this rebinds `loaded_table`, which previously held the MS MARCO table.
loaded_table = pq.read_table('datasets/naquestions.parquet')
df_natural_language = loaded_table.to_pandas()

In [115]:
# Display the Natural Questions frame loaded from parquet.
df_natural_language

Unnamed: 0,id,query,passage_text
0,8594030243394572667,koliko senatora ima u Senatu SAD-a,[Senat Sjedinjenih Američkih Država se sastoji...
1,-716176363900512091,Kako se naziva promena faze iz gasa u čvrsto s...,"[Depozicija je termodinamički proces, faza pre..."
2,-5501481664893105662,ko je predložio da se elektroni ponašaju kao t...,"[Kroz rad Maks Planka, Alberta Einštajna, Luj ..."
3,-5593761806871287545,koja je najduža engleska reč u rečniku,[Najduža reč u bilo kom od glavnih engleskih j...
4,2074579308847994444,koliki procenat Zemljine površine je voda,[Voda je raspoređena širom Zemlje. Većina vode...
5,-774521643378360615,koja je definicija pH vrednosti u vodi,"[U hemiji, pH ( / piː ( h ) iː / ) (potencijal..."
6,-4418682909598136345,Odakle potiče supa od gnezda ptica?,[Jesti gnezda ptica su gnezda ptica koja stvar...


In [91]:
# Check which Python type a `passage_text` cell has after loading from parquet.
type(df['passage_text'][0])

numpy.ndarray

## Testing GPT Response

In [95]:
# JSONL file with the saved request/response pairs of the translation test run.
response_path = Path("./translation_pipeline_test/naquestions_test_results.jsonl")
# Fail right here with a readable message if the file is missing.
assert response_path.exists(), f"File not found: {response_path}"

In [109]:
# Parse every saved exchange: each JSONL line is a list whose first item is
# the request and whose second item is the chat-completion response. The
# assistant message content is itself expected to be a JSON document.
responses = []
# (line index, raw content) of every response that is not valid JSON.
failed_responses = []
# Raw content of the last failing response. Bound up front so the name exists
# (as None) even when every response parses, and is reset on each re-run.
error_data = None
with open(response_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        # Tolerate blank lines such as a trailing newline at the end of file.
        if not line.strip():
            continue
        data = json.loads(line)
        request = data[0]
        response = data[1]['choices'][0]['message']['content']
        try:
            response_json = json.loads(response)
        except json.JSONDecodeError as e:
            error_data = response
            failed_responses.append((i, response))
            print(f"Index: {i}, Error: {e}")
        else:
            responses.append(response_json)

Index: 7, Error: Expecting ',' delimiter: line 5 column 1461 (char 1583)


In [111]:
# Show the raw content of the last response that failed to parse as JSON.
print(error_data)

{
    "translations": [
        {
            "query": "ko ima najviše nagrada za najkorisnijeg igrača All-Star utakmica",
            "passage_text": ["Bob Pettit i Kobe Bryant su jedini igrači koji su osvojili nagradu za najkorisnijeg igrača All-Star utakmica četiri puta. Oscar Robertson, Majkl Džordan, Šekil O'Nil i Lebron Džejms su svaki osvojili nagradu tri puta, dok su Bob Cousy, Džulijus Erving, Azejja Tomas, Međik Džonson, Karl Meloun, Alen Ajverson i Rasel Vestbruk svi osvojili nagradu dva puta. Džejmsova prva nagrada za najkorisnijeg igrača All-Star utakmica 2006. godine učinila ga je najmlađim ikada koji je osvojio nagradu sa 21 godinom i 1 mesecom. Kajri Irving, pobednik nagrade za najkorisnijeg igrača All-Star utakmica 2014. godine, drugi je najmlađi sa 21 godinom i 10 meseci. Oni su značajni jer su najmlađi koji su osvojili nagradu, obojica kao igrači Klivlenda. Četiri puta su postojali zajednički pobednici - Elgin Bejlor i Pettit 1959. godine, Džon Stokton i Meloun 1993.

In [108]:
[{"model": "gpt-3.5-turbo-0125", "response_format": {"type": "json_object"}, "temperature": 0, "messages": [{"role": "system", "content": "\n***TRANSLATION FROM ENGLISH TO SERBIAN***\n\n**GOALS**\n\nYou are a professional translator fluent in English and Serbian. \nYour primary goal is to produce a high-quality, natural-sounding translation from English to Serbian. \nYou are translating texts and questions pertaining to the texts. The translation is intended for dataset creation. Look at the example below:\n\n***TRANSLATION EXAMPLE***\n***ENGLISH***\nquery: 'What is a unicorn?'\npassage_text: 'The unicorn is a legendary creature that has been described since antiquity as a beast with a single large, pointed, spiraling horn projecting from its forehead.'\n\n***SERBIAN TRANSLATION***\nquery: 'Šta je jednorog?'\npassage_text: 'Jednorog je mitsko stvorenje koje se od davnina opisuje kao zver sa jednim velikim, šiljastim, spiralnim rogom koji mu viri iz čela.'\n***END OF TRANSLATION EXAMPLE***\n\nTo translate, follow the steps below:\n   **TRANSLATION INSTRUCTIONS**\n   1. Read and understand the sentence in English.\n   2. When you understand the English sentence, start to translate.\n   3. Pay close attention to both left and right context when you are making translation decisions.  4. Convey the original context, tone and meaning in the Serbian translation.\n   4. Avoid literal translations and ensure the output reads naturally in Serbian.\n   5. The translation must be contextually accurate, fluent, and adhere to the grammatical rules and lexicon of the Serbian language.\n   6. The declination of nouns, adjectives, and pronouns must be correct.\n   7. Make sure to proofread the translated text in Serbian and revise any mistakes. If no revisions are needed, provide the translations as they are.\n\n   **FORMATTING INSTRUCTIONS**\n   1. Strings should be enclosed within single quotation marks ('').\n   2. 
Use double quotes for strings and escape internal quotes with a backslashes (\\).\n \n   **OUTPUT FORMATTING**\n   - Ensure the output is a valid JSON file, parsable by Python's json.loads().\n   - Ensure consistent JSON formatting as illustrated in the example below:\n\n      **EXAMPLE**\n      {\"translations\" : [{\"query\" : 'This is a query',\n                         \"passage_text\" : ['This is one passage. With another sentence.',\n                                          'This is yet another passage. With yet another sentence.']\n                        }\n                        ]\n      }\n       **END OF EXAMPLE**\n\n   - Strictly follow the structure provided in the example when generating the output.   \n   - Make sure to translate text under both \"query\" and \"passage_text\" keys.\n\n"}, {"role": "user", "content": "{\"query_id\": \"4245798066923223457\", \"query\": \"who has the most all star mvp awards\", \"passage_text\": [\"Bob Pettit and Kobe Bryant are the only two players to win the All - Star Game MVP four times . Oscar Robertson , Michael Jordan , Shaquille O'Neal , and LeBron James have each won the award three times , while Bob Cousy , Julius Erving , Isiah Thomas , Magic Johnson , Karl Malone , Allen Iverson , and Russell Westbrook have all won the award twice . James ' first All - Star MVP in 2006 made him the youngest to have ever won the award at the age of 21 years , 1 month . Kyrie Irving , winner of the 2014 All - Star Game MVP , is the second - youngest at 21 years , 10 months . They are notable as being the two youngest to win the award , both as Cleveland Cavaliers . Four of the games had joint winners -- Elgin Baylor and Pettit in 1959 , John Stockton and Malone in 1993 , O'Neal and Tim Duncan in 2000 , and O'Neal and Bryant in 2009 . O'Neal became the first player in All - Star history to share two MVP awards as well as the first player to win the award with multiple teams . 
The Los Angeles Lakers have had eleven winners while the Boston Celtics have had eight . Duncan of the U.S. Virgin Islands and Irving of Australia are the only winners not born in the United States . Both Duncan and Irving are American citizens , but are considered `` international '' players by the NBA because they were not born in one of the fifty states or Washington , D.C. No player trained entirely outside the U.S. has won the award ; Irving lived in the U.S. since age two , and Duncan played U.S. college basketball at Wake Forest .\"]}"}]}, {"id": "chatcmpl-9yelcepUFNOTyrdTQR677DyHkQ49b", "object": "chat.completion", "created": 1724243416, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n    \"translations\": [\n        {\n            \"query\": \"ko ima najviše nagrada za najkorisnijeg igrača All-Star utakmica\",\n            \"passage_text\": [\"Bob Pettit i Kobe Bryant su jedini igrači koji su osvojili nagradu za najkorisnijeg igrača All-Star utakmica četiri puta. Oscar Robertson, Majkl Džordan, Šekil O'Nil i Lebron Džejms su svaki osvojili nagradu tri puta, dok su Bob Cousy, Džulijus Erving, Azejja Tomas, Međik Džonson, Karl Meloun, Alen Ajverson i Rasel Vestbruk svi osvojili nagradu dva puta. Džejmsova prva nagrada za najkorisnijeg igrača All-Star utakmica 2006. godine učinila ga je najmlađim ikada koji je osvojio nagradu sa 21 godinom i 1 mesecom. Kajri Irving, pobednik nagrade za najkorisnijeg igrača All-Star utakmica 2014. godine, drugi je najmlađi sa 21 godinom i 10 meseci. Oni su značajni jer su najmlađi koji su osvojili nagradu, obojica kao igrači Klivlenda. Četiri puta su postojali zajednički pobednici - Elgin Bejlor i Pettit 1959. godine, Džon Stokton i Meloun 1993. godine, O'Nil i Tim Dankan 2000. godine, i O'Nil i Brajant 2009. godine. O'Nil je postao prvi igrač u istoriji All-Star utakmica koji je podelio dve MVP nagrade, kao i prvi igrač koji je osvojio nagradu sa više timova. 
Los Anđeles Lejkersi imaju jedanaest pobednika dok Boston Seltiksi imaju osam. Dankan sa Devičanskih Ostrva i Irving iz Australije su jedini pobednici koji nisu rođeni u Sjedinjenim Američkim Državama. I Dankan i Irving su američki državljani, ali ih NBA smatra \"                                                                                                    ", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 967, "completion_tokens": 667, "total_tokens": 1634}, "system_fingerprint": null}, {"id": "4245798066923223457"}]


NameError: name 'null' is not defined