In [14]:
import json
import os
import numpy as np
import openai
import tiktoken
import pandas as pd

from dotenv import load_dotenv
from typing import Any, Dict, List, Tuple
from datasets import load_dataset

### Wiki

In [15]:
def get_data(path):
    dataset = pd.read_parquet(path)
    text = dataset['context'].values.tolist()
    short = dataset['short_query'].values.tolist()
    medium = dataset['medium_query'].values.tolist()
    long = dataset['long_query'].values.tolist()

    print(text[0])
    print(len(text))
    print(short[0])
    print(len(short))
    print(medium[0])
    print(len(medium))
    print(long[0])
    print(len(long))


    return text, short, medium, long

wiki_path = '../datasets/datasets/wiki_fixed.parquet'
wiki_text, wiki_short, wiki_medium, wiki_long = get_data(wiki_path)

UTF-8 UTF-8 varijanta je najzgodnija za kodiranje većinski latiničnog teksta Dato je i kratko uputstvo za korišćenje te varijante u Microsoft Word-u, Netscape Composer-u i tekstualnom editoru Kate U tekstu su takođe preporučeni standardni Unicode fontovi koji omogućavaju laku prenosivost teksta sa računara na računar ili za objavljivanje teksta na Internet Prvi računari su bili pravljeni pretežno za englesko govorno područje i imali su podršku samo za engleski alfabet, za brojeve, zagrade i još po neki kontrolni karakter, što je činilo ukupno 128 mogućih slova (u 7 bita)
7444
Šta omogućava UTF-8 varijanta?
7444
Kako se prenosi tekst sa računara na računar koristeći standardne Unicode fontove?
7444
Kako je podrška za engleski alfabet evoluirala sa prvim računarima i šta je činilo ukupno 128 mogućih slova?
7444


In [16]:
def tokenize_and_count_length(texts, model="text-embedding-3-small"):
    tokenized_texts = []
    token_counts = []
    tokenizer = tiktoken.encoding_for_model(model)
    
    for text in texts:
        tokenized_text = tokenizer.encode(text)
        tokenized_texts.append(tokenized_text)

    for i, text in enumerate(tokenized_texts):
        token_count = len(text)      
        if token_count < 4:
            print(texts[i]) 
        token_counts.append(token_count)
    
    filtered_counts = [count for count in token_counts if count > 0]

    mean = np.mean(token_counts)
    std = np.std(token_counts)
    maximum = np.max(token_counts)

    if len(filtered_counts) >= 2:
        sorted_counts = sorted(set(filtered_counts))
        minimum = sorted_counts[0]
        second_minimum = sorted_counts[1]
    elif len(filtered_counts) == 1:
        minimum = second_minimum = filtered_counts[0]  # Only one value, so both min and second min are the same
    else:
        minimum = second_minimum = 0

    return tokenized_texts, token_counts, mean, std, maximum, minimum, second_minimum

In [17]:
token_wiki_text, count_wiki_text, mean_wiki_text, std_wiki_text, max_wiki_text, min_wiki_text, second_min_wiki_text = tokenize_and_count_length(wiki_text)

print(token_wiki_text[:100])
print(len(token_wiki_text))
print(count_wiki_text)
print(f"Mean: {mean_wiki_text}")
print(f"STD: {std_wiki_text}")
print(f"Maximal text length in tokens: {max_wiki_text}")
print(f"Minimal text length in tokens: {min_wiki_text}")
print(f"Second smallest length: {second_min_wiki_text} ")

[[8729, 12, 23, 20677, 12, 23, 767, 3251, 8424, 4864, 30274, 89, 40811, 77, 29230, 15036, 66632, 55790, 3841, 5320, 7886, 53977, 6987, 6729, 13453, 77, 540, 43185, 21127, 423, 4428, 4864, 602, 92114, 9509, 709, 332, 267, 3415, 15036, 597, 13915, 11906, 7886, 268, 3841, 1028, 767, 3251, 5048, 577, 5210, 9506, 46481, 11, 59487, 5443, 79871, 46481, 602, 73678, 940, 17101, 6576, 84, 30301, 549, 43185, 61782, 924, 98133, 52096, 68, 864, 4406, 84, 13453, 34229, 5410, 7907, 36997, 3381, 46188, 60471, 8019, 540, 84, 7886, 402, 52513, 326, 24468, 864, 37491, 344, 537, 43185, 21127, 829, 15473, 13453, 359, 5169, 4415, 15473, 13453, 86092, 60409, 15036, 2909, 402, 53835, 28422, 3841, 43185, 21127, 4415, 8191, 2394, 10176, 15473, 13453, 359, 2850, 924, 293, 4008, 550, 68411, 24041, 72, 864, 668, 12453, 2201, 15036, 2995, 645, 9509, 48725, 11368, 7661, 2739, 13453, 3841, 602, 737, 8115, 924, 43015, 11906, 12407, 83478, 15036, 2995, 645, 6780, 85766, 10448, 11, 15036, 2967, 3841, 588, 11, 90119, 501

In [18]:
token_wiki_short, count_wiki_short, mean_wiki_short, std_wiki_short, max_wiki_short, min_wiki_short, second_min_wiki_short = tokenize_and_count_length(wiki_short)

print(token_wiki_short[:100])
print(len(token_wiki_short))
print(count_wiki_short)
print(f"Mean: {mean_wiki_short}")
print(f"STD: {std_wiki_short}")
print(f"Maximal text length in tokens: {max_wiki_short}")
print(f"Minimal text length in tokens: {min_wiki_short}")
print(f"Second smallest length: {second_min_wiki_short} ")

[[129, 254, 2629, 8019, 540, 84, 7886, 2979, 20677, 12, 23, 767, 3251, 8424, 30], [129, 254, 2629, 4864, 40416, 5410, 30], [42, 78, 3841, 924, 66632, 818, 610, 2194, 15036, 220, 7886, 404, 321, 292, 84, 602, 6987, 6729, 13453, 3458, 281, 39252, 30], [129, 254, 2629, 4864, 80346, 12, 17, 30], [42, 78, 3841, 2559, 42441, 43015, 12453, 2979, 15219, 24078, 21372, 30], [129, 254, 2629, 4864, 2709, 39142, 3458, 24788, 747, 4657, 22924, 2727, 52096, 84, 20677, 12, 22, 602, 20677, 12, 23, 30], [129, 254, 2629, 4864, 6747, 35973, 3645, 36997, 5410, 64, 30], [129, 254, 2629, 924, 757, 87911, 13453, 6780, 3260, 4021, 72, 15473, 13453, 359, 2850, 30], [42, 337, 24551, 4602, 32376, 924, 16947, 89, 372, 404, 8115, 4113, 7907, 15473, 13453, 359, 2850, 30], [129, 254, 2629, 924, 577, 33050, 52096, 34229, 15473, 13453, 359, 2850, 30], [42, 29886, 513, 312, 11906, 402, 52513, 71748, 9491, 3575, 72, 4415, 15473, 13453, 86092, 7675, 30], [129, 254, 2629, 924, 15473, 13453, 359, 2850, 864, 1764, 43185, 617

In [19]:
token_wiki_medium, count_wiki_medium, mean_wiki_medium, std_wiki_medium, max_wiki_medium, min_wiki_medium, second_min_wiki_medium = tokenize_and_count_length(wiki_medium)

print(token_wiki_medium[:100])
print(len(token_wiki_medium))
print(count_wiki_medium)
print(f"Mean: {mean_wiki_medium}")
print(f"STD: {std_wiki_medium}")
print(f"Maximal text length in tokens: {max_wiki_medium}")
print(f"Minimal text length in tokens: {min_wiki_medium}")
print(f"Second smallest length: {second_min_wiki_medium} ")

[[42, 29886, 513, 864, 77, 31824, 73678, 829, 15473, 13453, 359, 5169, 4415, 15473, 13453, 86092, 33054, 17194, 96881, 5410, 818, 36997, 3381, 1009, 30], [42, 29886, 924, 66632, 818, 610, 2194, 43015, 12453, 402, 1604, 3355, 58067, 4864, 89, 11755, 4415, 4443, 57851, 15473, 13453, 86092, 7675, 30], [42, 78, 3841, 924, 2709, 39142, 818, 66632, 818, 610, 2194, 15036, 6987, 6729, 13453, 3458, 602, 220, 7886, 404, 4008, 13453, 3458, 281, 39252, 577, 10641, 897, 72, 602, 97599, 2560, 2727, 27262, 84, 22924, 1394, 85, 9491, 30], [42, 29886, 513, 312, 11906, 2979, 3575, 54266, 74, 582, 32376, 36321, 6347, 15036, 36997, 4247, 36834, 4415, 1812, 3251, 84309, 60471, 513, 33054, 62617, 30], [42, 78, 3841, 924, 2322, 536, 66632, 55790, 5697, 597, 13915, 11906, 7886, 1994, 15036, 864, 37491, 3355, 11906, 3141, 275, 87165, 312, 13453, 72, 864, 9509, 8191, 8232, 7561, 30], [42, 29886, 513, 91855, 72, 1167, 14041, 73401, 577, 20677, 12, 22, 602, 20677, 12, 23, 5410, 7675, 36997, 7561, 602, 97599, 924,

In [20]:
token_wiki_long, count_wiki_long, mean_wiki_long, std_wiki_long, max_wiki_long, min_wiki_long, second_min_wiki_long = tokenize_and_count_length(wiki_long)

print(token_wiki_long[:100])
print(len(token_wiki_long))
print(count_wiki_long)
print(f"Mean: {mean_wiki_long}")
print(f"STD: {std_wiki_long}")
print(f"Maximal text length in tokens: {max_wiki_long}")
print(f"Minimal text length in tokens: {min_wiki_long}")
print(f"Second smallest length: {second_min_wiki_long} ")

[[42, 29886, 4864, 43015, 11906, 4657, 15036, 2995, 645, 6780, 85766, 10448, 3721, 337, 21581, 6181, 829, 550, 42294, 15473, 13453, 86092, 7675, 602, 37524, 2629, 4864, 33902, 258, 18536, 15012, 455, 2201, 220, 4386, 27262, 84, 7886, 7141, 18067, 6723, 30], [42, 29886, 4864, 463, 11906, 556, 7907, 40416, 5410, 220, 5162, 23, 13, 10087, 483, 8019, 540, 84, 7886, 822, 32189, 266, 818, 91855, 68, 602, 43015, 11906, 12407, 15036, 3355, 58067, 4864, 89, 11755, 4415, 4443, 57851, 15473, 13453, 86092, 7675, 30], [42, 78, 3841, 924, 1424, 13215, 13453, 818, 66632, 818, 610, 2194, 2477, 332, 5632, 12, 6549, 16, 11, 5632, 12, 6549, 15, 11, 38612, 40, 23, 11151, 602, 22705, 12, 19445, 24, 12, 20, 15036, 6987, 6729, 13453, 3458, 602, 220, 7886, 404, 4008, 13453, 3458, 281, 39252, 11, 602, 91617, 513, 757, 52096, 355, 677, 2201, 374, 10784, 8783, 13453, 73401, 577, 87061, 7675, 30], [42, 78, 3841, 924, 4255, 74952, 602, 53460, 597, 13915, 11906, 7886, 268, 5697, 80346, 12, 17, 11, 20677, 12, 845, 6

### News

In [21]:
news_path = "../datasets/datasets/news.parquet"
news_text, news_short, news_medium, news_long = get_data(news_path)

Mečka na usijanom limenom krovu Piše: Dušan Vidaković Kada je austrijski konzul u Nici Emil Jelinek 1901 naručio od nemačkog Dajmlera, odjednom, čak 36 automobila, čija je ukupna cena premašila fantastičnih 550 hiljada zlatnih maraka, imao je samo jedan uslov: da sva ta kola nose ime njegove ćerke Ljudi iz kanstatske firme, naravno, nisu imali ništa protiv, a istorija automobilizma je imala sreću da je Jelinekovoj mezimici kum dao (pogotovo za Špance) vrlo graciozno ime - Mercedes, što na jeziku konkvistadora znaci ljupkost, umilnost, prijatnost
1750
Ko je naručio 36 automobila 1901. godine?
1750
Ko je bio austrijski konzul u Nici koji je naručio 36 automobila od Dajmlera 1901. godine?
1750
Ko je bio austrijski konzul u Nici koji je 1901. godine naručio 36 automobila od Dajmlera, čija je ukupna cena premašila 550 hiljada zlatnih maraka, sa uslovom da sva kola nose ime njegove ćerke Mercedes?
1750


In [22]:
token_news_text, count_news_text, mean_news_text, std_news_text, max_news_text, min_news_text, second_min_news_text = tokenize_and_count_length(news_text)

print(token_news_text[:100])
print(len(token_news_text))
print(count_news_text)
print(f"Mean: {mean_news_text}")
print(f"STD: {std_news_text}")
print(f"Maximal text length in tokens: {max_news_text}")
print(f"Minimal text length in tokens: {min_news_text}")
print(f"Second smallest length: {second_min_news_text} ")

[[7979, 13453, 4657, 4415, 603, 3251, 276, 316, 4671, 268, 316, 597, 40494, 84, 21286, 58067, 25, 16062, 11906, 276, 48654, 587, 46188, 7886, 735, 2649, 4864, 100206, 462, 2580, 6780, 16947, 89, 360, 577, 452, 3457, 88824, 622, 4939, 74, 220, 7028, 16, 308, 31244, 13453, 822, 11018, 308, 9355, 13453, 74, 540, 423, 1662, 1029, 2473, 11, 11018, 73, 291, 17101, 11, 33902, 587, 220, 1927, 5113, 677, 10746, 11, 33902, 29230, 4864, 15012, 455, 3458, 79006, 864, 1764, 11906, 10746, 64979, 84195, 87165, 220, 13506, 52427, 73, 2649, 1167, 5641, 87165, 3678, 13637, 11, 737, 3524, 4864, 83478, 18806, 276, 603, 36247, 25, 3067, 274, 6723, 9637, 597, 8083, 19689, 75534, 37185, 797, 1009, 220, 7886, 261, 441, 445, 18631, 72, 22924, 13728, 16658, 441, 35092, 2727, 11, 44669, 402, 2201, 11, 308, 63848, 737, 8115, 13080, 11906, 2629, 1760, 344, 11, 264, 220, 5436, 29230, 5113, 29029, 450, 1764, 4864, 737, 6181, 274, 265, 7886, 84, 3067, 4864, 622, 4939, 52767, 21963, 77885, 318, 3457, 597, 372, 25810, 

In [23]:
token_news_short, count_news_short, mean_news_short, std_news_short, max_news_short, min_news_short, second_min_news_short = tokenize_and_count_length(news_short)

print(token_news_short[:100])
print(len(token_news_short))
print(count_news_short)
print(f"Mean: {mean_news_short}")
print(f"STD: {std_news_short}")
print(f"Maximal text length in tokens: {max_news_short}")
print(f"Minimal text length in tokens: {min_news_short}")
print(f"Second smallest length: {second_min_news_short} ")

[[42, 78, 4864, 308, 31244, 13453, 822, 220, 1927, 5113, 677, 10746, 220, 7028, 16, 13, 10087, 483, 30], [42, 587, 402, 4864, 43839, 85003, 4809, 2041, 23174, 30], [129, 254, 2629, 4864, 21317, 3711, 30], [42, 78, 3841, 3937, 461, 34328, 33054, 62617, 577, 1646, 84, 362, 20839, 521, 30], [42, 337, 24551, 44987, 5169, 4864, 26413, 822, 4809, 26785, 597, 14833, 316, 362, 23857, 42176, 30], [42, 78, 3841, 924, 1153, 839, 560, 308, 288, 265, 56494, 829, 34328, 316, 362, 20839, 521, 30], [42, 78, 4864, 41821, 16023, 577, 281, 16279, 7886, 75455, 540, 757, 87911, 13453, 441, 53460, 30], [42, 78, 4864, 603, 375, 78, 3067, 15036, 18843, 52096, 68, 3285, 77, 540, 86088, 7304, 30], [42, 78, 4864, 577, 10784, 8783, 13453, 268, 577, 17774, 66, 64274, 1536, 77, 1009, 281, 2017, 268, 5697, 577, 82986, 598, 12407, 27276, 32557, 30], [42, 78, 3841, 91855, 380, 3043, 93793, 362, 597, 14833, 64, 30], [129, 254, 2629, 4864, 48491, 715, 1168, 73, 7404, 30], [129, 254, 2629, 4864, 11412, 12453, 2201, 15036

In [24]:
token_news_medium, count_news_medium, mean_news_medium, std_news_medium, max_news_medium, min_news_medium, second_min_news_medium = tokenize_and_count_length(news_medium)

print(token_news_medium[:100])
print(len(token_news_medium))
print(count_news_medium)
print(f"Mean: {mean_news_medium}")
print(f"STD: {std_news_medium}")
print(f"Maximal text length in tokens: {max_news_medium}")
print(f"Minimal text length in tokens: {min_news_medium}")
print(f"Second smallest length: {second_min_news_medium} ")

[[42, 78, 4864, 17332, 100206, 462, 2580, 6780, 16947, 89, 360, 577, 452, 3457, 60471, 4864, 308, 31244, 13453, 822, 220, 1927, 5113, 677, 10746, 11018, 423, 1662, 1029, 2473, 220, 7028, 16, 13, 10087, 483, 30], [42, 78, 5697, 4864, 2531, 9895, 297, 1529, 380, 402, 71779, 8348, 10056, 731, 1167, 19610, 451, 656, 12453, 344, 8458, 25616, 49938, 73, 9110, 463, 424, 84, 30], [42, 78, 5697, 11129, 857, 29230, 4864, 463, 450, 899, 64, 5113, 29029, 13453, 68929, 7941, 602, 597, 2649, 4864, 3273, 13453, 8458, 463, 450, 92110, 77, 5697, 30], [42, 78, 3841, 924, 91855, 380, 3043, 3937, 6347, 577, 34328, 869, 21963, 362, 20839, 10426, 602, 48219, 3043, 924, 5213, 17007, 32376, 25568, 540, 1646, 64, 30], [42, 337, 24551, 4864, 44987, 5169, 4809, 26785, 597, 14833, 316, 362, 26413, 822, 9825, 316, 294, 6723, 10087, 72, 11906, 77, 5697, 656, 4749, 23857, 13665, 587, 85, 7141, 42176, 30], [42, 78, 4864, 32262, 404, 3524, 4415, 23634, 13052, 74952, 30274, 39142, 3251, 797, 8930, 2041, 23174, 362, 208

In [25]:
token_news_long, count_news_long, mean_news_long, std_news_long, max_news_long, min_news_long, second_min_news_long = tokenize_and_count_length(news_long)

print(token_news_long[:100])
print(len(token_news_long))
print(count_news_long)
print(f"Mean: {mean_news_long}")
print(f"STD: {std_news_long}")
print(f"Maximal text length in tokens: {max_news_long}")
print(f"Minimal text length in tokens: {min_news_long}")
print(f"Second smallest length: {second_min_news_long} ")

[[42, 78, 4864, 17332, 100206, 462, 2580, 6780, 16947, 89, 360, 577, 452, 3457, 60471, 4864, 220, 7028, 16, 13, 10087, 483, 308, 31244, 13453, 822, 220, 1927, 5113, 677, 10746, 11018, 423, 1662, 1029, 2473, 11, 33902, 29230, 4864, 15012, 455, 3458, 79006, 864, 1764, 11906, 10746, 220, 13506, 52427, 73, 2649, 1167, 5641, 87165, 3678, 13637, 11, 829, 603, 36247, 316, 3067, 274, 6723, 597, 8083, 19689, 75534, 37185, 797, 1009, 220, 7886, 261, 441, 34328, 30], [42, 78, 4864, 17332, 4415, 33902, 79378, 6935, 64, 610, 84, 13453, 57208, 13637, 60471, 4864, 597, 1466, 24012, 577, 357, 959, 86057, 1803, 85003, 18806, 2201, 93471, 265, 1083, 4991, 7141, 85564, 72, 2922, 318, 5308, 2259, 15036, 34328, 30], [42, 587, 588, 91855, 380, 3043, 93793, 13726, 12, 1055, 64, 829, 2092, 1923, 316, 293, 977, 3251, 316, 602, 15593, 73, 318, 5113, 29029, 7675, 4353, 4749, 3067, 1370, 9008, 30], [42, 587, 588, 924, 1424, 13215, 13453, 74952, 3937, 6347, 577, 34328, 869, 21963, 362, 20839, 10426, 602, 91617, 51

### Science

In [26]:
sci_path = "../datasets/datasets/science.parquet"
sci_text, sci_short, sci_medium, sci_long = get_data(sci_path)

Uticaj brzine vazduha na aerosole formirane od 1 % emulzija 112 Uticaj brzine vazduha na aerosole formirane od 6 % emulzija 119 Uticaj brzine vazduha na aerosole formirane od 10 % emulzija 128 Strujanje aerosola vode u cevi Završna razmatranja strujanja aerosola u cevi formiranog emulzije EO3 Moguće strukture aerosol čestice NaCl: a) neporozna čvrsta, b) porozna čvrsta, c) čvrsta sa otvorenim džepovima rastovrenog NaCl, d) čvrsta čaura sa vodenim jezgrom, e) čvrsta NaCl čestica sa jezgrom vodenog rastvora NaCl, f) vodena kapljica NaCl. Linije na slikama predstavljaju adsorbovanu vodu. Atomiziranje centrifugalnom silom kod rotirajućieg atomizer „rotary atomizer“
2095
Kako se formiraju aerosoli od emulzija?
2095
Kako brzina vazduha utiče na formiranje aerosola od različitih emulzija i strukture čestica NaCl?
2095
Kako se vrši atomiziranje centrifugalnom silom kod rotirajućeg atomizera i kakve strukture aerosol čestica NaCl su moguće?
2095


In [27]:
token_sci_text, count_sci_text, mean_sci_text, std_sci_text, max_sci_text, min_sci_text, second_min_sci_text = tokenize_and_count_length(sci_text)

print(token_sci_text[:100])
print(len(token_sci_text))
print(count_sci_text)
print(f"Mean: {mean_sci_text}")
print(f"STD: {std_sci_text}")
print(f"Maximal text length in tokens: {max_sci_text}")
print(f"Minimal text length in tokens: {min_sci_text}")
print(f"Second smallest length: {second_min_sci_text} ")

[[52, 29150, 1662, 1437, 89, 483, 348, 1394, 1072, 4317, 4415, 49825, 1286, 1376, 404, 2194, 11018, 220, 16, 1034, 991, 360, 89, 29230, 220, 7261, 17578, 292, 1662, 1437, 89, 483, 348, 1394, 1072, 4317, 4415, 49825, 1286, 1376, 404, 2194, 11018, 220, 21, 1034, 991, 360, 89, 29230, 220, 9079, 17578, 292, 1662, 1437, 89, 483, 348, 1394, 1072, 4317, 4415, 49825, 1286, 1376, 404, 2194, 11018, 220, 605, 1034, 991, 360, 89, 29230, 220, 4386, 4610, 9832, 86057, 49825, 8083, 348, 536, 577, 3846, 10176, 1901, 70684, 11906, 3458, 24788, 8637, 6713, 5697, 610, 9832, 95051, 49825, 8083, 577, 3846, 10176, 1376, 55790, 540, 991, 360, 89, 32376, 93626, 18, 61021, 84, 56494, 610, 3178, 5081, 49825, 337, 33902, 478, 560, 13106, 5176, 25, 264, 8, 92634, 269, 9700, 3458, 33902, 19456, 21127, 11, 293, 8, 4247, 9700, 3458, 33902, 19456, 21127, 11, 272, 8, 33902, 19456, 21127, 829, 14479, 85, 24568, 318, 294, 12453, 752, 869, 7675, 436, 561, 869, 1466, 540, 13106, 5176, 11, 294, 8, 33902, 19456, 21127, 3390

In [28]:
token_sci_short, count_sci_short, mean_sci_short, std_sci_short, max_sci_short, min_sci_short, second_min_sci_short = tokenize_and_count_length(sci_short)

print(token_sci_short[:100])
print(len(token_sci_short))
print(count_news_short)
print(f"Mean: {mean_sci_short}")
print(f"STD: {std_sci_short}")
print(f"Maximal text length in tokens: {max_sci_short}")
print(f"Minimal text length in tokens: {min_sci_short}")
print(f"Second smallest length: {second_min_sci_short} ")

[[42, 29886, 513, 1376, 404, 52513, 49825, 14559, 11018, 991, 360, 89, 29230, 30], [129, 254, 2629, 924, 49825, 14559, 30], [129, 254, 2629, 4255, 267, 68411, 5697, 1200, 300, 32557, 3273, 1167, 3696, 68411, 3841, 9038, 90165, 30], [42, 78, 3841, 67892, 461, 297, 6723, 834, 531, 85003, 6127, 969, 12453, 34967, 30], [129, 254, 2629, 924, 2840, 402, 7907, 45576, 299, 13453, 77, 3457, 453, 2431, 72, 2580, 6780, 71, 312, 587, 66, 29230, 66632, 326, 18631, 72, 30], [42, 29886, 49825, 14559, 8791, 84195, 84, 4415, 597, 4763, 84, 30], [42, 78, 5697, 1028, 25105, 11755, 4809, 268, 5697, 513, 33054, 62617, 15036, 45103, 7886, 268, 3841, 12703, 81, 12453, 19580, 42236, 425, 30], [129, 254, 2629, 4864, 13871, 81, 4317, 6127, 969, 12453, 28422, 5697, 30], [129, 254, 2629, 4864, 4255, 4150, 6127, 969, 12453, 28422, 5697, 49825, 8083, 829, 834, 716, 70, 43757, 316, 33902, 19456, 267, 316, 38169, 316, 30], [42, 29886, 513, 577, 10784, 53191, 52513, 33902, 19456, 5455, 33902, 478, 560, 30], [129, 254,

In [29]:
token_sci_medium, count_sci_medium, mean_sci_medium, std_sci_medium, max_sci_medium, min_sci_medium, second_min_sci_medium = tokenize_and_count_length(sci_medium)

print(token_sci_medium[:100])
print(len(token_sci_medium))
print(count_sci_medium)
print(f"Mean: {mean_sci_medium}")
print(f"STD: {std_sci_medium}")
print(f"Maximal text length in tokens: {max_sci_medium}")
print(f"Minimal text length in tokens: {min_sci_medium}")
print(f"Second smallest length: {second_min_sci_medium} ")

[[42, 29886, 1437, 89, 2259, 348, 1394, 1072, 4317, 8791, 84195, 68, 4415, 1376, 55790, 3841, 49825, 8083, 11018, 24788, 747, 13453, 275, 7141, 991, 360, 89, 29230, 602, 610, 3178, 5081, 33902, 478, 3074, 13106, 5176, 30], [42, 78, 3841, 924, 72757, 483, 49825, 8083, 577, 3846, 10176, 15036, 1437, 58197, 84, 650, 18, 1376, 55790, 540, 11018, 220, 16, 1034, 991, 360, 89, 32376, 30], [42, 29886, 6570, 47, 274, 1171, 267, 6723, 8791, 84195, 84, 4415, 27955, 14609, 84195, 32557, 29394, 64, 39294, 1037, 2322, 6181, 602, 32063, 16471, 9110, 274, 1171, 258, 84, 30], [42, 29886, 924, 97134, 285, 5676, 603, 385, 10176, 90762, 14666, 64, 8791, 950, 72, 4415, 1376, 55790, 3841, 49825, 8083, 30], [42, 29886, 49825, 14559, 27262, 84, 8791, 292, 9491, 4415, 3273, 62559, 84, 453, 2431, 72, 2580, 6780, 71, 312, 587, 66, 29230, 66632, 326, 18631, 72, 602, 15036, 11906, 998, 924, 11412, 12453, 7907, 577, 294, 3444, 11906, 57208, 336, 1377, 84, 11906, 23001, 84, 30], [42, 29886, 513, 597, 14833, 333, 393

In [30]:
token_sci_long, count_sci_long, mean_sci_long, std_sci_long, max_sci_long, min_sci_long, second_min_sci_long = tokenize_and_count_length(sci_long)

print(token_sci_short[:100])
print(len(token_sci_short))
print(count_news_short)
print(f"Mean: {mean_sci_short}")
print(f"STD: {std_sci_short}")
print(f"Maximal text length in tokens: {max_sci_short}")
print(f"Minimal text length in tokens: {min_sci_short}")
print(f"Second smallest length: {second_min_sci_short} ")

[[42, 29886, 513, 1376, 404, 52513, 49825, 14559, 11018, 991, 360, 89, 29230, 30], [129, 254, 2629, 924, 49825, 14559, 30], [129, 254, 2629, 4255, 267, 68411, 5697, 1200, 300, 32557, 3273, 1167, 3696, 68411, 3841, 9038, 90165, 30], [42, 78, 3841, 67892, 461, 297, 6723, 834, 531, 85003, 6127, 969, 12453, 34967, 30], [129, 254, 2629, 924, 2840, 402, 7907, 45576, 299, 13453, 77, 3457, 453, 2431, 72, 2580, 6780, 71, 312, 587, 66, 29230, 66632, 326, 18631, 72, 30], [42, 29886, 49825, 14559, 8791, 84195, 84, 4415, 597, 4763, 84, 30], [42, 78, 5697, 1028, 25105, 11755, 4809, 268, 5697, 513, 33054, 62617, 15036, 45103, 7886, 268, 3841, 12703, 81, 12453, 19580, 42236, 425, 30], [129, 254, 2629, 4864, 13871, 81, 4317, 6127, 969, 12453, 28422, 5697, 30], [129, 254, 2629, 4864, 4255, 4150, 6127, 969, 12453, 28422, 5697, 49825, 8083, 829, 834, 716, 70, 43757, 316, 33902, 19456, 267, 316, 38169, 316, 30], [42, 29886, 513, 577, 10784, 53191, 52513, 33902, 19456, 5455, 33902, 478, 560, 30], [129, 254,

### Literature

In [31]:
lit_path = "../datasets/datasets/literature.parquet"
lit_text, lit_short, lit_medium, lit_long = get_data(lit_path)

ŠTAMPANO U DRŽ. ŠTAMPARIJI KRALjEVINE SRBIJE Ala je lepa zemlja to Srpsko Vojvodstvo, - pravo da je Srpska zemlja, al i jeste gnezdo sokolova - baš je Bog krasnu zemlju stvorio za krasan narod! - Lepo je to Srpsko Vojvodstvo, u njemu je Bačka i Banat, prostrana ravnica, ili bolje reći golema bašča od svakog bilja, a osobito od šenice i druge rane, sa livadama, pašama i poljanama, koje oko ne može pregledati, po kojima se odranjuju silne ergele konja i čoporovi druge stoke, kojoj broja nema. U njemu je Srem sa svojim gorama i dolinama, brdima i planinama, sa svojim lepim voćnjacima i vinogradima, sa šumama koje su neprohodne. Plaho Dunavo, plava Tisa i druge ponosite reke jesu bogaština Vojvodstva, kakve nema svaka zemlja. U Vojvodstvu ima belih gradova, lepih i ubavih sela. U Vojvodstvu ima svake prirodne krasote.
425
Koje reke su bogaština Vojvodstva?
425
Kako se opisuje Srpsko Vojvodstvo i koje su njegove karakteristike?
425
Koje su prirodne lepote Srpskog Vojvodstva i kako se opisuj

In [32]:
token_lit_text, count_lit_text, mean_lit_text, std_lit_text, max_lit_text, min_lit_text, second_min_lit_text = tokenize_and_count_length(lit_text)

print(token_lit_text[:100])
print(len(token_lit_text))
print(count_lit_text)
print(f"Mean: {mean_lit_text}")
print(f"STD: {std_lit_text}")
print(f"Maximal text length in tokens: {max_lit_text}")
print(f"Minimal text length in tokens: {min_lit_text}")
print(f"Second smallest length: {second_min_lit_text} ")

[[129, 254, 51, 16407, 55994, 549, 14644, 129, 121, 13, 27006, 254, 51, 16407, 16412, 46164, 735, 52831, 73, 47110, 4069, 21550, 8768, 41429, 89762, 4864, 514, 6733, 1167, 336, 75, 5697, 311, 34688, 1725, 9509, 650, 21963, 92110, 267, 3415, 11, 482, 550, 28316, 3067, 4864, 34688, 1725, 4657, 1167, 336, 75, 5697, 11, 453, 602, 13599, 68, 342, 55506, 3055, 84826, 337, 12949, 482, 13081, 11906, 4864, 42648, 597, 13075, 9110, 1167, 336, 75, 8783, 357, 37215, 822, 15036, 597, 13075, 276, 44669, 347, 0, 482, 445, 752, 78, 4864, 311, 34688, 1725, 9509, 650, 21963, 92110, 267, 3415, 11, 577, 37185, 34163, 4864, 14659, 13453, 4657, 602, 23565, 266, 11, 463, 496, 3444, 436, 31697, 3074, 11, 60409, 21434, 3841, 312, 96881, 733, 273, 1764, 13081, 11906, 13453, 64, 11018, 13871, 587, 540, 20934, 5697, 11, 264, 72757, 6491, 11018, 37524, 268, 560, 602, 1377, 4838, 436, 2194, 11, 829, 15471, 329, 3105, 11, 7251, 11906, 3105, 602, 1499, 23685, 3105, 11, 97599, 5509, 78, 841, 4647, 51492, 20575, 839, 9

In [33]:
token_lit_short, count_lit_short, mean_lit_short, std_lit_short, max_lit_short, min_lit_short, second_min_lit_short = tokenize_and_count_length(lit_short)

print(token_lit_short[:100])
print(len(token_lit_short))
print(count_lit_short)
print(f"Mean: {mean_lit_short}")
print(f"STD: {std_lit_short}")
print(f"Maximal text length in tokens: {max_lit_short}")
print(f"Minimal text length in tokens: {min_lit_short}")
print(f"Second smallest length: {second_min_lit_short} ")

[[42, 78, 3841, 312, 441, 924, 293, 14361, 85578, 2259, 650, 21963, 92110, 267, 6723, 30], [42, 78, 4864, 21236, 822, 1776, 402, 84, 328, 10910, 7675, 30], [42, 78, 4864, 1200, 285, 3524, 24318, 385, 6723, 13453, 12407, 5509, 37737, 84, 577, 15199, 64, 13453, 22873, 436, 561, 1201, 84, 30], [42, 78, 4864, 17332, 52579, 49624, 577, 13528, 385, 6723, 13453, 9509, 73, 42430, 2850, 220, 10336, 23, 13, 10087, 483, 30], [57, 64, 11906, 998, 4864, 389, 32063, 291, 276, 30], [42, 337, 24551, 6747, 936, 4864, 2206, 10888, 12453, 29230, 656, 38423, 30], [42, 29886, 513, 297, 325, 7886, 52513, 864, 1764, 4502, 58084, 11906, 23001, 84, 30], [42, 78, 4864, 17332, 841, 838, 73, 3524, 30], [38, 451, 14659, 13453, 16023, 258, 297, 325, 7886, 64, 21434, 30], [42, 78, 513, 342, 839, 3524, 79481, 92602, 392, 31697, 3457, 30], [42, 78, 4864, 8882, 24012, 348, 3394, 577, 23975, 13453, 8318, 30], [38, 451, 924, 23634, 6181, 597, 8083, 30], [42, 78, 4864, 709, 9700, 50931, 4415, 1200, 300, 32557, 30], [38, 4

In [34]:
token_lit_medium, count_lit_medium, mean_lit_medium, std_lit_medium, max_lit_medium, min_lit_medium, second_min_lit_medium = tokenize_and_count_length(lit_medium)

print(token_lit_medium[:100])
print(len(token_lit_medium))
print(count_lit_medium)
print(f"Mean: {mean_lit_medium}")
print(f"STD: {std_lit_medium}")
print(f"Maximal text length in tokens: {max_lit_medium}")
print(f"Minimal text length in tokens: {min_lit_medium}")
print(f"Second smallest length: {second_min_lit_medium} ")

[[42, 29886, 513, 1200, 285, 34967, 34688, 1725, 9509, 650, 21963, 92110, 267, 3415, 602, 97599, 924, 37185, 797, 1009, 91855, 380, 3043, 30], [42, 78, 5697, 4864, 8725, 14361, 328, 265, 1026, 6780, 71, 13528, 36247, 17544, 577, 19499, 1725, 9509, 73, 220, 5436, 35973, 602, 597, 495, 6198, 30], [42, 29886, 6160, 513, 1200, 285, 6181, 12776, 299, 3315, 297, 15593, 7453, 73, 9115, 58067, 3320, 93952, 30], [42, 29886, 924, 293, 4008, 1536, 84, 13453, 34229, 326, 18631, 72, 60471, 924, 11163, 12574, 15036, 490, 41059, 316, 577, 13528, 385, 6723, 13453, 9509, 73, 42430, 2850, 220, 10336, 23, 13, 10087, 483, 30], [42, 29886, 4864, 22924, 8875, 822, 67107, 3841, 297, 325, 7886, 11305, 864, 1764, 259, 51640, 30], [42, 337, 24551, 6747, 17544, 4864, 2206, 10888, 12453, 29230, 656, 38423, 11018, 14659, 13453, 16023, 2259, 602, 37524, 2629, 3197, 9008, 3067, 81808, 72, 15036, 4247, 53860, 84, 30], [42, 29886, 3197, 404, 52513, 281, 16279, 96881, 1437, 36409, 60471, 4864, 20328, 79, 602, 4502, 580

In [35]:
token_lit_long, count_lit_long, mean_lit_long, std_lit_long, max_lit_long, min_lit_long, second_min_lit_long = tokenize_and_count_length(lit_long)

print(token_lit_long[:100])
print(len(token_lit_long))
print(count_lit_long)
print(f"Mean: {mean_lit_long}")
print(f"STD: {std_lit_long}")
print(f"Maximal text length in tokens: {max_lit_long}")
print(f"Minimal text length in tokens: {min_lit_long}")
print(f"Second smallest length: {second_min_lit_long} ")

[[42, 78, 3841, 924, 12776, 24409, 818, 514, 79, 1295, 34688, 1725, 74, 540, 650, 21963, 92110, 267, 6723, 602, 91617, 513, 1200, 285, 34967, 37185, 797, 12949, 3980, 92241, 3933, 4657, 24788, 77, 337, 1609, 537, 30], [42, 29886, 513, 1200, 285, 34967, 1167, 3458, 13453, 1662, 13528, 36247, 17544, 15036, 19499, 1725, 6780, 44669, 347, 602, 37524, 2629, 513, 951, 18536, 220, 16, 13, 296, 19580, 220, 10336, 23, 13, 10087, 483, 30], [42, 78, 3841, 41383, 582, 32376, 4864, 3320, 93952, 26944, 316, 1509, 78, 577, 67107, 73, 318, 18541, 76, 3105, 887, 84, 96881, 11018, 8909, 329, 2259, 656, 13528, 36247, 17544, 30], [42, 587, 3415, 4864, 293, 18536, 5509, 2739, 12453, 268, 3841, 577, 13528, 385, 6723, 13453, 9509, 73, 42430, 2850, 220, 10336, 23, 13, 10087, 483, 602, 91617, 924, 513, 326, 18631, 72, 1536, 4355, 13453, 4008, 30], [42, 29886, 4864, 312, 351, 869, 3524, 4415, 259, 30885, 602, 91617, 4864, 1167, 40658, 12453, 822, 7802, 11906, 348, 2259, 30], [42, 29886, 82531, 2206, 10888, 1245

In [36]:
# import matplotlib.pyplot as plt
# import numpy as np

# # Data for four datasets (Wikipedia, News, Science, Literature)
# datasets = ['Wikipedia', 'News', 'Science', 'Literature']
# query_types = ['Short', 'Medium', 'Long']

# # Min and max lengths for each dataset and query type
# min_lengths = {
#     'Wikipedia': [min_wiki_short, min_wiki_medium, min_wiki_long],
#     'News': [min_news_short, min_news_medium, min_news_long],
#     'Science': [min_sci_short, min_sci_medium, min_sci_long],
#     'Literature': [min_lit_short, min_lit_medium, min_lit_long]
# }

# max_lengths = {
#     'Wikipedia': [max_wiki_short, max_wiki_medium, max_wiki_long],
#     'News': [max_news_short, max_news_medium, max_news_long],
#     'Science': [max_sci_short, max_sci_medium, max_sci_long],
#     'Literature': [max_lit_short, max_lit_medium, max_lit_long]
# }

# # X-axis positions for each bar group
# x = np.arange(len(query_types))

# # Bar width
# width = 0.2

# # Create a figure and two subplots
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns

# # Colors for the datasets
# colors = ['lemonchiffon', 'lightskyblue', 'lightgreen', 'plum']

# # Plotting min lengths on the first subplot
# for i, dataset in enumerate(datasets):
#     ax1.bar(x + (i - 2)*width, min_lengths[dataset], width, label=dataset, color=colors[i])
# ax1.set_xlabel('Query Types')
# ax1.set_ylabel('Number of Tokens')
# ax1.set_title('Min Token Lengths')
# ax1.set_xticks(x)
# ax1.set_xticklabels(query_types)
# ax1.legend()

# # Plotting max lengths on the second subplot
# for i, dataset in enumerate(datasets):
#     ax2.bar(x + (i - 2)*width, max_lengths[dataset], width, label=dataset, color=colors[i])
# ax2.set_xlabel('Query Types')
# ax2.set_ylabel('Number of Tokens')
# ax2.set_title('Max Token Lengths')
# ax2.set_xticks(x)
# ax2.set_xticklabels(query_types)
# ax2.legend()

# # Adjust the layout and display the plot
# plt.tight_layout()
# plt.show()


In [37]:
min_lengths

NameError: name 'min_lengths' is not defined

In [None]:
max_lengths

{'Wikipedia': [41, 65, 122],
 'News': [45, 62, 86],
 'Science': [47, 85, 245],
 'Literature': [32, 42, 64]}

### MS translated

In [None]:
ms_path = "../datasets/datasets/marco_processed.parquet"
ms = pd.read_parquet(ms_path)
ms

Unnamed: 0,title,context,queries
0,,Hranimo naše veverice u dvorištu tokom jeseni ...,[]
1,,"Životinje ne podnose miris ljudske kose, pa po...",[]
2,,Rasporedite malo ljudske kose oko vaših bašti ...,[Da li ljudska kosa zaustavlja veverice?]
3,,1 Možete posuti krvavu brašnu oko vaše bašte t...,[]
4,,Gubitak dlake kod veverica može biti uzrokovan...,[]
...,...,...,...
31369,,Kuća nije dom bez bočice ikonične Dettol Antis...,[]
31370,,Pod brendom Dettol dostupan je niz dezinfekcio...,[]
31371,,Aktivni sastojak u Dettolu koji mu daje antise...,[]
31372,,Druga upotreba. Dettol se takođe često koristi...,[za šta se koristi Dettol sapun]


In [None]:
ms_text = ms['context'].values.tolist()

In [None]:
print(len(ms_text))
print(ms_text[0])

31374
Hranimo naše veverice u dvorištu tokom jeseni i zime i primetili smo da nekoliko njih ima ćelave delove. Jedna ima mesto bez dlake duž leđa i ispod oba krila. Takođe, druga ima nekoliko ćelavih delova na celom grudnom košu. Sve one jedu i čini se da imaju dobar apetit.


In [None]:
ms_query = []
for i in ms['queries']:
    if len(i) > 0:
        ms_query.extend(i)

In [None]:
print(ms_query)
print(len(ms_query))

['Da li ljudska kosa zaustavlja veverice?', 'Koji su benefiti fosilnih goriva?', 'šta je apotema', 'prosečna cena za prilagođeni baldahin', 'šta je hardver u računaru', 'definicija EDI logistike', 'definicija EDI logistike', 'zašto bi rekreativna upotreba marihuane trebala biti ilegalna', 'U kojoj klasi su paukovi?', 'gde se nalazi amurski leopard', 'koliko košta da sterilizujem svoju ženku kunića', 'kako izgleda Proteus mirabilis', 'da li je bugenvilija žbun', 'da li je oksiflufen bezbedan za pamuk', 'Kakve vrste bakterija se nalaze u ustima', 'prosečna plata glumca', 'koliko često bi muškarci trebalo da urade PSA test', 'šta su finansijska sredstva', 'Skrob i _____ su česti polisaharidni ugljeni hidrati koji se nalaze u biljkama.', 'Skrob i _____ su česti polisaharidni ugljeni hidrati koji se nalaze u biljkama.', 'da li je skladište zajednice Cypress poplavljeno', 'šta je varijansa troškova', 'odakle dolaze bakterije', 'Šta čine elektromagnetni talasi?', 'šta uzrokuje upaljenu gušter

In [None]:
token_ms_text, count_ms_text, mean_ms_text, std_ms_text, max_ms_text, min_ms_text, second_min_ms_text = tokenize_and_count_length(ms_text)

print(token_ms_text[:100])
print(len(token_ms_text))
print(count_ms_text)
print(f"Mean: {mean_ms_text}")
print(f"STD: {std_ms_text}")
print(f"Maximal text length in tokens: {max_ms_text}")
print(f"Minimal text length in tokens: {min_ms_text}")
print(f"Second smallest length: {second_min_ms_text} ")

drive-up
\
pravo 
 prava 
median lethal dose
Splen...
oni
\
\
ful
paene 
i 
insula 
\
[[39, 6713, 11620, 4415, 58067, 348, 2099, 560, 577, 33876, 13915, 11906, 25506, 9825, 316, 65846, 34229, 602, 1167, 547, 602, 9036, 295, 4008, 44673, 3067, 841, 52552, 24551, 37185, 7141, 93793, 220, 7886, 301, 525, 1624, 1009, 13, 66232, 3458, 93793, 296, 52525, 23857, 28613, 731, 3930, 12453, 514, 52096, 64, 602, 374, 40173, 1536, 64, 23975, 10746, 13, 350, 29886, 52096, 68, 11, 5623, 64, 93793, 841, 52552, 24551, 220, 7886, 301, 6321, 71, 1624, 12949, 4415, 19637, 316, 1099, 664, 17101, 15593, 11906, 84, 13, 328, 588, 832, 18806, 84, 602, 33902, 6729, 513, 3067, 737, 52513, 656, 2308, 1469, 295, 275, 13], [129, 121, 16471, 258, 3841, 841, 7661, 77, 974, 8822, 285, 326, 18631, 82, 441, 597, 974, 11, 7251, 1153, 575, 86057, 293, 2850, 73, 486, 11018, 11018, 325, 13453, 1994, 597, 974, 5509, 78, 11412, 58067, 13081, 11906, 668, 11, 60409, 22171, 5770, 24840, 11906, 268, 3841, 577, 1167, 336, 75, 8783

In [None]:
token_ms_query, count_ms_query, mean_ms_query, std_ms_query, max_ms_query, min_ms_query, second_min_ms_query = tokenize_and_count_length(ms_query)

print(token_ms_query[:100])
print(len(token_ms_query))
print(count_ms_query)
print(f"Mean: {mean_ms_query}")
print(f"STD: {std_ms_query}")
print(f"Maximal text length in tokens: {max_ms_query}")
print(f"Minimal text length in tokens: {min_ms_query}")
print(f"Second smallest length: {second_min_ms_query} ")

standard scores range
standard scores range
greg name meaning
bastian name meaning
weight pound designation
prime rib cost
standard franchise fee
[[31516, 908, 326, 18631, 82, 4657, 597, 12252, 15036, 592, 68411, 5697, 348, 2099, 560, 30], [42, 28000, 924, 8935, 72, 49490, 321, 87165, 46298, 10126, 30], [11906, 2629, 4864, 1469, 354, 9355], [782, 325, 13453, 3458, 79006, 15036, 550, 321, 6438, 52096, 34229, 48653, 1494, 258], [11906, 2629, 4864, 2653, 424, 577, 15473, 13453, 359, 31244], [755, 93435, 29230, 16421, 40, 1515, 380, 3043], [755, 93435, 29230, 16421, 40, 1515, 380, 3043], [4458, 11906, 998, 6160, 78381, 1244, 344, 3458, 709, 52262, 4749, 12985, 17156, 2194, 4353, 65, 6181, 2766, 72, 31905, 16876, 3458], [52, 15593, 7453, 73, 20839, 10426, 924, 84295, 74, 46188, 30], [70, 451, 513, 85633, 18291, 1097, 1759, 6780, 99093], [52552, 24551, 15593, 11906, 2629, 3067, 66740, 450, 9832, 336, 67107, 8783, 32063, 268, 12407, 597, 16080, 7886, 64], [74, 29886, 22924, 70, 839, 64, 15542

In [None]:
def sentences_starting_with_words(sentences, words):
    matching_sentences = []
    
    for sentence in sentences:
        if isinstance(words, str):
            if sentence.startswith(words):
                matching_sentences.append(sentence)
        else:
            if sentence.startswith(tuple(words)):
                matching_sentences.append(sentence)
    
    count = len(matching_sentences)
    return matching_sentences, count

In [None]:
words_srb = ["gde", "kako", "kada", "koliko", "ko", "kome", 'kojem', "kom", "zasto", "sta", "šta", "zašto", "zbog čega", "čime", "da li", "ako", "kuda", "čemu", "s kim", "kime", "s čime", "s čim", "je l", "je l'", "je li", "jeste li"]
ms_low = [i.lower() for i in ms_query]
matching_sentences, count = sentences_starting_with_words(ms_low, words_srb)
print("Matching sentences:", matching_sentences)
print("Count:", count)

Matching sentences: ['da li ljudska kosa zaustavlja veverice?', 'koji su benefiti fosilnih goriva?', 'šta je apotema', 'šta je hardver u računaru', 'zašto bi rekreativna upotreba marihuane trebala biti ilegalna', 'gde se nalazi amurski leopard', 'koliko košta da sterilizujem svoju ženku kunića', 'kako izgleda proteus mirabilis', 'da li je bugenvilija žbun', 'da li je oksiflufen bezbedan za pamuk', 'koliko često bi muškarci trebalo da urade psa test', 'šta su finansijska sredstva', 'da li je skladište zajednice cypress poplavljeno', 'šta je varijansa troškova', 'šta čine elektromagnetni talasi?', 'šta uzrokuje upaljenu gušteraču', 'šta znači reč štiglic', 'šta znači reč štiglic', 'šta čini bateriju', 'šta uzrokuje svrab kod infekcije kvascem', 'koja je razlika između maste i lepka za pločice?', 'koliko dugo treba da se kuvaju artičoke na pari', 'zašto je glukoza važna?', 'šta je postsinaptička gustina', 'koji prenose električne impulse ka ćelijskom telu', 'koje reforme je inicirao mihai

In [None]:
nq_path = "../datasets/datasets/naquestions_processed.parquet"
nq = pd.read_parquet(nq_path)
nq

Unnamed: 0,title,context,queries
0,,"Kroz rad Maks Planka, Alberta Einštajna, Luj d...",[ko je predložio da elektroni pokazuju ponašan...
1,,Senat Sjedinjenih Američkih Država se sastoji ...,[koliko senatora ima u Senatu SAD-a]
2,,"Depozicija je termodinamički proces, faza prel...",[promena faze iz gasa u čvrsto se naziva]
3,,Najduža reč u bilo kom od glavnih engleskih je...,[koja je najduža engleska reč u rečniku]
4,,Voda je raspoređena širom Zemlje. Većina vode ...,[koliki procenat Zemljine površine je voda]
...,...,...,...
3784,,Svi automobili moraju imati motor smješten nap...,[Koji motor se nalazi u Holden V8 super automo...
3785,,Međunarodna svemirska stanica viđena 23. maja ...,[kada je Međunarodna svemirska stanica lansira...
3786,,"Rasa mešanac, verovatno delom haski (ili delom...","[Kakav je pas bio Laika, svemirski pas?]"
3787,,`` Mean Old World '' je bluz pesma koju je sni...,[ko je pevao pesmu 'it's a mean old world']


In [None]:
nq_text = nq['context'].values.tolist()
print(nq_text[0])
print(len(nq_text))

Kroz rad Maks Planka, Alberta Einštajna, Luj de Broglia, Artura Komptona, Nilsa Bora i mnogih drugih, trenutna naučna teorija tvrdi da sve čestice takođe imaju prirodu talasa (i obrnuto). Ovaj fenomen je potvrđen ne samo za elementarne čestice, već i za složene čestice poput atoma pa čak i molekula. Za makroskopske čestice, zbog njihovih izuzetno kratkih talasnih dužina, osobine talasa obično ne mogu biti detektovane.
3789


In [None]:
nq_query = []
for i in nq['queries']:
    if len(i) > 0:
        nq_query.extend(i)

print(nq_query[0])
print(len(nq_query))

ko je predložio da elektroni pokazuju ponašanje talasa i čestica
3789


In [None]:
token_nq_query, count_nq_query, mean_nq_query, std_nq_query, max_nq_query, min_nq_query, second_min_nq_query = tokenize_and_count_length(nq_query)

print(token_nq_query[:100])
print(len(token_nq_query))
print(count_nq_query)
print(f"Mean: {mean_nq_query}")
print(f"STD: {std_nq_query}")
print(f"Maximal text length in tokens: {max_nq_query}")
print(f"Minimal text length in tokens: {min_nq_query}")
print(f"Second smallest length: {second_min_nq_query} ")

[[9509, 4864, 4255, 385, 12453, 822, 3067, 63888, 376, 21446, 45295, 40229, 8783, 281, 6863, 11906, 86057, 8374, 15790, 602, 33902, 478, 3074], [52552, 24551, 32797, 64, 93793, 577, 5476, 36409, 328, 1846, 7561], [25475, 7304, 282, 10033, 22924, 6962, 64, 577, 33902, 19456, 34152, 513, 46902, 10126], [9509, 5697, 4864, 30274, 1072, 12453, 64, 2995, 645, 4657, 312, 13453, 577, 312, 13453, 22212, 84], [52552, 7723, 13988, 268, 266, 1901, 336, 53835, 483, 56314, 81, 11906, 483, 4864, 348, 14320], [9509, 5697, 4864, 36222, 292, 29230, 37143, 348, 1171, 74952, 577, 348, 31559], [67408, 587, 273, 3419, 84195, 68, 1043, 64, 11018, 342, 55506, 3315, 11051, 3074, 30], [67408, 587, 273, 21968, 10033, 73404, 72, 577, 308, 3178, 1576, 2201, 73, 42723, 35973, 30], [9509, 4864, 1069, 85, 3524, 1592, 84, 18541, 15479, 15036, 1446, 64274, 2057, 924, 409, 2111, 288, 7870, 30], [52552, 24551, 4248, 450, 14320, 93793, 577, 274, 1171, 77, 7453, 73, 37524, 74, 14559, 294, 9902], [42, 2649, 4864, 550, 6723,

In [None]:
token_nq_text, count_nq_text, mean_nq_text, std_nq_text, max_nq_text, min_nq_text, second_min_nq_text = tokenize_and_count_length(nq_text)

print(token_nq_text[:100])
print(len(token_nq_text))
print(count_nq_text)
print(f"Mean: {mean_nq_text}")
print(f"STD: {std_nq_text}")
print(f"Maximal text length in tokens: {max_nq_text}")
print(f"Minimal text length in tokens: {min_nq_text}")
print(f"Second smallest length: {second_min_nq_text} ")

127 strana
[[42, 65089, 9038, 386, 10011, 1856, 27769, 11, 33654, 18560, 85578, 1662, 3458, 11, 445, 9832, 409, 6031, 6200, 689, 11, 5277, 5808, 27790, 418, 6863, 11, 452, 8839, 64, 426, 6347, 602, 29038, 540, 7141, 5623, 7141, 11, 74700, 332, 3458, 93562, 13453, 3458, 1028, 269, 29230, 11333, 6634, 72, 3067, 81129, 33902, 478, 560, 98133, 52096, 68, 737, 52513, 12776, 4653, 8374, 15790, 320, 72, 39294, 77, 1564, 570, 507, 85, 1662, 44585, 6431, 4864, 3419, 19456, 52096, 268, 841, 83478, 15036, 2449, 277, 818, 33902, 478, 560, 11, 5320, 7886, 602, 15036, 18067, 12453, 1994, 33902, 478, 560, 2477, 332, 520, 7942, 7251, 33902, 587, 602, 35751, 74, 5724, 13, 65808, 52016, 3714, 74, 3806, 441, 33902, 478, 560, 11, 75455, 540, 37185, 7141, 869, 7141, 22924, 5308, 295, 2201, 92114, 6780, 71, 8374, 66636, 7141, 3930, 12453, 2259, 11, 72757, 483, 8374, 15790, 1536, 84195, 2201, 841, 27262, 84, 2766, 72, 3474, 17588, 869, 2194, 13], [20190, 266, 328, 73, 31501, 24041, 7141, 50873, 84195, 6780, 

In [None]:
nq_low = [i.lower() for i in nq_query]
matching_sentences, count = sentences_starting_with_words(nq_low, words_srb)
print("Matching sentences:", matching_sentences)
print("Count:", count)

Matching sentences: ['ko je predložio da elektroni pokazuju ponašanje talasa i čestica', 'koliko senatora ima u senatu sad-a', 'koja je najduža engleska reč u rečniku', 'koliki procenat zemljine površine je voda', 'koja je definicija ph vrednosti u vodi', 'ko je pevao temu pesmu za seriju to su devedesete?', 'koliko epizoda ima u srednjoj školi dxd', 'kada je prva knjiga tamnog tornja objavljena?', 'kada je prvi put korišćena nula u matematičkim operacijama?', 'koji je bio jedan od razloga koje je južna karolina navela za svoju odluku da se odvoji od unije', "ko je originalni pevač pesme you're going to love me", 'ko je postavio temelje škole possibilizma', 'koliko je veliki palace of auburn hills?', 'koliko košta pasoš na filipinima', 'koliko sati rada se smatra punim radnim vremenom', 'ko je pevao pesmu let me tell you about the birds and the bees', 'koliko originalnih kopija starog zaveta postoji', 'ko je nosio američku zastavu na zimskim olimpijskim igrama 2014', 'ko ima više pobed

In [None]:
len(nq_query)

3789

In [49]:
ms_text

['Hranimo naše veverice u dvorištu tokom jeseni i zime i primetili smo da nekoliko njih ima ćelave delove. Jedna ima mesto bez dlake duž leđa i ispod oba krila. Takođe, druga ima nekoliko ćelavih delova na celom grudnom košu. Sve one jedu i čini se da imaju dobar apetit.',
 'Životinje ne podnose miris ljudske kose, pa posipanje barijere od odsečene kose oko vaše bašte, ili lagano unošenje u zemlju prilikom sadnje lukovica, očigledno ima neku vrednost. Čitava stvar me nekako natera da se nasmejem. Nikada mi nije palo na pamet da smo mi ti koji smrde.',
 'Rasporedite malo ljudske kose oko vaših bašti sa povrćem i cvećem. To će oterati veverice jer su ljudi predator veverica. Bolje je ako kosa nije oprana tako da će veverice lako uhvatiti ljudski miris.',
 '1 Možete posuti krvavu brašnu oko vaše bašte takođe. 2 Ne hvatajte i ne premeštajte veverice. 3 Ovo je gubitna bitka jer je populacija veverica izuzetno visoka. 4 Takođe, ako je životinja ženka, postoji velika verovatnoća da ćete je od

In [50]:
train = pd.read_parquet("../datasets/datasets/TRAIN11k_fixed.parquet")

In [51]:
train

Unnamed: 0,context,short_query,medium_query,long_query,keywords,scores
0,UTF-8 UTF-8 varijanta je najzgodnija za kodira...,Šta omogućava UTF-8 varijanta?,Kako se prenosi tekst sa računara na računar k...,Kako je podrška za engleski alfabet evoluirala...,"[UTF-8, varijanta, kodiranje, latinični, tekst...","{'long_query': 5, 'medium_query': 3, 'short_qu..."
1,"""-{ASCII}-"" ili ""-{US-ASCII}-"" standard 1968 ...",Šta je ASCII standard?,Kako su kodne strane podržavale više jezika na...,Kako je prošireni ASCII standard 1968. godine ...,"[ASCII, US-ASCII, standard, karakteri, prošire...","{'long_query': 3, 'medium_query': 5, 'short_qu..."
2,"Tako postoje ""-{Latin1}-"" (""-{ISO-8859-1}-"") ...",Koje su kodne strane za ćirilicu i latinična p...,Koje su osnovne kodne strane za latinična i ći...,Koje su specifične kodne strane poput Windows-...,"[Windows-1251, Windows-1250, KOI8-R, ISO-8859-...","{'long_query': 3, 'medium_query': 5, 'short_qu..."
3,Njen naziv je UCS-2 zato što koristi dva oktet...,Šta je UCS-2?,Kako se rešava problem alokacije prostora za U...,"Koje su prednosti i mane korišćenja UCS-2, UTF...","[Unicode, UTF-16, UTF-8, kodna stranica, trans...","{'long_query': 3, 'medium_query': 4, 'short_qu..."
4,Ova transformaciona šema je prevashodno zgodna...,Koje standarde podržava Mail Transfer Agent?,Koje su metode kodiranja korišćene za prenos v...,Koje su specifičnosti transformacione šeme u v...,"[UTF-8, Mail Transfer Agent, MIME standardi, B...","{'long_query': 3, 'medium_query': 5, 'short_qu..."
...,...,...,...,...,...,...
11709,"Čekmedžijić je osobito navaljivao, pa mora i č...",Ko je bio zadovoljan društvom?,Ko su bili prisutni prilikom odlaska Čekmedžij...,Ko je ležao na divanu nakon što su se Čekmedži...,"[Čekmedžijić, čika-Gavra, Ljuba, frajla, slušk...","{'long_query': 3, 'medium_query': 5, 'short_qu..."
11710,"— Ah, unferšemt! Ja za krajzlera da pođem! Kad...",Ko je imao kuraž da se razgovara sa mnom?,Ko su baroni koji su me gledali sa lornjetom d...,Kako su baroni opisali krajzlera kada sam bila...,"[Beč, krajzler, madam, baroni, lornjet, fortep...","{'long_query': 2, 'medium_query': 3, 'short_qu..."
11711,"Kad bi’ imao tako učenu ženu, ja bi’ mislio da...",Ko traži bogat miraz od devojke?,Zašto siromašne devojke idu u inštitut pre udaje?,Kako siromašne devojke postaju baronke nakon š...,"[baron, miraz, frajla, udaja, siromašne devojke]","{'long_query': 3, 'medium_query': 5, 'short_qu..."
11712,Meni se vrlo dopada što pišete da ćete vašu re...,Ko je pisao pismo frajla-Julki?,Ko je pokazao Čekmedžijiću pisma od frajla-Julke?,Kako je Čekmedžijić reagovao kada je video pis...,"[pismo, frajla-Julka, Ružičić, Čekmedžijić, do...","{'long_query': 3, 'medium_query': 4, 'short_qu..."
