In [1]:
import pandas as pd
import numpy as np
from srbai.Alati.Transliterator import transliterate_cir2lat
from huggingface_hub import hf_hub_download
import json
import tiktoken
import sys
from pprint import pprint
import itertools

# Add the parent directory to the module search path.
# NOTE(review): this runs after every import in this cell, so it cannot help any of
# them resolve; presumably it is meant for project-local imports in later cells — confirm.
sys.path.append("..")

In [2]:
def get_data_science():
    """Download the NARDUS metadata file and return the usable thesis records.

    ``nardus_meta.jsonl`` is fetched from the ``jerteh/NARDUS-meta`` dataset
    repository on the Hugging Face Hub and parsed as one JSON object per line.
    The Serbian title, abstracts and keywords are passed through
    ``transliterate_cir2lat``.

    Returns:
        list[dict]: one dict per kept record with the keys ``id``, ``title_sr``,
        ``title_en``, ``abstract_sr``, ``abstract_en``, ``keywords_sr`` and
        ``keywords_en``. A record is kept only when it has exactly one Serbian
        abstract and that abstract does not end with ``"..."``.
    """
    # Download the dataset from Hugging Face Hub
    filepath = hf_hub_download(
        repo_id="jerteh/NARDUS-meta",
        filename='nardus_meta.jsonl',
        repo_type='dataset'
    )

    records = []

    # Decode explicitly as UTF-8: without it open() falls back to the platform's
    # default encoding, which can fail on (or garble) the non-ASCII Serbian text.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank or trailing empty line is not a JSON document; skip it
                # instead of letting json.loads raise.
                continue

            entry = json.loads(line)

            # `or []` guards against an explicit null value for the key, which
            # would otherwise break the iteration below.
            abstracts_sr = [
                transliterate_cir2lat(text)
                for text in (entry.get('dc.description.abstract_sr') or [])
            ]

            # Keep only records with exactly one Serbian abstract that does not
            # end in "..." (presumably a marker of a truncated abstract).
            if len(abstracts_sr) != 1 or abstracts_sr[0].endswith("..."):
                continue

            title_sr = entry.get('title_sr', '')
            if title_sr:
                title_sr = transliterate_cir2lat(title_sr)

            records.append({
                'id': entry.get('file_name', ''),
                'title_sr': title_sr,
                'title_en': entry.get('title_en', ''),
                'abstract_sr': abstracts_sr,
                'abstract_en': entry.get('dc.description.abstract_en', []),
                'keywords_sr': [
                    transliterate_cir2lat(keyword)
                    for keyword in (entry.get('dc.subject_sr') or [])
                ],
                'keywords_en': entry.get('dc.subject_en', [])
            })

    return records


In [14]:
# Build the filtered record list, then show the first record and how many were kept.
# Calling get_data_science() fetches the dataset file via hf_hub_download.
science = get_data_science()
pprint(science[0])
print(len(science))

{'abstract_en': ['The aim of the paper is to present an architectonic study of '
                 'Roman Catholic churches built in Vojvodina in years 1699. to '
                 '1939. The revitalization of the southern Hungarian region '
                 'followed the end of the domination of the Ottoman Empire in '
                 'the lands of the Habsburg Monarchy (1745-1918, Austrian '
                 'Empire, 1804-1867. & Austro-Hungarian Empire, 1867-1918.) in '
                 'the times after the Austro-Turkish wars and the signing of '
                 'the Karlovci (1699.) and Pozarevac (1718.) Peace Treats. The '
                 'period of political and economic stability, which lasted '
                 'until the outbreak of the First World War, was favorable '
                 'time for a comprehensive expansion of constructing '
                 'activities. The continuity of the construction of religious '
                 'buildings of the state-dominant Roman Ca

In [44]:
# NOTE(review): no cell above this one creates `df`; the recorded output has two
# columns, so this is presumably a frame built in a cell that is no longer in the
# notebook — confirm, since this fails on a fresh kernel.
df.shape

(6822, 2)

In [39]:
# Count the missing values in each column of `df`.
df.isna().sum()

abstract english    0
abstract serbian    0
dtype: int64

In [37]:
# Drop every row that contains a missing value, mutating `df` in place.
# NOTE(review): because `df` is changed in place, the shapes and null counts shown by
# the neighbouring cells depend on the order in which the cells were executed.
df.dropna(inplace=True)

In [38]:
# (rows, columns) of `df`; per the execution counts this was run right after the
# dropna cell.
df.shape

(6011, 2)

In [49]:
# These options are set globally, so they also affect every frame displayed by later cells.
pd.set_option('display.max_colwidth', None)  # None means no limit on column width
pd.set_option('display.width', 300)  
# Show five random rows; no random_state is passed, so the rows differ on each run.
# NOTE(review): in the recorded output the English and Serbian abstracts of a row
# describe different theses, which suggests the two columns of this `df` are
# misaligned — verify how the frame was built.
df.sample(5)

Unnamed: 0,abstract english,abstract serbian
1183,"In this dissertation, brain samples of 10 adult red foxes (Vulpes vulpes), of both sexes, naturally infected with rabies, were examined. Direct immunofluorescence and RT-PCR method were used to prove the presence of rabies virus in the examined material. Histopathologic and immunohistochemical analysis were performed on samples of the cerebral cortex, the hippocampus, the cerebellum and the medulla, which were fixed in buffered 10% formalin, processed in an automatic tissue processor, and embedded into the paraffin. Four to five μm thick paraffin sections were stained with hematoxylin-eosin. Investigation of the presence of viral antigen and pro-inflammatory cytokines was performed with streptavidin-biotin (LSAB) method, with the use of an antibody to a viral antigen, interleukin 1 (IL1-), and tumor necrosis factor (TNF-). In assessing the distribution of viral antigen by regions of the brain semiquantitative analysis was used. The number of infected neurons was determined by morphometric analysis, and these data were analyzed using descriptive statistical parameters, ANOVA and Tukey test. The presence of rabies virus was confirmed in all of 10 examined brains by direct immunofluorescence and RT-PCR. Histopathological examination revealed lesions characteristic for acute nonsuppurative encephalitis, primarily multiplication of glial cells and perivascular infiltrates of lymphocytes and macrophages. In addition, dystrophic changes in neurons, satellitosis, neuronophagia, endotheliosis and leptomeningitis were observed. Negri bodies were present in pyramidal neurons of the cerebral cortex and hippocampus in 30% of cases. The expression of viral antigen has been demonstrated in neuronal perikaryon, axons and dendrites. Its distribution was very broad, and the most intense expression was observed in the hippocampus and medulla oblongata. 
Expression of IL1- and TNF- was found in areas with inflammatory lesions, in microglial cells, macrophages and in the lymphocytes of perivascular infiltrates and leptomeninges...","U radu je ispitivana fotoliza i fotokatalizovano uklanjanje niskih koncentracija karbamatnih pesticida-insekticida metomila i karbofurana i herbicida desmedifama iz različitih vrsta voda (dejonizovana, destilovana, morska) i iz organskih rastvarača, primenom UV, vidljive ili prirodne sunčeve svetlosti, u prisustvu katalizatora TiO2 i ZnO, takođe i primenom katalizatora Fe-ZSM-5 zeolita i AlFe-pilarnog montmorijonita (foto-Fenton proces). Ispitivan je uticaj različitih parametara kao što su vrsta svetlosti, udaljenost lampe od površine rastvora, vrsta vode, reakciona temperatura, pH vrednost, vrsta organskog rastvarača, početna koncentracija pesticida, početna koncentracija katalizatora, početna koncentracija NaCl na brzinu razgradnje navedenih pesticidnih jedinjenja. Takođe je ispitivan uticaj različitih katalizatora na brzinu fotodegradacije metomila, karbofurana i desmedifama. Stepen fotodekompozicije metomila, karbofurana i desmedifama je određivan primenom UV spektrofotometrije i HPLC analizom, a njihova mineralizacija je ispitivana pomoću jonske hromatografije (IC) i analizom ukupnog sadržaja organskog ugljenika (TOC). Fotohemijsko uklanjanje metomila, karbofurana i desmedifama je prirodan i primenljiv model za prečišćavanje voda."
2175,"Decision-making and management in mining is a demanding and complex task, dealing with risks. Conventional and experiential decision-making approaches, in practice, often show a weakness reflected in insufficient reliability and precision. The subject of the doctoral dissertation research, entitled ""Improving decision-making efficiency in mining using linear optimization models"" is by observing the linear programming models to consider the formation of location models that would be based on the analysis and adaptation of the existing as well as the newly introduced assumptions for cases such as limestone exploitation and consumption. This doctoral dissertation shows the linear model of location exploitation optimization and consumption of limestone in Macedonia, with 29 production entities - open-pit mines, and two options of consumption entities, with 15 and 16 lime consumers. By changing one of the starting hypothesis, the number of consumers, the research shows that mathematical model approach with adequate sensitivity to changes of relative parameters is necessary for a complete and reliable overview of system behavior. The model ensures a more complete and more reliable overview of impact that changes of relevant parameters have on the optimal goal, provides possibilities to experiment by changing the basic hypothesis and the directing effect on the outcome. These hypothesis are illustrated and confirmed by the results in this doctoral dissertation.","Doktorska disertacija ima za cilj da teoretski i empirijski, kontrolicano, sistematski i kritički ispita hipotezu formiranja modela optimizacije postupaka tehničke dijagnostike, pri izboru najznačajnijih parametara u određivanju sigurnosti funkcionisanja sastavnih komponenti sklopova hidroelektrane. Određenim vrednostima analizirane pouzdanosti određen je univerzalni model optimizacije na osnovu model blok dijagrama i mernih mesta. 
NJihovim međusobnim uticajem i povezanošću formira se: korelacija parametara optimizacije mehaničkih oscilacija i temperature, korelacija parametara optimizacije mehaničkih oscilacija i pohabanosti ležajeva i korelacija parametara optimizacije radnih temperatura i pohabanosti leđajeva. Ovakav model je univerzalnog tipa jer se može primeniti i na složene sisteme, bez obzira na dimenzije komponenti sklopova."
87,"Oxidative stress and chronic inflammation are considered to be the main causes of diabetic complications, one of which is liver damage. An important mediator of these processes may be the endogenous HMGB1 protein, when released into the extracellular environment from the necrotic, damaged or activated cells. As the HMGB1 role in diabetes was insufficiently studied, in this doctoral dissertation the contribution of HMGB1 to liver damage of streptozotocin-induced diabetic rats was investigated. It has been shown that the level of liver damage in diabetes correlates with the presence of extracellular HMGB1. In diabetic liver, this protein is structurally modified by acetylation, phosphorylation, and O-GlcNAc glycosylation, which correlates with its translocation from the nucleus to the cytoplasm and an increase in its presence in the liver and serum. Reduction of the level of extracellular HMGB1 by melatonin or ethyl pyruvate treatment of diabetic rats, shows that HMGB1 contributes to diabetic liver damage by maintaining a chronic inflammation, by lowering antioxidant defense and by reducing regeneration. Extracellular HMGB1 activates MAPK/NF-κB p65 and JAK1/STAT3 signaling pathways through interactions with the TLR4 receptor, thus contributing increased production of proinflammatory cytokines TNF-α and IL-6 and the acute-phase protein, haptoglobin. By stimulating the NF-κB p65 inflammatory pathway, HMGB1 acts negatively on the cytoprotective response of the diabetic liver, by disabling Nrf2 protein activity, which is responsible for reduction of inflammation and antioxidant enzymes production. Activated HMGB1/TLR4 axis reduces regenerative potential of the liver by increasing the presence of negative cell cycle regulators - proteins p53 and p21, and also by decreasing the level of cyclin D1. 
The obtained results indicate the complexity of HMGB1 protein action in diabetes and underlines the importance of preventing the release of HMGB1 or blockage of HMGB1/TLR4 axis in order to delay the occurrence of liver damage.","Analiziran je diverzitet i sezonska dinamika mikromiceta na zidnim slikama i u vazduhu istraživane crkve. Identifikovana je mikobiota od 46 taksona mikromiceta, sa dominacijom Aspergillus, Penicillium i Cladosporium vrsta. Najveći diverzitet je zabeležen tokom leta, dok je najveća brojnost propagula po jedinici površine (“fungalni otisak”) konstatovana u zimu. Aeromikobiota je okarakterisana sa 33 taksona gljiva. Kontaminacija vazduha propagulama gljiva tokom godine višestruko je prevazilazila standarde za zatvorene prostore. Mikroskopska analiza biofilma pokazala je da su reproduktivne strukture Cladosporium sp. i Chaetomium sp. u kontaktu bojenog sloja i maltera glavni biotski faktor deterioracije. Metodom ATP bioluminiscenije, 75% površina zidnih slika je okarakterisano kao “Zona opasnosti”. Monitoringom indukovane i spontane kolonizacije na modelu zidne slike zaključeno je da su lihenizovane i mikrokolonijalne gljive uzrok fenomena “biopitting”. Testirani izolati su demonstrirali veliki potencijal deterioracije zidnih slika u eksperimentima in vitro. BAC i novosintetisani BAC/FNP nanokompozit pokazali su dobru antifungalnu aktivnost, kao i uticaj na povećanje produkcije aflatoksina B1 i smanjenje produkcije ohratoksina A. Etarsko ulje tamjana imalo je jači antifungalni efekat u odnosu na etarsko ulje smirne, ali slabiji u poređenju sa smešom ulja. Dim tamjana poseduje inhibitorno dejstvo na germinaciju konidija, mehaničkim i hemijskim dejstvom voštanog sloja deponovanog iz dima, in vitro. Tretman vazduha crkve dimom tamjana redukovao je nivo kontaminacije vazduha fungalnim propagulama za približno 80%. 
Preko predloženog metodološkog protokola, data je mogućnost implementacije rezultata ovog istraživanja u praksu konzervacije i restauracije kulturnih dobara."
1544,"The risky choice framing refers to emphasizing of either positive or negative aspects of outcomes of sure and risky option in decision-making tasks. The framing effect is observed when different descriptions of the same outcome lead to the preference reversal. The aim of the conducted study was to map the situations in which framing will change the choice between sure and risky option, or more specifically, to define task conditions under which frame influences (and under which it does not influence) decision making. Persistence and size of a risky choice framing effect were tested in six experimental studies by varying the parameters of deep and surface structure of decision tasks, in three domains of decision making (lives, money and health). Results show that framing effects are limited by the specific parameters of both deep and surface structure of decision tasks. Incomplete deep structure of task, as well as some aspects of surface structure, are underpinning factors of framing effect. On the other hand, with fully described problems with no ambiguity regarding the outcomes (tasks which had complete deep structure), no framing effects emerged. In conclusion, a deep structure of the task defines persistence of framing effect, while surface structure influences the size of the framing effect. Besides, framing shows different patterns of effects on decisions about health, money and human lives. Alternative explanations of the findings and implications are discussed in light of both descriptive (prospect theory and fuzzy trace theory) and normative (expected utility theory) theories of decision-making.","Osnovna karakteristika novog društveno-kulturnog konteksta jeste to što su digitalne tehnologije postale sastavni deo svih aspekata savremenog života – učenja, komunikacije, obavljanja poslova i slobodnog vremena. 
U takvim okolnostima ukazuje se na nužnost da se digitalne tehnologije pravilno upotrebljavaju, odnosno da se razvijaju kompetencije koje odgovaraju zahtevima digitalnih, umreženih i na znanjima zasnovanih društava; čime digitalna pismenost dobija status „životne veštine“, koja uz čitalačku i matematičku pismenost, postaje „uslov, ali i pravo“ za sve građane (OECD, 2001). Pomenute okolnosti, u kontekstu obrazovanja, otvaraju niz pitanja i dilema za obrazovnu politiku i nauku, istraživače i praktičare, i vode do novih uvida, saznanja, redefinisanja postojećih i kreiranja novih koncepata i fenomena. U skladu sa tim, u ovom radu, razmatrano je koje su to nove veštine i kompetencije neophodne za potpunu participaciju u digitalnom društvu; šta je uslovilo i na koji način su redefinisane koncepcije pismenosti; i gde i na koji način treba razvijati nove veštine i kompetencije, odnosno digitalnu pismenost. Iz odgovora na pomenuta pitanja proistekli su predmet i cilj ovog istraživanja, kojim se teži ka sagledavanju uloge škole i nastavnika u procesu razvijanja digitalne pismenosti kod učenika, odnosno ispitivanju karakteristika prakse razvijanja digitalne pismenosti u kontekstu postojećih školskih uslova, kao i mogućnosti njenog unapređenja. Istraživanjem je obuhvaćeno 12 osnovnih škola iz gradskih i prigradskih opština na teritoriji Beograda, a uzorak čini 157 nastavnika predmetne nastave i 396 učenika osmog razreda. Korišćena je deskriptivno-analitička metoda sa kombinacijom kvantitativnih i kvalitativnih tehnika: anketiranje, skaliranje i fokus grupno intervjuisanje. Rezultati su pokazali da učenici i nastavnici na različite načine razumeju koncept digitalne pismenosti i da se na osnovu dobijenih podataka ne može izvesti zaključak da je među glavnim akterima obrazovnog procesa prisutan digitalni jaz. Prepoznat je značaj formalnog obrazovanja za razvijanje digitalne pismenosti koja se shvata kao međupredmetna kompetencija. 
Rezultati su pokazali da praksa razvijanja digitalne pismenosti kao međupredmetne kompetencije u osnovnoj školi nije u dovoljnoj meri uspostavljena, iako postoji praksa korišćenja digitalnih tehnologija u procesima nastave i za potrebe učenja. Utvrđeno je da se praksa razvijanja digitalne pismenosti može predvideti na osnovu nastavničkih veština za upotrebu digitalnih tehnologija u nastavi, broja pohađanih obuka iz domena digitalnih tehnologija, kao i na osnovu školske klime, odnosno kako se procenjuje podrška škole i nastavnika za upotrebu digitalnih tehnologija u nastavi"
4251,"The subject of this research are the famous marks and their broaden protection, with particular focus on criteria for their identification. The first aim of dissertation is to define a degree of distinctiveness which brings the status of famous mark. The second aim of this research is to define concrete factors for establishing the fame of a mark, and to get closer to the precise, accepted method for identification of famous marks in market. The doctoral dissertation contains two parts, beside the Introduction and the Conclusion. In the first part, the subject of analyses was legal regime of broaden protection of famous marks. This part is divided into three sections. The Section I explores the term of a famous mark, with reference to the legislation in Republic of Serbia and European Union, but also to the legislation regulating this matter in past. After that, the three central terms defining the famous marks were analysed – the reputation, distinctiveness and connection between a mark and a good which is a subject of designation. After that, the functions of famous marks were examined and also the influence that the broaden protection has to the holders and other market players. This led to establishing the term of famous marks in its normative and functional meaning. In Section II, the term of reputation was analysed, with special focus on different interpretations of its meaning: the consequences of too broad and too narrow interpretation of this term. Afterwards, the different models of famous trademark infringement were examined; having in mind the specific consequences which are requirements for broaden protection. The associative link in the minds of relevant public participants when seeing the mark was examined in details, in order to establish whether the quality of the same could be the central method for famous marks identification. 
At the end of this part, the decisions from court practice were analysed, in order to establish if the institute is being interpreted in the unique manner by the relevant authorities. The Part III deals with the reasons of broaden protection of famous marks, with the aim to determine the casual link between the reasons and the degree of fame which should enable the broaden protection. A special focus was made to the economical justification of broaden protection, and also the other theories which differently explain the ratio of this institute were explored.","Od 1990. godine, čitav region Istočne i Jugoistočne Evrope počeo je sa transformacijom svojih ekonomija iz centralno planskog sistema u tržišni, što se označava pojmom tranzicija. Postavka istraživačke studije je sagledavanje stanja sektora poljoprivrede Republike Srbije i Bosne i Hercegovine, koji su prošli kroz proces ekonomske tranzicije. Ideja tranzicije privrede, odnosno poljoprivrede je rast proizvodnje i BDP, povećanje efikasnosti i ostvarivanje boljih proizvodno-ekonomskih performansi. Cilj istraživanja je utvrditi da li je i u kojoj meri tranzicija poljoprivrede dovela do njenog razvoja, koji su propusti napravljeni i kakve mere agrarne politike kreirati u funkciji daljeg razvoja poljoprivrede i ruralnih područja. U izvođenju vrednosnih sudova i kritičkih zapažanja celokupnog procesa tranzicije agrosektora korišćene su određene naučne metode poput naučnog studiranja, indikatori, grafički metod, statistički metod, deskriptivna analiza i sinteza, komparatvini metod i drugi naučni metodi. Strategija poljoprivredne tranzicije u bivšim socijalističkim zemljama imala je za cilj poboljšanje efikasnosti i produktivnosti poljoprivrede zamenom institucionalnih i organizacionih karakteristika komandne ekonomije sa atributima pozajmljenim iz prakse tržišne ekonomije. Transformacija od kolektivne do efikasnije individualizovane poljoprivrede (privatizacija), koja će ostvarivati veći nivo prihoda je krajnji cilj. 
Tranziciju poljoprivrede prati nepovoljna vlasnička struktura i niska produktivnost, neefikasnost agrarne politike, spor razvoj institucija podrške, neadekvatan zakonodavni okvir i neuspešna privatizacija. Proces privatizacije obeležen je sa mnogo kontroverzi i zloupotreba, pa je i to imalo negativan odraz na ukupne efekte. Pored nabrojanog, efekti tranzicije poljoprivrede se ogledaju i u smanjenom obimu investicija, padu stočarske proizvodnje, spoljnotrgovinska razmena uglavnom sirovina i proizvoda niskog stepena finalizacije, depopulaciji sela, nepovoljnoj agrarnoj strukturi, malim pomacima na planu podiznja konkurentnosti i produktivnosti i nekonzistentnoj agrarnoj politici. Ni poljoprivredno zadrugarstvo nije revitalizovano, a kamoli ostvarilo neki značajan uspeh, tim pre, jer nije vraćeno poljuljano poverenje u zadružni oblik organizovanja, niti su zadružni principi usvojeni. Prelazak na tržišnu ekonomiju ostavio je Srbiju i Bosnu i Hercegovinu daleko iza najuspešnijih zemalja Centralne i Istočne Evrope. BiH zaostaje po mnogim pitanjima vezanim za strukturne reforme, koje su pratile ekonomsku tranziciju zemalja u regionu. U Republici Srbiji je stanje nešto bolje, ali svakako ispod očekivanja i s velikim kašnjenjima, pa su samim tim pozitivni efekti značajno slabiji. Stanje u BiH u posttranzicionom periodu je takvo da nedostaje institucionalna podrška i podsticajne mere agrarne politike na svim nivoima, počev od države preko kantona i opština, a tržišne reforme poljoprivrednog sektora nailaze na probleme i zastoje. Tranzicija poljoprivrede u BiH nije donela očekivane efekte, s obzirom da nije modernizovana, niti je zasnovana na efikasnosti, konkurentnosti, intenzivnosti i tržišnosti. Nešto bolja situacija je u Republici Srbiji, ali svakako nije razvijen sektor poljoprivrede, niti su u potpunosti ostvareni zacrtani ciljevi tranzicije."


In [41]:
# Write `filtered_data` to disk as a single UTF-8 JSON document, keeping non-ASCII
# characters unescaped.
# NOTE(review): `filtered_data` is not defined at notebook level in any visible cell
# (it only exists as a local inside get_data_science) — confirm which object is meant
# to be saved. The file carries a .jsonl extension but holds one JSON value, not one
# object per line. The SyntaxError recorded under this cell does not match this code
# and looks stale.
data_path = "../datasets/sceince_abstracts.jsonl"
with open(data_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(filtered_data, ensure_ascii=False))

SyntaxError: invalid syntax (1110953104.py, line 2)

In [46]:
def load_json(save_path):
    """Read the UTF-8 encoded JSON file at ``save_path`` and return the parsed object.

    Args:
        save_path: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(save_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

# load_json(data_path)

In [47]:
# Read the JSON file at the same path the save cell writes to; the parsed content
# becomes `contexts`.
contexts = load_json("../datasets/sceince_abstracts.jsonl")

In [None]:
# Collect the 'abstract_sr' value of every item in `new`.
# NOTE(review): `new` is not defined in any visible cell, so this raises NameError on a
# fresh kernel. It also rebinds `contexts` (loaded from disk in an earlier cell) to a
# list of bare 'abstract_sr' values, while the DataFrame cell further down indexes each
# element of `contexts` by 'title_en' etc. — running this cell first would break that one.
contexts = []
for i in new:
    text = i['abstract_sr']
    contexts.append(text)

In [None]:
# Spot-check `contexts`: one element, the total count, and the element type.
# Indexing position 100 requires at least 101 elements.
print(contexts[100])
print(len(contexts))
print(type(contexts[0]))

In [None]:
# Join each element of `contexts` into one space-separated string.
# This needs every element to be an iterable of strings (the shape produced by the
# 'abstract_sr' collection cell), not the dict records loaded from disk.
contexts_new = [" ".join(i) for i in contexts]
print(contexts_new[100:105])
print(len(contexts_new))
print(type(contexts_new[0]))

In [48]:
# Preview only the first element: displaying the whole list writes every record
# (full abstracts included) into the cell output and bloats the notebook file.
contexts[:1]

[{'title_sr': 'Poreklo i razvoj arhitektonske forme rimokatoličkih crkava XVIII i XIX veka u Vojvodini',
  'title_en': 'The origins and development of the architectural form of Roman Catholic churches in Vojvodina in the 18th and 19th century',
  'abstract_sr': ['Predmet rada je proučavanje arhitekture rimokatoličkih crkava građenih u periodu od 1699. do 1939. godine na području Vojvodine. U vremenima koja su usledila po završetku austro-turskih ratova i konačnog prestanka dominacije Otomanske imperije na prostoru Habsburške monarhije (1745- 1918, odnosno carevine Austrije od 1804. i Austro-Ugarske monarhističke unije od 1867), sa potpisivanjem Karlovačkog (1699) i Požarevačkog mira (1718), stvoreni su uslovi za obnovu južnih Ugarskih oblasti. Vreme društvenog i ekonomskog prosperiteta, koji je trajao sve do izbijanja Prvog svetskog rata, stvorilo je uslove za sveobuhvatnu ekspanziju graditeljskih aktivnosti. Kontinuitet izgradnje sakralnih građevina državno dominantne rimokatoličke ko

In [57]:
# Flatten each record of `contexts` into one row: titles are copied as-is, the abstract
# lists are joined into a single space-separated string, keyword lists are kept as lists.
# NOTE(review): the column names 'kewywords_en' / 'kewywords_sr' are misspelled; they are
# left unchanged here because other code may already select columns by these names.
rows = []
for record in contexts:
    rows.append({
        'title_en': record['title_en'],
        'title_sr': record['title_sr'],
        'abstract_en': ' '.join(record['abstract_en']),
        'abstract_sr': ' '.join(record['abstract_sr']),
        'kewywords_en': record['keywords_en'],
        'kewywords_sr': record['keywords_sr'],
    })

df = pd.DataFrame(rows)

# Show the first rows as the cell output
df.head()

Unnamed: 0,title_en,title_sr,abstract_en,abstract_sr,kewywords_en,kewywords_sr
0,The origins and development of the architectural form of Roman Catholic churches in Vojvodina in the 18th and 19th century,Poreklo i razvoj arhitektonske forme rimokatoličkih crkava XVIII i XIX veka u Vojvodini,"The aim of the paper is to present an architectonic study of Roman Catholic churches built in Vojvodina in years 1699. to 1939. The revitalization of the southern Hungarian region followed the end of the domination of the Ottoman Empire in the lands of the Habsburg Monarchy (1745-1918, Austrian Empire, 1804-1867. & Austro-Hungarian Empire, 1867-1918.) in the times after the Austro-Turkish wars and the signing of the Karlovci (1699.) and Pozarevac (1718.) Peace Treats. The period of political and economic stability, which lasted until the outbreak of the First World War, was favorable time for a comprehensive expansion of constructing activities. The continuity of the construction of religious buildings of the state-dominant Roman Catholic religion, can be followed after this period, until the outbreak of the Second World War (1939). For nearly two and a half centuries on the territory of Vojvodina, 231 Roman Catholic religious buildings were built, including 3 cathedrals, 184 parish churches and 44 covenant, family or cemetery chapels, in 166 towns and villages in Vojvodina. The main problems the research was focused in were the spatial concepts and correlation between form and function of these structures, origins and typology of architectural forms, the influence of the Central European centers and the surrounding areas and the ways in which discursive cultural, socio-political and religious climate manifested in architecture of studied ecclesiastical buildings. The theoretical framework of the research was a phenomenon of the contemporary, dominant national cultural policy - the discourse of official culture, art and architecture of the Monarchy (1745-1918.), later Austria (1804-1867.) and the Austro-Hungarian Empire (1867/1918.) 
on the one side and Christian theology, symbolism and liturgical paradigm as a determinant of architectural formation of the liturgical space on the other. A complex research process (22 archives and museums in 6 countries) and an extensive analysis of information on the researched subject (414 used and referent bibliography units and 88 units of published sources, archival documents and records and Schematisms were collected) preceded the definition of general theoretical knowledge. Photo and technical documentation of the interior and exterior of the current situation for most of the studied objects (162) was completed during three years terrain research period. Having studied a large number of examples and having observed them from different value aspects the exemplary that is representative buildings were sorted out. Beside formal archtectural nature, architectural expression, style and historical significance, the examples were chosen by the role of an sacral object in the urban space of a village as well as the role of the object within the church organization. As a final result of the research, a typological systematization of the studied objects was presented according the specific features of the spacial concept, the other formal architectonic characteristics, as well as according correlation of a building with its immediate and wider environment. Seven major typological groups were sorted out. The key factors that determine a type were defined as: a) plan development, as a result of the internal functional spatial organization, b) use of constructions in the function of an interior design, c) architectural composition that determines the mass of the outer volume and d) social climate and urban contextual factors. Architectural language in interior and exterior design, determined by dominant style in architecture, turns out not to be crucial for systematization of the architectural types. 
However, the dominance of certain styles in certain time periods determines not only the application of the style and decoration, but also the character of the internal space. Correlation of a sacral building to the wider environment had great importance in the study of transmission and flow of influence and allows geographic hedging and paving the main changes in the period. Applied Matrix method, including a systematic study of each researched object according eight established criteria, has completely confirmed the hypotheses set at the beginning. The investigation work conducted on the spatial concept of Roman Catholic churches built during the 18, 19. and IX the first decades of the 20. century (1699-1939.) in Vojvodina can identify the evolution of thought and the guiding principle of the spatial organization solutions and the dominant style in ecclesiastical architecture during the researched period.","Predmet rada je proučavanje arhitekture rimokatoličkih crkava građenih u periodu od 1699. do 1939. godine na području Vojvodine. U vremenima koja su usledila po završetku austro-turskih ratova i konačnog prestanka dominacije Otomanske imperije na prostoru Habsburške monarhije (1745- 1918, odnosno carevine Austrije od 1804. i Austro-Ugarske monarhističke unije od 1867), sa potpisivanjem Karlovačkog (1699) i Požarevačkog mira (1718), stvoreni su uslovi za obnovu južnih Ugarskih oblasti. Vreme društvenog i ekonomskog prosperiteta, koji je trajao sve do izbijanja Prvog svetskog rata, stvorilo je uslove za sveobuhvatnu ekspanziju graditeljskih aktivnosti. Kontinuitet izgradnje sakralnih građevina državno dominantne rimokatoličke konfesije, može se pratiti i nakon ovog perioda, sve do izbijanja Drugog svetskog rata (1939.). Tokom skoro dva i po veka izgrađen je na području Vojvodine 231 rimokatolički sakralni objekat, od čega 3 katedralna hrama, 184 župne crkve i 44 zavetne, porodične ili grobljanske kapele, u 166 vojvođanskih gradova i sela. 
Proučavanje prostornog koncepta i odnosa funkcije i forme ovih objekata, poreklo i tipologija arhitektonskih oblika, trasiranje uticaja iz centara srednje Evrope i neposrednog okruženja i naučno argumentovano utvrđivanje i objašnjavanje načina na koji se diskurzivna kulturološka, društveno-politička i religiozna klima manifestovala u arhitekturi proučavanih crkvenih građevina, osnovni su problemi na koje je istraživanje fokusirano. Teorijski okvir istraživanju predstavlja fenomen savremene, dominantne državne kulturne politike – diskurs oficijelne kulture, umetnosti i arhitekture monarhije (Austrije i Austro-Ugarske) sa jedne strane, i hrišćanske teologije, simbolike i liturgijskih paradigmi kao odrednica arhitektonskog bogoslužbenog V prostora sa druge. Definisanju opštih teorijskih znanja prethodio je kompleksan istraživački proces (22 arhiva i muzeja u 6 zemalja) i obimna analiza informacija o predmetu istraživanja (korišćena i referentna literatura obuhvata 414 bibliografke jedinice i 88 jedinica objavljenih izvora, evidencije arhivske građe i šematizama). U periodu od tri godine za većinu proučavanih objekata (162) urađena je kompletna fotodokumentacija enterijera i eksterijera i delimična tehnička dokumentacija postojećeg stanja. Proučavanjem velikog broja primera i njihovom procenom iz ugla različitih vrednosnih aspekata izvojili su se uzorni / reprezentativni objekti, na šta je pored formalih odlika arhitekture, arhitektonskog izraza, stila i istorijskog značaja, uticala i uloga objekta u okvirima urbanog profila naselja u kome je hram izgrađen i uloga objekta u okvirima crkvene organizacije. Kao konačni rezultat istraživačkog postupka urađena je, kroz sagledavanje procesa razvoja crkvenih građevina, tipološka sistematizacija istraživanih objekata, prema specifičnostima prostornog koncepta i drugih formalnih odlika arhitekture, kao i odnosa građevine sa njenim neposrednim i širim okruženjem, u sedam osnovnih tipoloških grupa. 
Konstatovano je da su ključni faktori koji određuju tip: a) razvoj plana, kao rezultat unutrašnje funkcionalne organizacije prostora, b) upotreba konstrukcije u funkciji oblikovanja unutrašnjeg prostora, c) arhitektonska kompozicija masa koja određuje spoljni volumen i d) društvena klima i urbani kontekstualni faktori. Arhitektonski jezik enterijera i eksterijera, koji su određeni vladajućim stilom u arhitekturi, nisu od presudnog značaja za sistematizaicju arhitektonskih tipova, ali dominacija određenih stilova u pojedinim vremenskim periodima određuje ne samo primenu stilsko-dekorativnog programa nego i budući karakter unutrašnjeg prostora. Odnos građevine prema širem okruženju ima velikog značaja kod proučavanja načina i tokova prenošenja uticaja i omogućava da se geografski omeđe i trasiraju glavne promene u istraživanom periodu. Primenjeni matrični metod, koji uključjue analizu svakog proučavanog objekta po osnovu osam utvrđenih kriterijuma u celosti je potvrdio postavljenu hipotezeu da se istraživanjem prostornog koncepta rimokatoličkih crkava podignutih u periodu 18, 19. i prvih decenija 20. veka (1699-1939) na području Vojvodine može se pratiti evolucija u shvatanju organizacije prostornog rešenja crkvene građevine i vladajućih stilskih vrednosti tokom posmatrnog perioda.","[Sacral architecture, roman catholic church, architectural typology, new era, barock, classicism, neogothic]","[Sakralna arhitektura, rimokatlička Crkva, arhitektonska tipologija, Liturgija, barok, klasicizam, neogotika]"
1,Belgrade's General Plan 1923: a comparison of planned to completed.,Generalni plan Beograda 1923 komparacija planiranog i ostvarenog,"The doctoral dissertation examines the process of preparing, elaborating and implementing Belgrade's General Urban Plan...","U doktorskoj disertaciji je istraživan proces pripreme, izrade i sprovođenja Generalnog plana Beogrda iz 1923-24. godine...","[History, theory, urban lanning, general plan, vision]","[istorija, teorija, urbanističko planiranje, generalni plan, Vizije, realizacije]"
2,The importance and role of pedestrian space network i generating the competitive identity of city,Značaj i uloga mreže pešačkih prostora u generisanju kompetitivnog identiteta grada,"City represents its character through urban/place marketing by using their tools, which is one of the most important activities of local government in its development process. Communication, as one of the marketing tools, is achieved through the products made in the variety of design processes, where urban design is also classified. Elements that are relevant in the communication of the city can be physical factors of its environment, including the pedestrian space. The image of the city represents opinion or experience of the audience/users, based not on the good intentions or marketing attractive commercial messages, but rather on the real situation. This concept provides an opportunity for the analysis of the actual pedestrian space network’s quality. In this way, the level of social responsibility of local government can be determined, which influence the creation of certain audience/users attitudes, as well as the image of the city. Furthermore, the importance of pedestrian spaces network in creating, fostering and improving the competitive identity of the city represents the specific field of application of the research results in the strategic decision-making and planning. The validation of the determined relation was made on the examples of pedestrian space networks of five European cities, which were singled out as the relevant representatives of different categories set out by ESPON. Based on the results of these five case studies, the models were created, as well as recommendations and guidelines for improving pedestrian space network in the Belgrade center.","Svoj karakter grad predstavlja putem urbanog marketinga, odnosno marketinga mesta, koristeći se njihovim instrumentima, što je jedna od najznačajnijih delatnosti lokalne uprave u okviru razvojnog procesa. 
Kao jedan od marketinških instrumenata koji se koristi jeste komunikacija ostvarena putem proizvoda nastalih u različitim dizajnerskim procesima, gde se svrstavaju i proizvodi urbanog dizajna. Elementi koji se uočavaju u komunikaciji grada mogu biti fizički činitelji njegovog prostornog okvira, među kojima i pešački prostor. Imidž grada predstavlja stav ili doživljaj publike koji se zasniva na realizovanom stanju, a ne na prezentovanju dobrih namera i plasiranju atraktivnih propagandnih poruka. Tako određen pojam daje mogućnost da se analizom aktuelnog kvaliteta mreže pešačkih prostora može utvrditi nivo društvene odgovornosti lokalne uprave, što utiče na formiranje stavova publike, pa samim tim i imidža određenog grada. Pored toga uočavanjem značaja mreže pešačkih prostora u kreiranju, negovanju i unapređenju kompetitivnog identiteta grada osvetljava se polje konkretne primene rezultata istraživanja u strateškom odlučivanju i planiranju. Provera utvrđenog odnosa rađena je na primerima mreže pešačkih prostora u centru pet evropskih gradova koji su izdvojeni kao relevantni predstavnici različitih kategorija gradova utvrđenim na osnovu ESPON kategorizacije. Na osnovu njih formirani su modeli, kao i preporuke i smernice unapređenja i širenja mreže peščakih prostora. Poseban doprinos ostvaren je i u vidu utvrđivanja preporuka i smernica za unapređenje mreže peščakih prostora centra Beograda.","[Pedestrian space network, communication of the city, image of the city, competitive identity of the city, Paris, Munich, Vienna, Malme, Ljubljana, Belgrade]","[Mreža pešačkih prostora, komunikacija grada, imidž grada, kompetitivni identitet grada, Pariz, Minhen, Beč, Malme, Ljubljana, Beograd.]"
3,Adaptive principles in architectural design,Adaptivni principi u arhitektonskom projektovanju,This study is aiming at the development of architectural workflowthrough better understanding of the role of the adaptive principles in the conception and the production of the architectural models...,Ovaj rad predstavlja prilog razvoju metodologije arhitektonskog projektovanja i bavi se proučavanjem adaptivnih principa u koncepciji i realizaciji arhitektonskih modela...,"[adaptation, analogue model, performative model, proyotypical model, relational modeling, genetic algoritm]","[adaptacija, analogni model, perfomativni model, prototipski model, relaciono modelovanje, genetski algoritam]"
4,Transformation of vernacular architecture on the Vrmac peninsula - Bay of Kotor during the XX century,Transformacija narodne arhitekture na poluostrvu Vrmac - Boka Kotorska u 20. veku,"Vrmac Peninsula is located in the Bay of Kotor. By its position it belongs to the costal part of the Balkans. The subject matter of this research is the development and transformation of different models of vernacular dwellings and house remains on the Vrmac Peninsula (Bay of Kotor) during the XX century. The goal of this study is to gain insight into the impact that social changes, economic development, demographics, transformation of the urban structure of the settlements, enhancement of building materials and introduction of new structures, as well as professional skills and building ideas have on the spatial characteristics and the shape of the traditional (vernacular) residential architecture in the region. Typological research of the spatial structure of the residential buildings proved that the traditional types of vernacular civil engineering in this region were influenced by urban architecture. The Mediterranean models and standards were followed, but suited to the needs and possibilities of the dwellers. Typological analysis of architectural forms and patterns resulted in the definition of the causes that led to the changes through different phases until the end of the XX century, as well as the discovery of the origins of some forms of interior arrangement in the residential units and their configuration. Façade composition analysis led to some conclusions related to the building layout, parcel and foundation forms, as well as the spatial arrangement and their influence on the formation of the exterior style. Regarded as a whole, the main result of the research was the conclusion that during the analyzed period the transformation of the vernacular dwellings and house remains was spontaneous without any interventions of regulatory rules. 
More complex changes were manifested in the interior spatial arrangement and to a lesser degree in the façade forms where the changes were basically inadequate.","Poluostrvo Vrmac nalazi se u Bokokotorskom zalivu, a svojim položajem pripada primor- skom delu Balkanskog poluostrva. Predmet istraživanja ovog rada je razvoj i transformacija različitih modela narodne ku- će i kućišta na poluostrvu Vrmac (Boka kotorska) u 20. veku. Cilj ovog rada bio je da se sagleda uticaj društvenih promena, ekonomskog razvoja, demo- grafskih kretanja, transformacije urbane strukture naselja, unapređenje građevinskih mate- rijala i uvođenje novih konstrukcija, kao i stručnih sposobnosti i ideja graditelja na prostorne i oblikovne karakteristike stambene narodne arhitekture ovoga kraja. Sprovedena tipološka proučavanja prostorne strukture stambenih jedinica pokazala su da su tradicionalni oblici narodnog graditeljstva na ovom prostoru bili pod velikim uticajem građanske arhitekture. Oni su sledili mediteranske uzore i standarde, ali prilagođene po- trebama i mogućnostima domaćih korisnika. Tipološke analize omogućile su i da se putem analize oblika i forme definišu uzroci promena koje su nastale od momenta postanka kuća, kroz različite etape do kraja 20. veka, kao i da se otkrije poreklo pojedinih oblika unutrašnje organizacije stambenih jedinica i nji- hovog sklopa. Analiza kompozicije fasada omogućila je donošenje određenih zaključaka vezanih za uti- caj dispozicije kuće, oblika parcele i osnove, kao i unutrašnje prostorne organizacije na oblikovanje spoljašnjeg izgleda. Posmatrano u celini, može se reći da je, kao glavni rezultat rada, proizašao zaključak da je tokom posmatranog perioda transformacija narodnih kuća i kućišta bila spontana i bez upliva zakonske regulative. 
Složenije promene ispoljile su se kod unutrašnje prostorne organizacije, a manje kod ob- likovanja fasada, na kojima su promene uglavnom bile neadekvatne.","[peninsula Vrmac, vernacular transformation, civil engineering legacy, residential achitecture.]","[poluostrvo Vrmac, transformacija narodnih kuća, graditeljsko nasleđe, stambena arhitektura]"


In [59]:
# Write `df` to a Parquet file; the path is relative to the current working directory.
# NOTE(review): the recorded execution counts show this cell was last run after the
# checks that follow it — confirm the notebook works top-to-bottom on a fresh kernel.
df.to_parquet("../datasets/science_parallel.parquet")

In [53]:
# Count the missing values in each column of `df`.
df.isna().sum()

title          0
abstract_en    0
abstract_sr    0
dtype: int64

In [54]:
# Dimensions of `df` as (rows, columns).
df.shape

(13289, 3)

In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(10, random_state=42)

In [None]:
# Collect the Serbian title of every entry. A missing title falls back to an empty
# string (not a list) so every element is a str — the titles are later fed to the
# tokenizer's `encode`, which operates on strings.
# NOTE(review): `new` is not defined in the visible cells — confirm it is created
# earlier in the notebook.
titles = [entry.get('title_sr', '') for entry in new]
print(titles[:5])
print(len(titles))

In [None]:
def tokenize_and_count_length(texts, model="text-embedding-3-small"):
    """Tokenize each text with tiktoken and summarise the token-length distribution.

    Args:
        texts: Iterable of strings to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to select
            the encoding.

    Returns:
        A 7-tuple ``(tokenized_texts, token_counts, mean, std, maximum,
        minimum, second_minimum)``:

        * ``tokenized_texts`` -- one list of token ids per input text.
        * ``token_counts`` -- number of tokens per input text.
        * ``mean``, ``std``, ``maximum`` -- statistics over all token counts
          (zero-length texts included); ``0.0``, ``0.0`` and ``0`` for empty input.
        * ``minimum`` -- smallest non-zero token count, or ``0`` if there is none.
        * ``second_minimum`` -- second-smallest *distinct* non-zero token count;
          equals ``minimum`` when fewer than two distinct non-zero counts exist.

    Side effects:
        Prints every text whose token count is below 100.
    """
    # Materialise once so the texts can be walked twice even if a generator is passed.
    texts = list(texts)
    tokenizer = tiktoken.encoding_for_model(model)

    tokenized_texts = [tokenizer.encode(text) for text in texts]
    token_counts = [len(tokens) for tokens in tokenized_texts]

    # Surface unusually short texts for manual inspection.
    for text, token_count in zip(texts, token_counts):
        if token_count < 100:
            print(text)

    if token_counts:
        mean = np.mean(token_counts)
        std = np.std(token_counts)
        maximum = np.max(token_counts)
    else:
        # np.max raises on an empty sequence; report neutral statistics instead.
        mean = std = 0.0
        maximum = 0

    # Deduplicate before picking the two smallest values: several texts may share
    # the same length, in which case there is no distinct second minimum.
    distinct_counts = sorted({count for count in token_counts if count > 0})
    if len(distinct_counts) >= 2:
        minimum, second_minimum = distinct_counts[:2]
    elif distinct_counts:
        minimum = second_minimum = distinct_counts[0]
    else:
        minimum = second_minimum = 0

    return tokenized_texts, token_counts, mean, std, maximum, minimum, second_minimum

In [None]:
# Token statistics for the texts in `contexts_new`.
# NOTE(review): `contexts_new` is not defined in the visible cells — confirm its origin.
(
    tokenized_texts,
    token_counts,
    mean,
    std,
    maximum,
    minimum,
    second_minimum,
) = tokenize_and_count_length(contexts_new)

In [None]:
# Raw inspection: first 100 token-id lists, number of texts, and all per-text counts.
print(
    tokenized_texts[:100],
    len(tokenized_texts),
    token_counts,
    sep="\n",
)
# Summary statistics of the token counts, one per line.
print(
    f"Mean: {mean}",
    f"STD: {std}",
    f"Maximal text length in tokens: {maximum}",
    f"Minimal text length in tokens: {minimum}",
    f"Second smallest length: {second_minimum} ",
    sep="\n",
)

In [None]:
# Token statistics for the collected titles.
(
    tokenized_title,
    token_counts_title,
    mean_title,
    std_title,
    maximum_title,
    minimum_title,
    second_minimum_title,
) = tokenize_and_count_length(titles)

In [None]:
# Raw inspection: first 100 token-id lists, number of titles, and all per-title counts.
print(
    tokenized_title[:100],
    len(tokenized_title),
    token_counts_title,
    sep="\n",
)
# Summary statistics of the title token counts, one per line.
print(
    f"Mean: {mean_title}",
    f"STD: {std_title}",
    f"Maximal text length in tokens: {maximum_title}",
    f"Minimal text length in tokens: {minimum_title}",
    f"Second smallest length: {second_minimum_title} ",
    sep="\n",
)