In [1]:
import pandas as pd
import numpy as np
import re
import os, sys
import tiktoken
sys.path.append("..")

In [2]:
# pd.set_option('display.max_colwidth', None)  # None means no limit on column width
# pd.set_option('display.width', 300)  

sci = pd.read_parquet("../datasets/science_parallel.parquet")
sci.head()

Unnamed: 0,title_en,title_sr,abstract_en,abstract_sr,kewywords_en,kewywords_sr
0,The origins and development of the architectur...,Poreklo i razvoj arhitektonske forme rimokatol...,The aim of the paper is to present an architec...,Predmet rada je proučavanje arhitekture rimoka...,"[Sacral architecture, roman catholic church, a...","[Sakralna arhitektura, rimokatlička Crkva, arh..."
1,Belgrade's General Plan 1923: a comparison of ...,Generalni plan Beograda 1923 komparacija plani...,The doctoral dissertation examines the process...,U doktorskoj disertaciji je istraživan proces ...,"[History, theory, urban lanning, general plan,...","[istorija, teorija, urbanističko planiranje, g..."
2,The importance and role of pedestrian space ne...,Značaj i uloga mreže pešačkih prostora u gener...,City represents its character through urban/pl...,Svoj karakter grad predstavlja putem urbanog m...,"[Pedestrian space network, communication of th...","[Mreža pešačkih prostora, komunikacija grada, ..."
3,Adaptive principles in architectural design,Adaptivni principi u arhitektonskom projektovanju,This study is aiming at the development of arc...,Ovaj rad predstavlja prilog razvoju metodologi...,"[adaptation, analogue model, performative mode...","[adaptacija, analogni model, perfomativni mode..."
4,Transformation of vernacular architecture on t...,Transformacija narodne arhitekture na poluostr...,Vrmac Peninsula is located in the Bay of Kotor...,Poluostrvo Vrmac nalazi se u Bokokotorskom zal...,"[peninsula Vrmac, vernacular transformation, c...","[poluostrvo Vrmac, transformacija narodnih kuć..."


In [3]:
sci['abstract_en'] = sci['abstract_en'].replace('', np.nan)
sci['abstract_sr'] = sci['abstract_sr'].replace('', np.nan)
sci_cleaned = sci.dropna(subset=['abstract_en', 'abstract_sr'], ignore_index=True)
sci_cleaned.shape

(9962, 6)

In [4]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Calculates the number of tokens in a string based on a specified encoding.

    The function uses the specified encoding to tokenize the input string and returns the number of tokens generated.

    Parameters:
    string (str): The input string to be tokenized.
    encoding_name (str): The name of the encoding to use for tokenization.

    Returns:
    int: The number of tokens in the input string based on the specified encoding.

    Example:
    >>> string = "This is a sample sentence."
    >>> encoding_name = "cl100k_base"
    >>> num_tokens = num_tokens_from_string(string, encoding_name)
    >>> print(num_tokens)
    """
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [6]:
tokens_sr_processed = [num_tokens_from_string(i, "cl100k_base") for i in sci_cleaned['abstract_sr']]

In [7]:
print(f"Mean (tokens): {np.mean(tokens_sr_processed)}")
print(f"Max length (tokens): {np.max(tokens_sr_processed)}")
print(f"Min length (tokens): {np.min(tokens_sr_processed)}")


Mean (tokens): 880.2607910058222
Max length (tokens): 11258
Min length (tokens): 16


In [10]:
def tokenize_and_truncate(text, encoding_name="cl100k_base", max_tokens=512):
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    num_tokens = len(tokens)
    if num_tokens > max_tokens:
        tokens = tokens[:max_tokens]  # Truncate to 512 tokens
    processed_abstract = encoding.decode(tokens)  # Decode back to text
    if not processed_abstract.endswith('.'):
        last_period_index = processed_abstract.rfind('.')
        if last_period_index != -1:
            processed_abstract = processed_abstract[:last_period_index + 1]  # Keep everything up to the last "."
            
    return processed_abstract


In [11]:
sci_cleaned['processed_abstract_sr'] = sci_cleaned["abstract_sr"].apply(tokenize_and_truncate)
sci_cleaned['processed_abstract_en'] = sci_cleaned["abstract_en"].apply(tokenize_and_truncate)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sci_cleaned['processed_abstract_sr'] = sci_cleaned["abstract_sr"].apply(tokenize_and_truncate)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sci_cleaned['processed_abstract_en'] = sci_cleaned["abstract_en"].apply(tokenize_and_truncate)


In [12]:
sci_cleaned.head()

Unnamed: 0,title_en,title_sr,abstract_en,abstract_sr,kewywords_en,kewywords_sr,processed_abstract_sr,processed_abstract_en
0,The origins and development of the architectur...,Poreklo i razvoj arhitektonske forme rimokatol...,The aim of the paper is to present an architec...,Predmet rada je proučavanje arhitekture rimoka...,"[Sacral architecture, roman catholic church, a...","[Sakralna arhitektura, rimokatlička Crkva, arh...",Predmet rada je proučavanje arhitekture rimoka...,The aim of the paper is to present an architec...
1,Belgrade's General Plan 1923: a comparison of ...,Generalni plan Beograda 1923 komparacija plani...,The doctoral dissertation examines the process...,U doktorskoj disertaciji je istraživan proces ...,"[History, theory, urban lanning, general plan,...","[istorija, teorija, urbanističko planiranje, g...",U doktorskoj disertaciji je istraživan proces ...,The doctoral dissertation examines the process...
2,The importance and role of pedestrian space ne...,Značaj i uloga mreže pešačkih prostora u gener...,City represents its character through urban/pl...,Svoj karakter grad predstavlja putem urbanog m...,"[Pedestrian space network, communication of th...","[Mreža pešačkih prostora, komunikacija grada, ...",Svoj karakter grad predstavlja putem urbanog m...,City represents its character through urban/pl...
3,Adaptive principles in architectural design,Adaptivni principi u arhitektonskom projektovanju,This study is aiming at the development of arc...,Ovaj rad predstavlja prilog razvoju metodologi...,"[adaptation, analogue model, performative mode...","[adaptacija, analogni model, perfomativni mode...",Ovaj rad predstavlja prilog razvoju metodologi...,This study is aiming at the development of arc...
4,Transformation of vernacular architecture on t...,Transformacija narodne arhitekture na poluostr...,Vrmac Peninsula is located in the Bay of Kotor...,Poluostrvo Vrmac nalazi se u Bokokotorskom zal...,"[peninsula Vrmac, vernacular transformation, c...","[poluostrvo Vrmac, transformacija narodnih kuć...",Poluostrvo Vrmac nalazi se u Bokokotorskom zal...,Vrmac Peninsula is located in the Bay of Kotor...


In [13]:
tokens_sr_processed = [num_tokens_from_string(i, "cl100k_base") for i in sci_cleaned['processed_abstract_sr']]

idxs_sr = []
for idx, i in enumerate(tokens_sr_processed):
    if i == 0:
        idxs_sr.append(idx)

print(f"Mean (tokens): {np.mean(tokens_sr_processed)}")
print(f"Max length (tokens): {np.max(tokens_sr_processed)}")
print(f"Min length (tokens): {np.min(tokens_sr_processed)}")
print(f"Missing: {len(idxs_sr)}")

Mean (tokens): 431.1939369604497
Max length (tokens): 512
Min length (tokens): 16
Missing: 0


In [14]:
tokens_en = [num_tokens_from_string(i, "cl100k_base") for i in sci_cleaned['processed_abstract_en']]

idxs_en = []
for idx_en, i in enumerate(tokens_en):
    if i == 0:
        idxs_en.append(idx)

print(f"Mean (tokens): {np.mean(tokens_en)}")
print(f"Max length (tokens): {np.max(tokens_en)}")
print(f"Min length (tokens): {np.min(tokens_en)}")
print(f"Missing: {len(idxs_en)}")

Mean (tokens): 370.8576591045975
Max length (tokens): 512
Min length (tokens): 8
Missing: 0


In [16]:
def extract_first_sentence(abstract):
    return abstract.split('.')[0] + '.' if '.' in abstract else abstract

In [17]:
df_sentences = pd.DataFrame({
    'sentence_en': sci_cleaned['processed_abstract_en'].apply(extract_first_sentence),
    'sentence_sr': sci_cleaned['processed_abstract_sr'].apply(extract_first_sentence)
})

In [22]:
df_sentences.head()

Unnamed: 0,sentence_en,sentence_sr
0,The aim of the paper is to present an architec...,Predmet rada je proučavanje arhitekture rimoka...
1,The doctoral dissertation examines the process...,U doktorskoj disertaciji je istraživan proces ...
2,City represents its character through urban/pl...,Svoj karakter grad predstavlja putem urbanog m...
3,This study is aiming at the development of arc...,Ovaj rad predstavlja prilog razvoju metodologi...
4,Vrmac Peninsula is located in the Bay of Kotor.,Poluostrvo Vrmac nalazi se u Bokokotorskom zal...


In [25]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the punkt tokenizer if you haven't already
nltk.download('punkt')

sentences_sr = [sent_tokenize(text) for text in sci_cleaned['processed_abstract_sr']]

print(sentences_sr)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/teamihajlov/nltk_data'
    - '/Users/teamihajlov/Projects/SRBedding/.venv/nltk_data'
    - '/Users/teamihajlov/Projects/SRBedding/.venv/share/nltk_data'
    - '/Users/teamihajlov/Projects/SRBedding/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
