In [None]:
%pip install pathlib
%pip install pandas
%pip install tqdm
%pip install python-dotenv
%pip install langchain
%pip install transformers
%pip install sentence-transformers
%pip install sentencepiece

In [1]:
import os

from dotenv import find_dotenv, load_dotenv

# Load environment variables from the .env file located by find_dotenv().
# The return value is bound to `_` so the cell does not echo it as output.
_ = load_dotenv(find_dotenv())

In [2]:
import pandas as pd

# Read the line-level table from the working directory; the trailing
# expression displays its (rows, columns) shape as the cell output.
# NOTE(review): the recorded output echoes this statement, which looks like a
# pandas warning (possibly mixed column dtypes) - confirm and consider passing
# an explicit `dtype=` mapping.
df = pd.read_csv(filepath_or_buffer='lines.csv')
df.shape

  df = pd.read_csv('lines.csv')


(94917, 13)

In [3]:
from transformers import LlamaTokenizer

# Access token read from the HF_AUTH environment variable (None when unset).
hf_auth = os.environ.get('HF_AUTH')

# Tokenizer of the Llama-2 13B chat model; it is only used for counting tokens.
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    token=hf_auth,
)



In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def token_len(text):
    """Return the number of token ids the module-level `tokenizer` yields for `text`.

    NOTE(review): `encode` is called with its defaults, so any special tokens
    the tokenizer adds by default are presumably included in the count -
    confirm if exact budgets matter.
    """
    return len(tokenizer.encode(text))

# Splitter whose size limits are measured with `token_len`, i.e. in tokenizer
# tokens rather than characters. The separators are listed from coarsest
# (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', ' ', ''],
    length_function=token_len,
    chunk_size=400,
    chunk_overlap=50,
)

In [5]:
# Build one space-joined string per CELEX number from that group's `text` column.
texts = []
for celexNumber, group in df.groupby('CELEX number'):
    text = ' '.join(map(str, group['text']))
    texts.append(text)

# Token count of every joined document, followed by summary statistics.
token_counts = list(map(token_len, texts))
min_tokens = min(token_counts)
max_tokens = max(token_counts)
avg_tokens = int(sum(token_counts) / len(token_counts))  # truncated, not rounded

print(f"Min: {min_tokens}\nAvg: {avg_tokens}\nMax: {max_tokens}")

Min: 269
Avg: 8849
Max: 126133


In [6]:
def getDeepestInfoLevel(row):
    """Return the numeric level of the deepest structural column filled in `row`.

    Columns are probed from deepest to shallowest; the first one whose value
    is not NA (per `pd.isna`) decides the result:

        number6 -> 10, number5 -> 9, number4 -> 8, number3 -> 7,
        number2 -> 6, number1 -> 5, article -> 4, section -> 3

    When every column is NA the result is also 3, so 3 is the minimum level.

    Parameters:
        row: mapping-like object indexable by the column names above.

    Returns:
        int between 3 and 10 inclusive.
    """
    columns_deepest_first = (
        'number6', 'number5', 'number4', 'number3',
        'number2', 'number1', 'article', 'section',
    )
    for offset, column in enumerate(columns_deepest_first):
        if not pd.isna(row[column]):
            # 'number6' sits at offset 0 and maps to level 10, and so on down.
            return 10 - offset
    return 3

# Maps a numeric level (3..10) to the column that holds the label of that
# level, ordered from shallowest (section) to deepest (number6).
levelDict = dict(enumerate(
    [
        'section',
        'article',
        'number1',
        'number2',
        'number3',
        'number4',
        'number5',
        'number6',
    ],
    start=3,
))

def enumerationToText(Es):
    """Render a list of enumeration labels as one human-readable string.

    Layout of `Es`: index 0 is the section title, index 1 the article, and
    every later index a nested enumeration label.

    * The section is always emitted (when `Es` is non-empty), converted with
      `str` and truncated to its first 20 characters.
    * The article is prefixed with ", Article ", deeper labels with ", ".
    * Labels after the section are skipped when they are None, render as an
      empty string, or are NA according to `pd.isna`.
    * Floats holding a whole number are printed without the trailing ".0".

    Returns "" for an empty `Es`.
    """
    def _is_present(value):
        # Same three checks, in the same order, as the label filter requires;
        # `!= None` (rather than `is not None`) is kept on purpose.
        return value != None and str(value) != "" and not pd.isna(value)

    def _render(value):
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    if len(Es) == 0:
        return ""

    pieces = ["Section: '" + str(Es[0])[:20] + "'"]
    for position in range(1, len(Es)):
        value = Es[position]
        if not _is_present(value):
            continue
        prefix = ", Article " if position == 1 else ", "
        pieces.append(prefix + _render(value))
    return "".join(pieces)

def compareEnumberations(newEs, currentEs, newSectionID, currentSectionID):
    """Tell whether two enumeration paths are the same and, if not, where they diverge.

    Parameters:
        newEs, currentEs: lists of labels where index 0 is the section, index 1
            the article and later indices nested enumeration labels.
        newSectionID, currentSectionID: section identifiers of the two rows.

    Returns:
        (True, None) when the section ids are equal and every shared position
        from index 1 onward holds equal labels.
        (False, level) otherwise, where `level` is 3 for a section change or
        `index + 3` for the first differing position (4 = article, 5 = number1, ...).

    Only the overlapping prefix of the two lists is compared, so lists of
    different length that agree on that prefix count as the same.

    NOTE(review): labels are compared with `!=`, so two NaN placeholders at the
    same position count as different (NaN != NaN) - confirm that is intended.
    """
    if newSectionID != currentSectionID:
        return False, 3

    # Index 0 (the section title) is skipped: section identity is decided by
    # the section ids above.
    for i in range(1, min(len(newEs), len(currentEs))):
        if newEs[i] != currentEs[i]:
            return False, i + 3

    # Previously this returned the unrelated name `_`, which is never defined
    # here and only resolved by accident (a notebook global / IPython's
    # last-output variable); there is no differing level when the paths match.
    return True, None

In [7]:
# Accumulator for the chunk records (one dict per chunk) built by the chunking cell.
documents = list()

In [8]:
from tqdm import tqdm

# A pending chunk is written out only once it holds more than this many
# tokens; shorter chunks keep absorbing the following rows instead.
MIN_TOKENS_BEFORE_FLUSH = 200


def _append_chunk_records(records, chunks, celex_number, extras, next_id):
    """Append one record per chunk to `records` and return the next unused id.

    Every record carries the running `id`, the document's CELEX number, the
    chunk text and the (shared) `extras` value.
    """
    for chunk in chunks:
        records.append({
            'id': next_id,
            'CELEX number': celex_number,
            'text': chunk,
            'extras': extras,
        })
        next_id += 1
    return next_id


# Start from a clean slate: the ids restart at 0 below, so appending to a list
# left over from an earlier run of this cell would duplicate every record.
documents = []
entryID = 0

for celexNumber, group in tqdm(df.groupby('CELEX number')):
    # Unstructured case: no row has a sectionID, so the joined text is simply
    # split by the token-based splitter and stored without extras.
    if group['sectionID'].isna().all():
        text = ' '.join([str(line) for line in group['text']])
        chunks = text_splitter.split_text(text)
        entryID = _append_chunk_records(documents, chunks, celexNumber, None, entryID)
        continue

    # Structured case: walk the rows in order and grow `currentChunk` until a
    # structural boundary is reached AND the chunk is long enough to flush.
    currentLevel = 2            # below the minimum level (3), so the first row opens a new unit
    currentEnumberations = []
    currentChunk = ""
    currentSectionID = 1

    # Labels of the structural units contributing to the pending chunk. A dict
    # is used as an insertion-ordered set: joining a plain set of strings gives
    # a different order from run to run because string hashing is randomised.
    currentExtras = {}

    for _, row in group.iterrows():
        newLevel = getDeepestInfoLevel(row)
        newEnumberations = [row[levelDict[i]] for i in range(3, newLevel + 1)]
        newSectionID = row['sectionID']
        areEnumberationsSame, differringLevel = compareEnumberations(
            newEnumberations, currentEnumberations, newSectionID, currentSectionID
        )
        currentSectionID = newSectionID
        rowText = str(row['text'])

        sameLevel = newLevel == currentLevel
        if sameLevel and areEnumberationsSame:
            # Continuation of the very same unit.
            currentChunk += '\n' + rowText
        elif sameLevel and differringLevel == currentLevel and differringLevel > 4:
            # Sibling item that differs only in its deepest label (number1 or
            # deeper): keep it in the chunk, prefixed with its own label.
            currentChunk += '\n' + str(newEnumberations[-1]) + ' ' + rowText
        else:
            # Any other change of level or path is a boundary: flush the
            # pending chunk if it is long enough, otherwise keep growing it.
            if token_len(currentChunk) > MIN_TOKENS_BEFORE_FLUSH:
                chunks = text_splitter.split_text(currentChunk)
                entryID = _append_chunk_records(
                    documents, chunks, celexNumber, "; ".join(currentExtras), entryID
                )
                currentChunk = rowText
                currentExtras = {}
            else:
                currentChunk += '\n' + rowText
            currentLevel = newLevel
            currentEnumberations = newEnumberations

        currentExtras[enumerationToText(currentEnumberations)] = None

    # Whatever is still pending at the end of the document is always emitted.
    chunks = text_splitter.split_text(currentChunk)
    entryID = _append_chunk_records(
        documents, chunks, celexNumber, "; ".join(currentExtras), entryID
    )

documents_df = pd.DataFrame(documents)
documents_df

100%|██████████| 537/537 [03:02<00:00,  2.93it/s]


Unnamed: 0,id,CELEX number,text,extras
0,0,21975A1201(01),Avis juridique important Cooperation Agreement...,
1,1,21975A1201(01),EUROPEAN ATOMIC ENERGY COMMUNITY AND THE INTER...,
2,2,21975A1201(01),Contracting Parties shall consult each other r...,
3,3,21975A1201(01),with respect to items on their agenda in which...,
4,4,21975A1201(01),confidential nature of certain information and...,
...,...,...,...,...
25792,25792,32023R2633,(1) The Annex is subject to the pro rata obli...,Section: 'ANNEX Footnotes'; Section: 'Document'
25793,25793,42009D0913,DECISION TAKEN BY COMMON AGREEMENT BETWEEN THE...,"Section: 'Document', (1); Section: 'Document'"
25794,25794,42009D0913,The location of the seat of this Agency should...,"Section: 'Document', Article 2; Section: 'Docu..."
25795,25795,42010D0349,DECISION TAKEN BY COMMON ACCORD BETWEEN THE RE...,"Section: 'Document', (1); Section: 'Document',..."


In [9]:
# Write the chunk table to chunks.csv (the DataFrame index is written as well).
from pathlib import Path

filepath = Path('chunks.csv')
# Create the target directory first in case the path is later changed to
# point into a folder that does not exist yet.
filepath.parent.mkdir(exist_ok=True, parents=True)
documents_df.to_csv(path_or_buf=filepath)

In [10]:
# Number of distinct CELEX numbers that ended up in the chunk table.
print(f"files processed:{documents_df['CELEX number'].nunique()}")

files processed:537
