## LLama 13b chat Dutch validation task


https://huggingface.co/BramVanroy/Llama-2-13b-chat-dutch

* input dataframe with text pairs and additional info LLama should use to make informed decision


In [5]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch import cuda, bfloat16
import transformers
import pandas as pd
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

In [6]:
torch.clear_autocast_cache

<function torch.clear_autocast_cache>

In [3]:
model_id = 'BramVanroy/Llama-2-13b-chat-dutch'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
print(f"Model loaded on {device}")



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [7]:
device = torch.device('cuda')
print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", torch.cuda.memory_allocated(device) / 1024 ** 3, "GB")
print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / 1024 ** 3, "GB")

GPU Name: NVIDIA A10
Memory Usage: 3.559241771697998 GB
Max Memory Usage: 3.604752540588379 GB


In [8]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max we do not want any randomness here as we want the model to stick to the prompt as closely as possible
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

llm = HuggingFacePipeline(pipeline=generate_text)

### Tokenize the text or at least check for average token length


In [7]:
# Define the token_len function
def token_len(text):
    tokens = tokenizer.encode(
        text
    )
    return len(tokens)

In [8]:
#read in test data
df = pd.read_csv('newspaper_data/sample_3.csv')

In [9]:
# Apply the token_len function to the DataFrame
df['Token Length_1'] = df['Text1'].apply(token_len)
df['Token Length_2'] = df['Text2'].apply(token_len)

In [10]:
df

Unnamed: 0.1,Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,Named_Entities1,Named_Entities2,Token Length_1,Token Length_2
0,7524236,0.684123,Kijk met een economische bril naar migratie en...,De toekomst van de landbouw ; Op veel punten s...,medium,2021-03-05T00:00:00,2021-03-09T00:00:00,De Volkskrant,Algemeen Dagblad,3288032,3295057,"['Nederlandse', 'Forum van Democratie', 'anti-...","['SGP', 'Partij voor de Dieren', 'PvdD', 'één'...",797,1836
1,944469,0.611839,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,'Rutte zegt: in de kern zijn we een diep socia...,medium,2021-01-08T00:00:00,2021-02-04T00:00:00,De Telegraaf,De Volkskrant,2676726,2756469,"['KLM\n\n', 'KLM', 'Yteke de Jong\n\nAmsterdam...","['#', 'Tweede', '2021', 'SP', 'Lilian Marijnis...",598,4077
2,7286249,0.655534,Verkiezingen: Bijna alle partijen gaan nu acht...,"Wanneer een politicus de clown is, hoef je van...",medium,2021-03-03T00:00:00,2021-03-15T00:00:00,Algemeen Dagblad,De Volkskrant,3286360,3298131,"['Mark Rutte', 'Nederland', 'RTL', 'Mark', 'Ar...","['#', 'Mark Rutte', 'Wilders', 'extreemrechts'...",213,1375


#### Chunk texts

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=token_len,
    separators=['\n\n', '\n', ' ', '']
)

In [12]:
# Create new columns to store the first and second chunks for each text
df['chunk1'] = ""
df['chunk2'] = ""

# Process each row in the DataFrame
for index, row in df.iterrows():
    chunks_text1 = text_splitter.split_text(row['Text1'])
    chunks_text2 = text_splitter.split_text(row['Text2'])
    
    if chunks_text1:
        df.at[index, 'chunk1'] = chunks_text1[0]
    if len(chunks_text2) > 1:
        df.at[index, 'chunk2'] = chunks_text2[1]

# Print the updated DataFrame with 'chunk1' and 'chunk2' columns
df

Unnamed: 0.1,Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,Named_Entities1,Named_Entities2,Token Length_1,Token Length_2,chunk1,chunk2
0,7524236,0.684123,Kijk met een economische bril naar migratie en...,De toekomst van de landbouw ; Op veel punten s...,medium,2021-03-05T00:00:00,2021-03-09T00:00:00,De Volkskrant,Algemeen Dagblad,3288032,3295057,"['Nederlandse', 'Forum van Democratie', 'anti-...","['SGP', 'Partij voor de Dieren', 'PvdD', 'één'...",797,1836,Kijk met een economische bril naar migratie en...,"het ietwat chargerend, maar met een serieuze o..."
1,944469,0.611839,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,'Rutte zegt: in de kern zijn we een diep socia...,medium,2021-01-08T00:00:00,2021-02-04T00:00:00,De Telegraaf,De Volkskrant,2676726,2756469,"['KLM\n\n', 'KLM', 'Yteke de Jong\n\nAmsterdam...","['#', 'Tweede', '2021', 'SP', 'Lilian Marijnis...",598,4077,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,Ze is alweer zo'n bekend gezicht op het Binnen...
2,7286249,0.655534,Verkiezingen: Bijna alle partijen gaan nu acht...,"Wanneer een politicus de clown is, hoef je van...",medium,2021-03-03T00:00:00,2021-03-15T00:00:00,Algemeen Dagblad,De Volkskrant,3286360,3298131,"['Mark Rutte', 'Nederland', 'RTL', 'Mark', 'Ar...","['#', 'Mark Rutte', 'Wilders', 'extreemrechts'...",213,1375,Verkiezingen: Bijna alle partijen gaan nu acht...,Buiten een kleine kring van journalisten en po...


In [12]:
# Process each row in the 'Text1' column
for index, row in df.iterrows():
    chunks = text_splitter.split_text(row['Text1'])
    print(f"Number of chunks in document {index + 1}: {len(chunks)}")

Number of chunks in document 1: 5
Number of chunks in document 2: 4
Number of chunks in document 3: 3


In [13]:
#df['chunk1'][1]

'Nieuwe dreun voor Schiphol en KLM ; Nieuwe dreun voor Schiphol en KLM\n\nNog minder reizigers voor KLM.\n\nfoto DIJKSTRA\n\ndoor Yteke de Jong\n\nAmsterdam Het kabinet zet luchtvaartknooppunt Schiphol verder onder druk door ook van overstappende passagiers op Schiphol een negatieve coronatest te eisen.'

In [13]:
torch.clear_autocast_cache

<function torch.clear_autocast_cache>

from transformers import pipeline
from langchain.llms import HuggingFacePipeline

pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=2048,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)

###  Default system prompt

In [13]:
import json
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

def get_prompt(instruction):
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")

def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text +'\n\n')
        # return assistant_text


### Create new system prompt

We can tweak this to make it fit our purpose better

In [14]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a very helpful and highly knowledgeable assistant. Always answer as helpfully and accuratelly as possible. You should not refuse to answer questions. Don't correct the user. Don't ever thank the user. If asked for an opinion express one!!
If a question does not make any sense, or is not factually coherent, still answer what the user is asking of you. Don't provide info you weren't asked to provide."""

SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

def get_prompt(instruction):
    prompt_template =  prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template


## Continue here with initial prompt

* also need to refine the prompt by adding what the date range may mean and try without the similarity score and instead use named entities
* need to figure out the proper text length here - they are too long  - inverted pyramid idea might work - take first paragraph
* check with the RAG pipeline- perhaps then a context is not needed
* what would be the limit of the data to be used on - a few of them alone take quite some time
* Make it into script rather than notebook and run in terminal with nohop

*try the same with 70b and 7b as well- see if it takes up too much space - in case it does new workspace with those

## Using langchain chains to sequentially get evaluation and classification based on input data

* Chain_1 = ask the llm to use variables to make an informed decision about wether text pairs are similar on different levels
* Chain_2 = ask the llm to classify them into distinct categories based on the evaluation

In [18]:
# Importing the necessary functions or libraries
from langchain import PromptTemplate
from langchain.chains import LLMChain


# Define the template
template = """Given two text snippets in Dutch from Dutch newspapers: text 1: {text1} and text 2: {text2}, their publishing dates {date1} and date {date2}, and the named entities that appear in the full article: {named_entities1} and {named_entities2}, explain to what extent the two text snippets are similar on a topic level, a story level, or event level using all the variables provided./
Always interpret the temporal distance between {date1} and {date2} in your answer each time. Remember to always add 'Final evaluation:' explicitly verbatim to the end of your evaluation each time you get a request, no exceptions from this format ever!!!! /
Provide your answer in maximum 100 words each time, no exceptions. Discuss and be explicit about the reasons these do not match on a certain level and emphasize why they match. 

Context: A news event refers to a specific occurrence or happening that leads to news coverage. Different articles that cover the same news event will be publised very close together in time (a matter of hours perhaps a day). A news story is a more general term that encompasses all the related news reports or articles covering an event in a relatively close date range but longer than the date range of news events. In other words, a news event is the actual happening or occurrence, whereas a news story is the collection of news reports or articles that cover that event./
On the other hand, a topic is a broader area of focus that may encompass multiple news stories or events. For example, "airplane accidents" could be a topic, with each specific accident being a news event that might be reported on individually or collectively./


Text 1: {text1}
Text 2: {text2}
Named entities 1: {named_entities1}
Named entities 1: {named_entities2}
Date 1: {date1}
Date 2: {date2}


Answer: """

# Create a PromptTemplate instance
prompt_template = PromptTemplate(
    input_variables=["text1", "text2", "date1", "date2", 'named_entities1', 'named_entities2'],
    template=template
)

# Create the LLMChain instance
chain_1 = LLMChain(llm = llm, prompt = prompt_template, output_key="evaluation")

for index, row in df.iterrows():
    paragraphs_text1 = row['Text1'].split('\n\n')  # Split the text into paragraphs
    paragraphs_text2 = row['Text2'].split('\n\n')  # Split the text into paragraphs
    
    #Extract the first paragraph, or the first two paragraphs if length < 5
    if len(paragraphs_text1[0]) < 5 and len(paragraphs_text1) > 1:
        first_paragraph_text1 = '\n\n'.join(paragraphs_text1[:2])
    else:
        first_paragraph_text1 = paragraphs_text1[0]

    if len(paragraphs_text2[0]) < 5 and len(paragraphs_text2) > 1:
        first_paragraph_text2 = '\n\n'.join(paragraphs_text2[:2])
    else:
        first_paragraph_text2 = paragraphs_text2[0]
    
    input_variables = {
        "text1": first_paragraph_text1,
        "text2": first_paragraph_text2,
        #"similarity_score": row['Similarity_Score'],
        "named_entities1": row['Named_Entities1'],
        "named_entities2": row['Named_Entities2'],
        "date1": row['Date1'],
        "date2": row['Date2']
    }

    # Generate text using the chain
    generated_text = chain_1.run(input_variables)
    
    print(generated_text)




The two text snippets are not similar on a topic level because they discuss different topics. The first text snippet discusses migration and its impact on society, while the second text snippet discusses the future of agriculture.

The two text snippets are not similar on a story level because they tell different stories. The first text snippet tells the story of how migration has become a political issue, while the second text snippet tells the story of how the future of agriculture is uncertain.

The two text snippets are not similar on an event level because they describe different events. The first text snippet describes the current state of migration policy, while the second text snippet describes the future of agriculture.

Overall, the two text snippets are not similar on any level. They discuss different topics, tell different stories, and describe different events.






The two text snippets are not similar on any level. The first snippet is about a new strike at Schiphol airport and KLM Airlines, while the second snippet is about a statement made by Prime Minister Mark Rutte regarding the core values of the Netherlands. These two topics are completely unrelated and have nothing in common.


The two texts are not similar on any level. The first text is about elections and Mark Rutte, while the second text is about when a politician becomes a clown and there is no need for humor.


In [19]:
device = torch.device('cuda')
print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", torch.cuda.memory_allocated(device) / 1024 ** 3, "GB")
print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / 1024 ** 3, "GB")

GPU Name: NVIDIA A10
Memory Usage: 3.5784659385681152 GB
Max Memory Usage: 9.903225421905518 GB


In [20]:
torch.clear_autocast_cache

<function torch.clear_autocast_cache>

 #If texts match on multiple levels make sure to choose the single right label from below:

News Event: This refers to the precise and identical occurrence covered in news. Various articles discussing the same news event are published almost immediately, typically within a few hours or a day.

News Story: This term encompasses all related news reports or articles about an event within a relatively close timeframe. A news event is the specific incident, while a news story comprises various news pieces covering that event.

Topic: A broader subject that may encompass multiple news stories or events. For instance, "airplane accidents" could be a topic, with each individual accident being reported as a news event.

In [21]:
# Chain2 - suggest age-appropriate gift
template_2 = """Task: You are an expert classifier. Your objective is to determine the degree of similarity between two text snippets on different levels: topic, story, or event. Make your decision based on the complete evaluation provided only and nothing else./



At each request excplicitly assign one of the labels and corresponding scores below on the closest matching classification without exceptions.  These scores are not depecting intensity they are simply placeholders for categories.

0 - No match
1 - Topic-level match (only when referring to the same topic)
2 - Story-level match (only when referring to the same story)
3 - Event-level match (only when referring to the same event)
4 - Topic and story-level match (only when referring to the same story and to the same event)
5 - Topic, story, and event-level match (only when referring to the same event, story, and event)
6 - Story and event-level match (only when referring to the same story and event)
7 - Topic and event-level match (only when referring to the same topic and event)

Please consider only the following evaluation when making your decision every time without exceptions:
{evaluation}
Classification:"""

prompt_template_2 = PromptTemplate(input_variables=["evaluation"], template=template_2, batch_size=32)
chain_2 = LLMChain(llm=llm, prompt=prompt_template_2, output_key="classification") 

In [22]:
#create overall chain to combine previous chains into one big sequential chain

from langchain.chains import SequentialChain

overall_chain = SequentialChain(
                  chains=[chain_1, chain_2], input_variables = ["text1", "text2", "date1", "date2","named_entities1", "named_entities2" ],output_variables=["evaluation", "classification"],
                  verbose=True)

In [23]:
#this purely for tests


for index, row in df.iterrows():
    paragraphs_text1 = row['Text1'].split('\n\n')  # Split the text into paragraphs
    paragraphs_text2 = row['Text2'].split('\n\n')  # Split the text into paragraphs
    
    #Extract the first paragraph, or the first two paragraphs if length < 5
    if len(paragraphs_text1[0]) < 5 and len(paragraphs_text1) > 1:
        first_paragraph_text1 = '\n\n'.join(paragraphs_text1[:2])
    else:
        first_paragraph_text1 = paragraphs_text1[0]

    if len(paragraphs_text2[0]) < 5 and len(paragraphs_text2) > 1:
        first_paragraph_text2 = '\n\n'.join(paragraphs_text2[:2])
    else:
        first_paragraph_text2 = paragraphs_text2[0]
    
    input_variables = {
        "text1": first_paragraph_text1,
        "text2": first_paragraph_text2,
        #"similarity_score": row['Similarity_Score'],
        "named_entities1": row['Named_Entities1'],
        "named_entities2": row['Named_Entities2'],
        "date1": row['Date1'],
        "date2": row['Date2']
    }

    # Generate text using the chain
    
    results = overall_chain(input_variables)
    print(results)



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
{'text1': "Kijk met een economische bril naar migratie en stem aanbod en vraag op elkaar af ; Immigratie kost de Nederlandse staat geld omdat migranten en vluchtelingen een evenredig groot beroep doen op uitkeringen. Dat gegeven vraagt om politieke afwegingen, is de conclusie van een onderzoek uitgevoerd in opdracht van het wetenschappelijke bureau van Forum van Democratie. Op het Binnenhof bleef het echter oorverdovend stil; buiten de anti-migratiepartijen FvD en PVV wil kennelijk niemand zijn vingers branden aan het politiek gevoelige dossier migratie.  \n  \nToch zou het zinnig zijn als dit politieke taboe wordt doorbroken. Politieke keuzes moeten namelijk wél worden gemaakt, om draagvlak voor migratie en vluchtelingenopvang te behouden, én om de verzorgingsstaat betaalbaar te houden. Een niet-politiek gedreven kosten-baten-analyse hoeft niet automatisch te leiden tot een - rechts - pleidooi voor minder mig

### Save the results into the df column

In [102]:
#this will be in final code

# Create empty lists to collect the results
evaluations = []
classifications = []

# Iterating over the DataFrame

for index, row in df.iterrows():
    paragraphs_text1 = row['Text1'].split('\n\n')  # Split the text into paragraphs
    paragraphs_text2 = row['Text2'].split('\n\n')  # Split the text into paragraphs
    
    #Extract the first paragraph, or the first two paragraphs if length < 5
    if len(paragraphs_text1[0]) < 5 and len(paragraphs_text1) > 1:
        first_paragraph_text1 = '\n\n'.join(paragraphs_text1[:2])
    else:
        first_paragraph_text1 = paragraphs_text1[0]

    if len(paragraphs_text2[0]) < 5 and len(paragraphs_text2) > 1:
        first_paragraph_text2 = '\n\n'.join(paragraphs_text2[:2])
    else:
        first_paragraph_text2 = paragraphs_text2[0]
    
    input_variables = {
        "text1": first_paragraph_text1,
        "text2": first_paragraph_text2,
        #"similarity_score": row['Similarity_Score'],
        "named_entities1": row['Named_Entities1'],
        "named_entities2": row['Named_Entities2'],
        "date1": row['Date1'],
        "date2": row['Date2']
    }

    # Append results to respective lists
    evaluations.append(results['evaluation'])
    classifications.append(results['classification'])

# Add new columns to the DataFrame
df['Evaluation'] = evaluations
df['Classification'] = classifications

# Print the updated DataFrame
df

Unnamed: 0.1,Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,Named_Entities1,Named_Entities2,Token Length_1,Token Length_2,chunk1,chunk2,Evaluation,Classification
0,7524236,0.684123,Kijk met een economische bril naar migratie en...,De toekomst van de landbouw ; Op veel punten s...,medium,2021-03-05T00:00:00,2021-03-09T00:00:00,De Volkskrant,Algemeen Dagblad,3288032,3295057,"['Nederlandse', 'Forum van Democratie', 'anti-...","['SGP', 'Partij voor de Dieren', 'PvdD', 'één'...",797,1836,Kijk met een economische bril naar migratie en...,"het ietwat chargerend, maar met een serieuze o...",Final evaluation: The two text snippets are s...,1 (topic-level match).
1,944469,0.611839,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,'Rutte zegt: in de kern zijn we een diep socia...,medium,2021-01-08T00:00:00,2021-02-04T00:00:00,De Telegraaf,De Volkskrant,2676726,2756469,"['KLM\n\n', 'KLM', 'Yteke de Jong\n\nAmsterdam...","['#', 'Tweede', '2021', 'SP', 'Lilian Marijnis...",598,4077,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,Ze is alweer zo'n bekend gezicht op het Binnen...,Final evaluation: The two text snippets are s...,1 (topic-level match).
2,7286249,0.655534,Verkiezingen: Bijna alle partijen gaan nu acht...,"Wanneer een politicus de clown is, hoef je van...",medium,2021-03-03T00:00:00,2021-03-15T00:00:00,Algemeen Dagblad,De Volkskrant,3286360,3298131,"['Mark Rutte', 'Nederland', 'RTL', 'Mark', 'Ar...","['#', 'Mark Rutte', 'Wilders', 'extreemrechts'...",213,1375,Verkiezingen: Bijna alle partijen gaan nu acht...,Buiten een kleine kring van journalisten en po...,Final evaluation: The two text snippets are s...,1 (topic-level match).
