## Zero shot LLama 13b prompting for validation task

* input dataframe with text pairs and additional info LLama 2 should use to make informed decision


In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch import cuda, bfloat16
import transformers
import pandas as pd
# Importing the necessary functions or libraries
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain

In [3]:
torch.clear_autocast_cache

<function torch.clear_autocast_cache>

In [4]:
model_id = 'meta-llama/Llama-2-13b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [5]:
device = torch.device('cuda')
print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", torch.cuda.memory_allocated(device) / 1024 ** 3, "GB")
print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / 1024 ** 3, "GB")

GPU Name: NVIDIA A10
Memory Usage: 3.559241771697998 GB
Max Memory Usage: 3.604752540588379 GB


In [9]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max we do not want any randomness here as we want the model to stick to the prompt as closely as possible
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

llm = HuggingFacePipeline(pipeline=generate_text)

### Check for average token length


In [11]:
# Define the token_len function
def token_len(text):
    tokens = tokenizer.encode(
        text
    )
    return len(tokens)

In [71]:
import os

# Function to navigate up 'n' levels
def navigate_up(current_directory, levels):
    for _ in range(levels):
        current_directory = os.path.dirname(current_directory)
    return current_directory

# Get the current working directory
current_directory = os.getcwd()

# Specify the number of levels to navigate up (4 levels in this case)
levels_to_navigate = 4

# Navigate up 'levels_to_navigate' folders
parent_directory = navigate_up(current_directory, levels_to_navigate)

# Define the path to the data file
file_path = os.path.join(parent_directory, 'newspaper_data', 'sample_1percent.csv')

# Now you can open and read the CSV file using pandas
import pandas as pd

df = pd.read_csv(file_path)

In [74]:
# Split the DataFrame into 20 smaller DataFrames for the sake of fast tuning of prompting, each containing 5 rows
# Split the DataFrame into 20 smaller DataFrames, each containing 5 rows
chunk_size = 5
chunks = [df.iloc[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

# Create variables for each smaller DataFrame
for i, chunk in enumerate(chunks):
    globals()[f'df{i + 1}'] = chunk

# Now you have variables df1, df2, df3, ... containing the smaller DataFrames
# You can access and work with them as needed
df1
#df2
#....


Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,proper_nouns1,proper_nouns2,keywords1,keywords2
0,0.684651,"De laatste, over integriteit; Hilde Sennema Ze...",Baudet: corona niet bewust wereld in geslinger...,medium,2021-03-01 00:00:00,2021-03-01 10:34:06,Het Financieele Dagblad,NOS nieuws,3290640,3287447,God,"Baudet, Forum voor Democratie-voorman Thierry ...","['sennema', 'hilde', 'humanistische', 'wennen'...","['virussen', 'chinees', 'ebolavirus', 'ingesto..."
1,0.87664,Verkiezingsdebat: Jetten zegt niet op wie hij ...,Einde nadert langzaam voor omstreden investeri...,high,2021-02-28 13:27:28,2021-03-01 00:00:00,NOS nieuws,Trouw,3287516,3285323,"WNL, Rick Nieman, Sigrid Kaag, Nieman, Brinkma...","CDA, Pieter, CDA, euro, WNL, Centraal Planbure...","['leiderschapscrisis', 'omtzigt', 'verkiezings...","['doorrekeningen', 'bevriezing', 'dividendbela..."
2,0.803055,Kaag: vaccinatiebewijs of negatieve test moet ...,De uitzending van 1 maart: Gasvrij duurder dan...,high,2021-03-01 00:00:00,2021-03-01 12:06:47,De Volkskrant,Nieuwsuur,3285227,6290579,"E r, Sigrid Kaag, stadions, D66, Kaag zaterdag","Friese Garijp, Noord-Holland Noord, Jan Nieuwe...","['gevaccineerd', 'gevaccineerden', 'gevaccinee...","['nieuwenburg', 'gasvrij', 'besmettingen', 'aa..."
3,0.858289,Rutte tegen Klaver: U bent aan het overtoepen!...,Stelling 5: gevaccineerde burgers moeten als e...,high,2021-02-28 22:30:43,2021-02-28 23:18:22,NOS liveblog,NOS liveblog,3287472,3287467,Planbureau voor de Leefomgeving,"Wilders, Marijnissen, Marijnissen, Klaver van ...","['klimaatdoelen', 'verkleint', 'klimaatwensen'...","['sneltesten', 'gevaccineerden', 'gevaccineerd..."
4,0.779277,Oud-vluchtelingen popelen om het parlement in ...,Lezersreacties; Vertaling Als schrijfster kan ...,high,2021-02-27 18:49:28,2021-03-01 00:00:00,NOS nieuws,Trouw,3287550,3285370,"Tweede Kamer, Ellian, VVD, Ellian, advocaat va...","Lucas Rijneveld, Scheffer Tytsjerk Smet, Frank...","['afghanistan', 'rechtsgeleerde', 'verantwoord...","['tytsjerk', 'scheffer', 'lezersreacties', 'al..."


In [75]:
# Apply the token_len function to the DataFrame
df1['Token Length_1'] = df1['Text1'].apply(token_len)
df1['Token Length_2'] = df1['Text2'].apply(token_len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Token Length_1'] = df1['Text1'].apply(token_len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Token Length_2'] = df1['Text2'].apply(token_len)


In [76]:
df1

Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,proper_nouns1,proper_nouns2,keywords1,keywords2,Token Length_1,Token Length_2
0,0.684651,"De laatste, over integriteit; Hilde Sennema Ze...",Baudet: corona niet bewust wereld in geslinger...,medium,2021-03-01 00:00:00,2021-03-01 10:34:06,Het Financieele Dagblad,NOS nieuws,3290640,3287447,God,"Baudet, Forum voor Democratie-voorman Thierry ...","['sennema', 'hilde', 'humanistische', 'wennen'...","['virussen', 'chinees', 'ebolavirus', 'ingesto...",331,385
1,0.87664,Verkiezingsdebat: Jetten zegt niet op wie hij ...,Einde nadert langzaam voor omstreden investeri...,high,2021-02-28 13:27:28,2021-03-01 00:00:00,NOS nieuws,Trouw,3287516,3285323,"WNL, Rick Nieman, Sigrid Kaag, Nieman, Brinkma...","CDA, Pieter, CDA, euro, WNL, Centraal Planbure...","['leiderschapscrisis', 'omtzigt', 'verkiezings...","['doorrekeningen', 'bevriezing', 'dividendbela...",402,424
2,0.803055,Kaag: vaccinatiebewijs of negatieve test moet ...,De uitzending van 1 maart: Gasvrij duurder dan...,high,2021-03-01 00:00:00,2021-03-01 12:06:47,De Volkskrant,Nieuwsuur,3285227,6290579,"E r, Sigrid Kaag, stadions, D66, Kaag zaterdag","Friese Garijp, Noord-Holland Noord, Jan Nieuwe...","['gevaccineerd', 'gevaccineerden', 'gevaccinee...","['nieuwenburg', 'gasvrij', 'besmettingen', 'aa...",403,402
3,0.858289,Rutte tegen Klaver: U bent aan het overtoepen!...,Stelling 5: gevaccineerde burgers moeten als e...,high,2021-02-28 22:30:43,2021-02-28 23:18:22,NOS liveblog,NOS liveblog,3287472,3287467,Planbureau voor de Leefomgeving,"Wilders, Marijnissen, Marijnissen, Klaver van ...","['klimaatdoelen', 'verkleint', 'klimaatwensen'...","['sneltesten', 'gevaccineerden', 'gevaccineerd...",347,410
4,0.779277,Oud-vluchtelingen popelen om het parlement in ...,Lezersreacties; Vertaling Als schrijfster kan ...,high,2021-02-27 18:49:28,2021-03-01 00:00:00,NOS nieuws,Trouw,3287550,3285370,"Tweede Kamer, Ellian, VVD, Ellian, advocaat va...","Lucas Rijneveld, Scheffer Tytsjerk Smet, Frank...","['afghanistan', 'rechtsgeleerde', 'verantwoord...","['tytsjerk', 'scheffer', 'lezersreacties', 'al...",348,353


from transformers import pipeline
from langchain.llms import HuggingFacePipeline

pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=2048,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)

Topic: The topic is the overarching subject or theme that encompasses all aspects related to elections. In this case, the topic is "elections," which is a broad and recurring subject in the news. It includes various elections happening at different levels of government (e.g., presidential, gubernatorial, local), electoral systems, voting procedures, and political analysis. The topic sets the stage for coverage and discussions surrounding elections.

Story: A story within the context of elections is a specific, often ongoing narrative that focuses on a particular election or related developments. A story can include various articles, reports, and updates from news outlets, all contributing to the coverage of that specific election or its surrounding events. For example, the story might revolve around the presidential election of a specific year, detailing campaign events, candidate profiles, polling data, and key issues.

Event: An event is a singular occurrence or happening within the broader context of an election story. Events are typically noteworthy and can be reported on by multiple news outlets. In the context of elections, an event might be something like a presidential debate, election day itself, the release of election results, or a major campaign rally. Events are the specific milestones or moments that shape the narrative of an election story.

To summarize, "elections" is the overarching topic, "the presidential election of a specific year" is the story that encompasses all coverage related to that election, and "presidential debates," "election day," and "release of election results" are individual events within that story. These distinctions help to clarify how news articles organize their coverage of elections, ensuring that readers can follow and understand the unfolding developments and narratives.

#  Prompting 

Prompting takes shapes in many sequetial instructions. We divide the prompts themselves into system prompt, example prompt, and main prompt to geenrate a template for each subtask. We begin with the broadest level, topic-level matching task. This task is also divided into three sepearate subtasks: (1) create topic labels for eacg text, (2) compare the topic labels and texts to decide to what extent they match, and (3) based on the explanation create a single classification topic match or no topic match. 


## Step 1: Extract topics. 
The prompt template is based on Grootendorst, BERTopic LLama2 implementation with example from our full dataset.
* Important to note that for each step we pass in a system prompt, give and example, and provide a main prompt that signifies the variables and content to be considered.
* Then we create a chain from the prompt for further sequential chaining with LangChain


In [683]:
# Change the system prompt. It describes information given to all conversations
# This system prompt will b
system_prompt_1 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics. A "topic" is a fundamental subject or theme that encompasses all aspects related to a particular area of interest or discussion. A topic is not a summary of a document but the main theme of it. 
It serves as the overarching framework for exploring and discussing various facets within that subject. A main topic is essentially a new category such as: 

(1) Politics: 
(2) Business: 
(3) Health: 
(4) Entertainment: 
(5) Other: 

A topic of a text is always about politics if any political actors, party names, references to elections and legistlation is present in the text. 
A topic of a text is about business if references to business and economy are made without any mention of politics, names of politicians, government. 
A topic of a text is about health if references to aspects of health are mentioned without any mention of politics.
A topic of a text is about entertainment if references to entertainment such as movies, books, actors are made without any mention of politics.
A topic of a text is labelled as other if it does not fit in the politics, business, health, and entertainment categories.

always become about Politics if any politics ralted information in present in the text. A subtopic is a short label that captures the essence of the document. You must always return a main topic and a subtopic and nothing else in the following format: Main topic1 : Subtopic;, Main topic2: Subtopic

<</SYS>>
"""

In [684]:
# Example prompt demonstrating the output we are looking for
example_prompt_1 = """
I have a document pair of the following texts:
- Contact met de kiezer; Geen flyerende lijsttrekkers in windjacks op markten deze keer. Kandidaat-Kamerleden zoeken noodgedwongen hun contact met de kiezer online. Zoals ChristenUnie-lijsttrekker Gert-Jan Segers, hierboven afgebeeld terwijl hij vragen beantwoordt die kiezers hem op online platform Instagram stellen. Anders dan praten met de burgers zelf, praten de politici nu tegen camera's. Populair zijn ook livesessies op Facebook. Zo ging Mark Rutte dit weekend in gesprek met horeca-ondernemers en zond de VVD dat uit op Facebook. Naast de online campagne, werd er dit weekend ook ouderwets geflyerd. Maar aanbellen, dat deden de meeste campagnevoerende partijleden niet, uit angst voor verdere verspreiding van het coronavirus. Forum voor Democratie ging er als enige partij wel op uit om campagne te voeren. Met een vrijheidskaravaan deed de partij Nijmegen en Venlo aan voor een manifestatie. Toen er meer dan tweehonderd mensen langskwamen, moest burgemeester Hubert Bruls de bijeenkomst, die wel aangekondigd en aangevraagd was, voortijdig afbreken. Een bezoeker in Venlo twitterde dat met de komst van Baudet het centrale plein voor het eerst sinds carnaval vorig jaar weer vol stond.
- Forum voor Democratie Jacht op extra stemmen; Waar andere partijen zich nauwelijks op straat wagen toert Forum voor Democratie stad en land af. Die optredens trekken niet alleen de aandacht van kiezers het Openbaar Ministerie kijkt inmiddels of Baudets campagneteam niet op grote schaal de coronaregels geschonden heeft. Zo werden bij een bezoek aan Urk volgens getuigen honderden handen geschud. En dan was er nog de volmacht-rel. In een live-uitzending riep Baudet donderdag zijn kijkers op zoveel mogelijk stemmen per volmacht te regelen. Om de besmettingskansen te verkleinen mogen kiezers dit jaar niet twee maar drie volmachtsstemmen uitbrengen. Een persoon kan vier keer stemmen eigenlijk, als je maar die volmachten kunt regelen, aldus Baudet, en dat was een enorme kans. Ho, zei het ministerie van Binnenlandse Zaken dat is niet de bedoeling en mag helemaal niet. Daar leek het campagneteam van Forum toen al achter gekomen de suggestie om stemmen te regelen was door de partij schielijk uit het filmpje van Baudet geknipt. Aan een ander standpunt houdt Baudet wel openlijk vast de grote kans op verkiezingsfraude. Door anderen, uiteraard.

The topic of each text is described by the following keywords: 'livesessies', 'vrijheidskaravaan', 'flyerende', 'facebook', 'windjacks'; besmettingskansen', 'volmachtsstemmen', 'schielijk', 'volmachten', 'baudets'
The following proper nouns appear in each text: Gert-Jan Segers, Mark Rutte, Forum voor Democratie, Hubert Bruls, Baudet; Forum voor Democratie, Forum voor Democratie, Ministerie kijkt, Urk, Baudet, Baudet, Baudet, Baudet

Based on the information about the topic above, please create a short label of the topic for each text. Make sure to only return the label and nothing more for each text in the following format:

[/INST] Main topic 1: Politics; Subtopic 1: Dutch election campaign and how political parties are adapting to the COVID-19 pandemic; Main topic 2: Politics; Subtopic 2: Dutch election campaign and the controversy surrounding the Forum voor Democratie party's campaign tactics

"""

In [685]:
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt_1 = """
[INST]
I have a document pair of the following texts:
{text1} and {text2}

The topic of each text is described by the following keywords: {keywords1} and {keywords2}
The following proper nouns appear in each text: {proper_nouns1}, {proper_nouns2}

Based on the information about the topic above, please create a short label of this topic for each text. Make sure you to only return the label and nothing more for each text in the following format: Main topic 1 : Subtopic 1 ; Main topic 2: Subtopic 2
[/INST]
"""

In [686]:
prompt_1 = system_prompt_1 + example_prompt_1 + main_prompt_1

In [687]:
# Create a PromptTemplate instance
prompt_template = PromptTemplate(
    input_variables=["text1", "text2", 'proper_nouns1', 'proper_nouns2', 'keywords1', 'keywords2'],
    template=prompt_1
)

# Create the LLMChain instance
chain_1 = LLMChain(llm = llm, prompt = prompt_template, output_key="topics")

In [688]:
# Test if it works
for index, row in df1.iterrows():
    full_text1 = row['Text1']  
    full_text2 = row['Text2']  

    input_variables = {
        "text1": full_text1,
        "text2": full_text2,
        "proper_nouns1": row['proper_nouns1'],
        "proper_nouns2": row['proper_nouns2'],
        "keywords1": row['keywords1'],
        "keywords2": row['keywords2'],
    }

    # Generate text using the chain
    generated_text = chain_1.run(input_variables)
    
    print(generated_text)


Main topic 1: Integrity; Subtopic 1: Personal story of Hilde Sennema and her struggle with cancer

Main topic 2: Conspiracy Theories; Subtopic 2: Thierry Baudet and his views on the coronavirus as a biological weapon

Main topic 1: Politics; Subtopic 1: Debates and campaigns in the Dutch elections; Main topic 2: Economy and finance; Subtopic 2: Controversy over investment subsidies and budget cuts

Main topic 1: Politics; Subtopic 1: Proposal for vaccination proof to reopen public life; Main topic 2: Society; Subtopic 2: Cost of making homes gas-free and impact on the community

Main topic 1: Politics; Subtopic 1: Debate over climate goals and energy policy in the Netherlands; Main topic 2: Health; Subtopic 2: Discussion around vaccination and the distribution of vaccines in the Netherlands

Main topic 1: Politics; Subtopic 1: Candidates with refugee backgrounds in the Dutch parliamentary elections; Main topic 2: Society; Subtopic 2: Reactions to the translation of a famous poem by Ma

## Step 2: Evaluate topic level match
Compare the topics of the text pairs and made eveluation about the match level  

* We create a second chain for this task that uses the texts as well as the extrcated topics as input

In [723]:
# Change the system prompt from the default one to a specific one in order to focus the model on a single task. 
# This system prompt will be
system_prompt_2 = """
<s>[INST] <<SYS>>
wo texts match on a topic level if their topics are similar. For instance, if they are both mentionig aspects of politics, or are touching upon the broader context of politics then this should be considered a topic match even if they mention different aspects of politics. 
Provide your answer as an explanation in maximum 100 tokens. Make sure you to only return the evaluation and nothing else.
<</SYS>>
"""

In [724]:
# Example prompt demonstrating the output we are looking for
example_prompt_2 = """

The topic of each text is the following: 
Main topic 1: Politics; Subtopic 1: Dutch election campaign and how political parties are adapting to the COVID-19 pandemic; Main topic 2: Politics; Subtopic 2: Dutch election campaign and the controversy surrounding the Forum voor Democratie party's campaign tactics

Based on the information about the topics above, please write a short evaluation about whether the two texts match on a topic level. Make sure to only return the evaluation and nothing more in the following format:

[/INST] Evaluation: The main topic of both texts is related to Politics, they touch upun the broader contexts of politics. Their subtopics also match as they discuss aspects related to the Dutch election campaign. Therefore, they are considered topic matches. 

"""

In [725]:
#main prompt describing the task once more and adding the input variables to be considered
main_prompt_2 = """
[INST]

The topic of each text is the following: 
{topics}

Based on the information about the topics above, please write a short evaluation about whether the two texts match on a topic level. Make sure to only return the evaluation and nothing more in the following format:
Evaluation:
[/INST] 
"""

In [726]:
prompt_2 = system_prompt_2 + example_prompt_2 + main_prompt_2

In [727]:
prompt_template_2 = PromptTemplate(input_variables=["topics"], template=prompt_2, batch_size=32, max_iterations = 1)
chain_2 = LLMChain(llm = llm, prompt = prompt_template_2, output_key="topic_evaluation")


In [728]:
#create overall chain to combine previous chains into one big sequential chain

from langchain.chains import SequentialChain

overall_chain = SequentialChain(
                  chains=[chain_1, chain_2], input_variables = ["text1", "text2", "proper_nouns1", "proper_nouns2", 'keywords1', 'keywords2'],output_variables=["topics", "topic_evaluation"],
                  verbose=True )

## Step 3: Create classification label based on evaluation
Provide single label for the match level

* We create a third chain for this task that uses the texts as well as the extracted topics and evaluation as input.
* Labels: topic match, not topic match

In [378]:
# Change the system prompt. It describes information given to all conversations
# This system prompt will be
system_prompt_3 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for classifying whether two texts match on a topic level based on an evaluation provided. 

At each request excplicitly assign one of the two labels below. 
0 - no match
1 - topic match

Make sure you to only return the label and nothing else.
<</SYS>>
"""

In [379]:
# Example prompt demonstrating the output we are looking for
example_prompt_3 = """

The evaluation is the following:
Both texts mention the Dutch election campaign and political parties. Even though they touch upon other subtopics they still refer to aspects of the Dutch election campaign. This is the reason why they are topic matches.

Based on this information, please assign either '0 - no match' or '1 - topic match'. Make sure to only return the label and nothing more in the following format:

[/INST]: 1 - topic match
"""

In [380]:
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt_3 = """
[INST]

The evaluation is the following:
{topic_evaluation}

Based on this information, please assign either '0 - no match' or '1 - topic match'. Make sure to only return the label and nothing more in the following format:
[/INST] 
"""

In [389]:
prompt_3 = system_prompt_3 + example_prompt_3 + main_prompt_3

In [390]:
prompt_template_3 = PromptTemplate(input_variables=[ "topic_evaluation"], template=prompt_3, batch_size=32, max_iterations = 1)
chain_3 = LLMChain(llm = llm, prompt = prompt_template_3, output_key="match_topic")


In [371]:
#create overall chain to combine previous chains into one big sequential chain

from langchain.chains import SequentialChain

overall_chain = SequentialChain(
                  chains=[chain_1, chain_2, chain_3], input_variables = ["text1", "text2", "proper_nouns1", "proper_nouns2", 'keywords1', 'keywords2'],output_variables=["topics", "topic_evaluation", "match_topic"],
                  verbose=True )

In [729]:
#this purely for tests


for index, row in df1.iterrows():
    full_text1 = row['Text1']  
    full_text2 = row['Text2']  

    input_variables = {
        "text1": full_text1,
        "text2": full_text2,
        "proper_nouns1": row['proper_nouns1'],
        "proper_nouns2": row['proper_nouns2'],
        "keywords1": row['keywords1'],
        "keywords2": row['keywords2']

    }

    # Generate text using the chain
    
    results = overall_chain(input_variables)
    print(results)



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
{'text1': 'De laatste, over integriteit; Hilde Sennema Ze stuurde een foto van haar tuin vol krokussen, blij dat ze de lente nog meemaakt. Een tumor in haar hoofd zorgt dat ze niet meer kan lezen en dat ze er binnenkort niet meer is. Ze verontschuldigde zich bij mijn ouders dat ze mijn columns niet meer volgde. Geeft niks, antwoordde mijn moeder, wij lezen ze ook niet altijd. Dat hun vriendin zelf kiest wanneer ze sterft, is iets waar mijn ouders niet aan kunnen wennen. God geeft het leven en neemt het, geloven ze. Maar net zoals ze nu echt wel eens televisiekijken op zondag, zijn ze ook hierover milder geworden. Steeds beter weten ze de wereld buiten de kerk tegemoet te treden. Zonder van hun geloof te vallen sloten ze vriendschappen buiten de zuil, en spreken ze nu met hun humanistische vriendin over goed sterven. Dat vermogen om oude waarden en overtuigingen hun plaats te laten vinden in een nieuwe werkelij

## Step 4: Evaluate news event level match
Compare the texts and evaluate whether they belong to the news event

* We create a third chain for this task that uses the texts as well as the extrcated topics and other inout variables
* input variables on top of the existing ones: date1, date2 + output of chain 1 and chain2, and chain 3

In [396]:
# Change the system prompt. It describes information given to all conversations
# This system prompt will be
system_prompt_4 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for evaluating whether two texts belong to the same news event.
News events are specific events that lead to news coverage, such as a specific debate on a specific day in a specific parliament, a specific accident, or a specific football match. They can be covered by one or more articles in one or more outlets, but relate to one specific and identifiable event and are thus much more fine-grained than news topics, issues, or news categories.
News events can span over multiple days but not more than 10 days. Therefore articles that cover the same news event are published within the same few hours and days. 

An event within a news event is a specific, event or issue and or related developments around the event or issue. A news event can include various articles, reports, and updates from news outlets, all contributing to the coverage of that specific event or issue and its sorrounding aspects. For example, the news event might revolve around the presidential election of a specific year, detailing campaign events, candidate profiles, polling data, and key issues.
Provide your answer as an explanation in maximum 100 tokens. Make sure you to only return the evaluation and nothing else.
<</SYS>>
"""

In [397]:
# Example prompt demonstrating the output we are looking for
example_prompt_4 = """

The topic of each text is the following:
Topic 1: Dutch election campaign and how political parties are adapting to the COVID-19 pandemic; Topic 2: Dutch election campaign and the controversy surrounding the Forum voor Democratie party's campaign tactics

The pubishing dates of the texts is the following:
date1: 01/03/2021; date2: 01/03/2021  

Based on the information above, please write a short evaluation about whether the two texts match on a news event level. Make sure to only return the evaluation and nothing more in the following format:

[/INST] Evaluation: Both texts focus on one particular news event, the elections which is distintive event. Both texts discuss aspects of the same election campaign. The texts were also published on the same date which further indicates that they belong to the same news event. 
"""

In [398]:
# Example prompt demonstrating the output we are looking for
main_prompt_4 = """

The topic of each text is the following:
{topics}

The pubishing dates of the texts is the following:
{date1} and {date2}

Based on the information above, please write a short evaluation about whether the two texts match on a news event level. Make sure to only return the evaluation and nothing more in the following format:
[/INST] 

"""

In [399]:
prompt_4 = system_prompt_4 + example_prompt_4 + main_prompt_4

In [400]:
prompt_template_4 = PromptTemplate(input_variables=["topics", "date1", "date2"], template=prompt_4, batch_size=32, max_iterations = 1)
chain_4 = LLMChain(llm = llm, prompt = prompt_template_4, output_key="event_evaluation")


In [401]:
#create overall chain to combine previous chains into one big sequential chain

from langchain.chains import SequentialChain

overall_chain = SequentialChain(
                  chains=[chain_1, chain_2, chain_3, chain_4], input_variables = ["text1", "text2", "proper_nouns1", "proper_nouns2", 'keywords1', 'keywords2', 'date1', 'date2'],output_variables=["topics", "topic_evaluation", "match_topic","event_evaluation"],
                  verbose=True )

In [None]:
#this purely for tests


for index, row in df1.iterrows():
    full_text1 = row['Text1']  # Get the full text of Text1
    full_text2 = row['Text2']  # Get the full text of Text2

    input_variables = {
        "text1": full_text1,
        "text2": full_text2,
        "proper_nouns1": row['proper_nouns1'],
        "proper_nouns2": row['proper_nouns2'],
        "keywords1": row['keywords1'],
        "keywords2": row['keywords2'],
        "date1":row['Date1'],
        "date2":row['Date2']

    }

    # Generate text using the chain
    
    results = overall_chain(input_variables)
    print(results)



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
{'text1': 'De laatste, over integriteit; Hilde Sennema Ze stuurde een foto van haar tuin vol krokussen, blij dat ze de lente nog meemaakt. Een tumor in haar hoofd zorgt dat ze niet meer kan lezen en dat ze er binnenkort niet meer is. Ze verontschuldigde zich bij mijn ouders dat ze mijn columns niet meer volgde. Geeft niks, antwoordde mijn moeder, wij lezen ze ook niet altijd. Dat hun vriendin zelf kiest wanneer ze sterft, is iets waar mijn ouders niet aan kunnen wennen. God geeft het leven en neemt het, geloven ze. Maar net zoals ze nu echt wel eens televisiekijken op zondag, zijn ze ook hierover milder geworden. Steeds beter weten ze de wereld buiten de kerk tegemoet te treden. Zonder van hun geloof te vallen sloten ze vriendschappen buiten de zuil, en spreken ze nu met hun humanistische vriendin over goed sterven. Dat vermogen om oude waarden en overtuigingen hun plaats te laten vinden in een nieuwe werkelij

### Save the results into the df column

this is to be modified based on all the new output variables

In [102]:
#this will be in final code

# Create empty lists to collect the results
evaluations = []
classifications = []

# Iterating over the DataFrame

for index, row in df.iterrows():
    paragraphs_text1 = row['Text1'].split('\n\n')  # Split the text into paragraphs
    paragraphs_text2 = row['Text2'].split('\n\n')  # Split the text into paragraphs
    
    #Extract the first paragraph, or the first two paragraphs if length < 5
    if len(paragraphs_text1[0]) < 5 and len(paragraphs_text1) > 1:
        first_paragraph_text1 = '\n\n'.join(paragraphs_text1[:2])
    else:
        first_paragraph_text1 = paragraphs_text1[0]

    if len(paragraphs_text2[0]) < 5 and len(paragraphs_text2) > 1:
        first_paragraph_text2 = '\n\n'.join(paragraphs_text2[:2])
    else:
        first_paragraph_text2 = paragraphs_text2[0]
    
    input_variables = {
        "text1": first_paragraph_text1,
        "text2": first_paragraph_text2,
        #"similarity_score": row['Similarity_Score'],
        "proper_nouns1": row['proper_nouns1'],
        "proper_nouns2": row['proper_nouns2'],
        "date1": row['Date1'],
        "date2": row['Date2']
    }

    # Append results to respective lists
    evaluations.append(results['evaluation'])
    classifications.append(results['classification'])

# Add new columns to the DataFrame
df['Evaluation'] = evaluations
df['Classification'] = classifications

# Print the updated DataFrame
df

Unnamed: 0.1,Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,Named_Entities1,Named_Entities2,Token Length_1,Token Length_2,chunk1,chunk2,Evaluation,Classification
0,7524236,0.684123,Kijk met een economische bril naar migratie en...,De toekomst van de landbouw ; Op veel punten s...,medium,2021-03-05T00:00:00,2021-03-09T00:00:00,De Volkskrant,Algemeen Dagblad,3288032,3295057,"['Nederlandse', 'Forum van Democratie', 'anti-...","['SGP', 'Partij voor de Dieren', 'PvdD', 'één'...",797,1836,Kijk met een economische bril naar migratie en...,"het ietwat chargerend, maar met een serieuze o...",Final evaluation: The two text snippets are s...,1 (topic-level match).
1,944469,0.611839,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,'Rutte zegt: in de kern zijn we een diep socia...,medium,2021-01-08T00:00:00,2021-02-04T00:00:00,De Telegraaf,De Volkskrant,2676726,2756469,"['KLM\n\n', 'KLM', 'Yteke de Jong\n\nAmsterdam...","['#', 'Tweede', '2021', 'SP', 'Lilian Marijnis...",598,4077,Nieuwe dreun voor Schiphol en KLM ; Nieuwe dre...,Ze is alweer zo'n bekend gezicht op het Binnen...,Final evaluation: The two text snippets are s...,1 (topic-level match).
2,7286249,0.655534,Verkiezingen: Bijna alle partijen gaan nu acht...,"Wanneer een politicus de clown is, hoef je van...",medium,2021-03-03T00:00:00,2021-03-15T00:00:00,Algemeen Dagblad,De Volkskrant,3286360,3298131,"['Mark Rutte', 'Nederland', 'RTL', 'Mark', 'Ar...","['#', 'Mark Rutte', 'Wilders', 'extreemrechts'...",213,1375,Verkiezingen: Bijna alle partijen gaan nu acht...,Buiten een kleine kring van journalisten en po...,Final evaluation: The two text snippets are s...,1 (topic-level match).
