# Llama 7b for validation task

* input: dataframe with text pairs and additional info Llama 2 should use to make an informed decision
* output: dataframe with additional columns: topic, topic match evaluation, topic match classification, news event, news event match evaluation, news event match classification, final classification on topic-level and news-event-level matching


## Import packages

In [1]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch import cuda, bfloat16
import transformers
import pandas as pd
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import os

## Load model through the Hugging Face pipeline

In [2]:
# --- Hugging Face credentials ---
# The access token is read from a text file so it never appears in the notebook.
hf_auth_file = '../../analysis/hf_auth.txt'
with open(hf_auth_file, "r") as token_handle:
    raw_token = token_handle.read()
hf_auth = raw_token.strip()  # drop surrounding whitespace, e.g. a trailing newline

# --- Model identifier and device label ---
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Within this cell `device` only feeds the status message printed at the end.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# --- Quantization ---
# Load the large model with less GPU memory; this requires the `bitsandbytes` library.
quantization_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
bnb_config = transformers.BitsAndBytesConfig(**quantization_settings)

# --- Config and weights (both calls need the auth token) ---
model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
    use_auth_token=hf_auth,
)
model.eval()  # inference only
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
# Tokenizer for `model_id`, fetched with the same auth token as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)



In [4]:
# Report which GPU is in use and how much memory has been allocated on it.
# The report is guarded so the cell does not raise on a machine without CUDA.
BYTES_PER_GIB = 1024 ** 3  # divisor used to express byte counts in the "GB" figures below

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("GPU Name:", torch.cuda.get_device_name(device))
    print("Memory Usage:", torch.cuda.memory_allocated(device) / BYTES_PER_GIB, "GB")
    print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / BYTES_PER_GIB, "GB")
else:
    print("CUDA is not available; skipping the GPU memory report.")

GPU Name: NVIDIA A10
Memory Usage: 1.9429621696472168 GB
Max Memory Usage: 1.9639196395874023 GB


In [6]:
# Text-generation pipeline that LangChain calls for every prompt in the chains below.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # We want no randomness so the model sticks to the prompt as closely as possible.
    # Deterministic decoding is requested by switching sampling off; `temperature=0`
    # is not a valid sampling temperature and fails if sampling happens to be enabled.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

# LangChain wrapper around the pipeline; this is the `llm` handed to every LLMChain.
llm = HuggingFacePipeline(pipeline=generate_text)

## Read in data file


In [61]:
def navigate_up(current_directory, levels):
    """Return the ancestor directory that lies `levels` steps above `current_directory`.

    Each step applies `os.path.dirname` once; with `levels` equal to 0 the path is
    returned unchanged.
    """
    ancestor = current_directory
    for _hop in range(levels):
        ancestor = os.path.dirname(ancestor)
    return ancestor

# The data folder sits four directory levels above the notebook's working directory.
levels_to_navigate = 4
current_directory = os.getcwd()
parent_directory = navigate_up(current_directory, levels_to_navigate)

# Location of the 1% sample inside the data folder
file_path = os.path.join(parent_directory, 'newspaper_data', 'sample_1percent.csv')

import pandas as pd  # repeats the import from the first cell

# Text pairs plus the metadata the prompts use
df = pd.read_csv(file_path)

In [62]:
# Cut df into consecutive 5-row pieces so prompts can be tuned quickly on a few pairs.
chunk_size = 5
chunk_starts = range(0, len(df), chunk_size)
chunks = [df.iloc[start:start + chunk_size] for start in chunk_starts]

# Expose every piece as a notebook-level variable: df1, df2, df3, ... (numbering starts at 1).
globals().update({f'df{number}': part for number, part in enumerate(chunks, start=1)})

# Show the second piece; it is the one used for the test run further down.
df2


Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,proper_nouns1,proper_nouns2,keywords1,keywords2
5,0.765668,Alle pijlen zijn gericht op Rutte in RTL-debat...,Helft van de Forum-stemmers ziet complot; De h...,high,2021-03-01 00:00:00,2021-03-01 00:00:00,De Volkskrant,Trouw,3285226,3285337,"Radio 1, RTL, VVD, kernboodschap gelieve, Sigr...","Forum voor Democratie, Ipsos","['lijsttrekkersdebat', 'premiersdebat', 'paree...","['ipsos', 'coronavirus', 'gefabriceerd', 'comp..."
6,0.683993,Hoogste bestuursrechter liet forse steken vall...,Stelling 3: Om de klimaatdoelen te halen moet ...,medium,2021-02-28 00:00:00,2021-02-28 22:22:56,Het Financieele Dagblad,NOS liveblog,3290695,3287474,"Andr Bosman, VVD, Tweede Kamer, Raad van State...","VVD, Poetin","['overheidsinstantie', 'kinderopvangtoeslagen'...","['klimaatdoelen', 'kerncentrales', 'rusland', ..."
7,0.848039,Planbureau: vertrek bedrijven reëel risico bij...,Baudet: corona niet bewust wereld in geslinger...,high,2021-03-01 00:00:00,2021-03-01 10:34:06,Het Financieele Dagblad,NOS nieuws,3290604,6290556,"Planbureau, Planbureau voor de Leefomgeving, P...","Baudet, Forum voor Democratie-voorman Thierry ...","['broeikasgasuitstoot', 'klimaatwinst', 'leefo...","['virussen', 'chinees', 'ebolavirus', 'ingesto..."
8,0.805225,Recht op reparatie van apparatuur komt steeds ...,"Niet een lijsttrekker, maar een kiezer brengt ...",high,2021-03-01 00:00:00,2021-03-01 00:00:00,Het Financieele Dagblad,NRC Handelsblad,3290567,3285627,"REPAIR, CAF S Jeroen Groot, Philips, Leenman, ...","Mark Rutte, RTL, Mark Rutte, Mark Rutte, Geert...","['verwarmingselement', 'reparateurs', 'koffiez...","['lijsttrekkersdebat', 'toeslagenaffaire', 'on..."
9,0.631177,Dode en gewonde door zuurstofexplosie corona-a...,Wilders in de schijnwerpers; Wilders in de sch...,medium,2021-02-27 22:39:37,2021-03-01 00:00:00,NOS liveblog,De Telegraaf,3287529,3286364,"Oekra, Twintig, Oekra","Wilders, Mark Rutte, Sigrid Kaag D66, Wilders,...","['tsjernivtsi', 'zaporizja', 'zuurstofexplosie...","['diversiteitsquota', 'ronald', 'rtl', 'zit', ..."


#  Prompting 

Prompting takes the shape of many sequential instructions. We divide the prompts themselves into a system prompt, an example prompt, and a main prompt to generate a template for each subtask. We begin with the broadest level, the topic-level matching task. This task is also divided into three separate subtasks: (1) create topic labels for each text, (2) compare the topic labels and texts to decide to what extent they match, and (3) based on the explanation, create a single classification: topic match or no topic match. 


## Step 1: Extract topics. 
The prompt template is based on Grootendorst's BERTopic Llama 2 implementation, with an example from our full dataset.
* Important to note that for each step we pass in a system prompt, give an example, and provide a main prompt that specifies the variables and content to be considered.
* Then we create a chain from the prompt for further sequential chaining with LangChain.
* It is very important here to initially extract both main topic and subtopic in order to obtain clear topics. If subtopics are not requested, the model might not understand that a text that mentions politicians and conspiracy theories belongs to the broader topic of politics, and it may instead label it as conspiracy theories alone. 


In [7]:
# System message for step 1 (topic labelling): defines main topic vs. subtopic, tells the
# model to use "Politics" as main topic for any politics-related text, and fixes the answer
# format. Combined with example_prompt_1 and main_prompt_1 into prompt_1.
system_prompt_1 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics. A "topic" is a fundamental subject or theme that encompasses all aspects related to a particular area of interest or discussion. 
A topic serves as the overarching framework for exploring and discussing various facets within that subject. A topic comprises of a main topic and a subtopic. A main topic is an overarching theme, a subtopic is a more specific thematic or content-based divisions within a broader main topic. All main topics are labeled Politics if the documents' keywords and proper nouns relate to politics. For instance if the text discusses the economy but a politician, party, or government is mentioned either in the text or in the keywords then it should be categorized as Politics and not Economy.  
Main topic: Politics; Subtopic: Elections and campaigns
Main topic: Economy; Subtopic: Interest rates
Main topic: Health; Subtopic: Mental health
Main topic: Entertainment; Subtopic: Film and Television \n

If a text mentions politics, politicians names functions, parties, policy, or any other politics-related term, the main topic should always be Politics.\n


You must always return a main topic and a subtopic and nothing else in the following format: Main topic1 : Subtopic;, Main topic2: Subtopic
Do not return any notes. Only return the label and nothing more for each text.

<</SYS>>
"""

In [8]:
# One-shot example for step 1: a worked request (two Dutch articles with their keywords and
# proper nouns) followed by `[/INST]` and the expected "Main topic / Subtopic" answer.
example_prompt_1 = """
I have a document pair of the following texts:\n
- Contact met de kiezer; Deze keer zie je geen lijsttrekkers in windjacks rondlopen op markten. Kandidaat-Kamerleden moeten noodgedwongen online contact zoeken met de kiezers. Zoals de lijsttrekker van de ChristenUnie, Gert-Jan Segers, te zien is op de bovenstaande afbeelding terwijl hij vragen beantwoordt die kiezers hem stellen op het online platform Instagram. In plaats van direct met de burgers te praten, spreken politici nu voor de camera's. Livestreams op Facebook zijn ook populair. Zo had Mark Rutte dit weekend een gesprek met horeca-ondernemers en zond de VVD dat uit op Facebook. Naast de online campagne werd er dit weekend ook op de ouderwetse manier geflyerd. Maar de meeste campagnevoerende partijleden gingen niet aanbellen uit angst voor verdere verspreiding van het coronavirus. Forum voor Democratie was de enige partij die de straat op ging om campagne te voeren. Met een vrijheidskaravaan bezocht de partij Nijmegen en Venlo voor een manifestatie. Toen er meer dan tweehonderd mensen kwamen opdagen, moest burgemeester Hubert Bruls de bijeenkomst voortijdig beëindigen, hoewel deze wel was aangekondigd en aangevraagd. Een bezoeker in Venlo twitterde dat het centrale plein voor het eerst sinds carnaval vorig jaar weer vol stond met de komst van Baudet.
- Forum voor Democratie op zoek naar extra stemmen; Terwijl andere partijen zich nauwelijks op straat vertonen, reist Forum voor Democratie stad en land af. Deze optredens trekken niet alleen de aandacht van kiezers; het Openbaar Ministerie onderzoekt nu ook of het campagneteam van Baudet de coronaregels op grote schaal heeft overtreden. Volgens getuigen werden bij een bezoek aan Urk honderden handen geschud. En dan was er nog de volmachtrel. In een live-uitzending riep Baudet zijn kijkers op om zoveel mogelijk volmachtstemmen te regelen, aangezien kiezers dit jaar niet twee, maar drie volmachtstemmen mogen uitbrengen om de kans op besmetting te verkleinen. "Een persoon kan eigenlijk vier keer stemmen, als je maar die volmachten kunt regelen," zei Baudet, en dat was een enorme kans. Maar het ministerie van Binnenlandse Zaken zei: "Ho, dat is niet de bedoeling en is niet toegestaan." Het campagneteam van Forum leek daar toen al achter te komen, want de suggestie om stemmen te regelen werd snel uit de video van Baudet geknipt. Baudet houdt echter wel openlijk vast aan zijn standpunt dat er een grote kans is op verkiezingsfraude, maar dan door anderen, uiteraard.

The topic of each text is described by the following keywords: 'livesessies', 'vrijheidskaravaan', 'flyerende', 'facebook', 'windjacks'; besmettingskansen', 'volmachtsstemmen', 'schielijk', 'volmachten', 'baudets'
The following proper nouns appear in each text: Gert-Jan Segers, Mark Rutte, Forum voor Democratie, Hubert Bruls, Baudet; Forum voor Democratie, Forum voor Democratie, Ministerie kijkt, Urk, Baudet, Baudet, Baudet, Baudet

Based on the information about the topic above, please create a short label of the topic for each text. Only return the label and nothing more for each text in the following format:

[/INST] Main topic 1: Politics; Subtopic: Elections and campaigns; Main topic 2: Politics; Subtopic: Elections, campaigns and fraud 

"""

In [9]:
# Templated request for step 1. Placeholders: {text1}, {text2}, {keywords1}, {keywords2},
# {proper_nouns1}, {proper_nouns2}; they are declared as input variables of the PromptTemplate
# built from prompt_1.
main_prompt_1 = """
[INST]
I have a document pair of the following texts:
{text1} and {text2}

The topic of each text is described by the following keywords: {keywords1} and {keywords2}
The following proper nouns appear in each text: {proper_nouns1}, {proper_nouns2}

Based on the information about the topic above, please create a short label of this topic for each text. Only return the label and nothing more for each text in the following format: Main topic 1 : Subtopic ; Main topic 2: Subtopic
[/INST]
"""

In [10]:
# Full step-1 prompt: system message, worked example, then the templated request.
prompt_1 = "".join((system_prompt_1, example_prompt_1, main_prompt_1))

In [11]:
# Step-1 chain: fills prompt_1 with the pair's texts, proper nouns and keywords and stores
# the model's answer under "topics".
topic_input_names = ["text1", "text2", 'proper_nouns1', 'proper_nouns2', 'keywords1', 'keywords2']
prompt_template = PromptTemplate(template=prompt_1, input_variables=topic_input_names)

chain_1 = LLMChain(prompt=prompt_template, llm=llm, output_key="topics")

### Step 1.1: Extract main topic from topics for matching

This is a must; otherwise the chain considers subtopics as the level of match and disregards pairs that match on a broad level.

In [12]:
# System message for step 1.1: asks the model to reduce each "Main topic; Subtopic" label to
# its main topic only. Combined with example_prompt_1_1 and main_prompt_1_1 into prompt_1_1.
system_prompt_1_1 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for extracting the main topic from a topic. 
A topic comprises of a main topic and a subtopic. A main topic is an overarching theme, a subtopic is a more specific thematic or content-based divisions within a broader main topic.
Main topic: Economy; Subtopic: Interest rates
Main topic: Health; Subtopic: Mental health
Main topic: Entertainment; Subtopic: Film and Television 

A main topic is everything before the word 'Subtopic'
Given a topic, you must always return the main topic nothing else in the following format: Main topic1, Main topic2: 
Only return the main topic label and nothing more for each text.

<</SYS>>
"""

In [13]:
# One-shot example for step 1.1: a pair of topic labels followed by `[/INST]` and the expected
# answer containing only the two main topics.
example_prompt_1_1 = """
I have a pair of topics:
Main topic 1: Politics; Subtopic: Elections and campaigns; \n
Main topic 2: Politics; Subtopic: Elections, campaigns and fraud \n

Based on the information about the topic above, please extract the main topic from each topic. A main topic is everything before the word 'Subtopic'. In this case this word is Politics. Only return the label of the main topic and nothing more in the following format:

[/INST] Main topic 1: Politics; Main topic 2: Politics

"""

In [14]:
# Templated request for step 1.1. The only placeholder is {topics}, which is the output key
# of chain_1.
main_prompt_1_1 = """
[INST]
I have a pair of topics:
{topics}

Based on the information about the topic above, please extract the main topic from each topic. Only return the label of the main topic and nothing more in the following format:
Main topic 1: ; Main topic 2: 
[/INST]
"""

In [15]:
# Full step-1.1 prompt: system message, worked example, then the templated request.
prompt_1_1 = "".join((system_prompt_1_1, example_prompt_1_1, main_prompt_1_1))

In [58]:
# Step-1.1 chain: takes the "topics" produced by chain_1 and stores the extracted main
# topics under "main_topic".
main_topic_input_names = ["topics"]
prompt_template_1 = PromptTemplate(template=prompt_1_1, input_variables=main_topic_input_names)

chain_1_1 = LLMChain(prompt=prompt_template_1, llm=llm, output_key="main_topic")

## Step 2: Evaluate topic level match
Compare the topics of the text pairs and make an evaluation of the match level.  

* We create a second chain for this task that uses the extracted main topics as input

In [17]:
# System message for step 2 (topic-level match evaluation). Combined with example_prompt_2
# and main_prompt_2 into prompt_2.
# The system block is closed with `<</SYS>>`, as in the other system prompts of this
# notebook; without it the example request is read as part of the system message.
system_prompt_2 = """
<s>[INST] <<SYS>>
You are a helpful, respectful, and honest assistant for comparing the main topics of two texts. In this comparison, a match is solely based on the main topic and nothing else.
<</SYS>>
"""

In [18]:
# One-shot example for step 2: two main-topic labels followed by `[/INST]` and the expected
# "Topic Evaluation: ..." answer.
example_prompt_2 = """

The main topic of each text is described by the following labels:

Main topic 1: Politics;  
Main topic 2: Politics; 


Based on the information about the main topics above, please write a short evaluation about whether the two texts match on a main topic level. Make sure to only return the evaluation and nothing more in the following format:

[/INST] Topic Evaluation: Yes, the two texts match on a main topic level because both texts touch upon the broader context of Politics. 
"""

In [19]:
# Templated request for step 2. The only placeholder is {main_topic}, which is the output key
# of chain_1_1.
main_prompt_2 = """
[INST]

The main topic of each text is the following: 
{main_topic}

Based on the information about the topics above, please write a short evaluation about whether the two texts match on a main topic level. Make sure to only return the evaluation and nothing more in the following format:
Topic Evaluation:
[/INST] 
"""

In [20]:
# Full step-2 prompt: system message, worked example, then the templated request.
prompt_2 = "".join((system_prompt_2, example_prompt_2, main_prompt_2))

In [21]:
# Step-2 chain: evaluates the main-topic match and stores the answer under "topic_evaluation".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_2 = PromptTemplate(input_variables=["main_topic"], template=prompt_2)
chain_2 = LLMChain(llm=llm, prompt=prompt_template_2, output_key="topic_evaluation")


### Step 2.1 extract words used to make topic-level evaluation decision

In [22]:
# System message for step 2.1 (keywords behind the topic-level evaluation). Combined with
# example_prompt_2_1 and main_prompt_2_1 into prompt_2_1.
# The system block is closed with `<</SYS>>`, as in the other system prompts of this
# notebook; without it the example request is read as part of the system message.
system_prompt_2_1 = """
<s>[INST] <<SYS>>
You are a helpful, respectful, and honest assistant for comparing the main topics of two texts. In this comparison, a match is solely based on the main topic and nothing else.
Your task is to provide a list of comma separated keywords for each text that you used to make your evaluation. Only provide a list and nothing more. Avoid notes, thank yous and any other filler information. 
<</SYS>>
"""

In [23]:
# One-shot example for step 2.1: the two Dutch articles, their main topics and the topic
# evaluation, followed by `[/INST]` and the expected comma-separated word lists (the list
# for text 1 and the list for text 2 are separated by ';').
example_prompt_2_1 = """

I have a document pair of the following texts:\n
- Contact met de kiezer; Deze keer zie je geen lijsttrekkers in windjacks rondlopen op markten. Kandidaat-Kamerleden moeten noodgedwongen online contact zoeken met de kiezers. Zoals de lijsttrekker van de ChristenUnie, Gert-Jan Segers, te zien is op de bovenstaande afbeelding terwijl hij vragen beantwoordt die kiezers hem stellen op het online platform Instagram. In plaats van direct met de burgers te praten, spreken politici nu voor de camera's. Livestreams op Facebook zijn ook populair. Zo had Mark Rutte dit weekend een gesprek met horeca-ondernemers en zond de VVD dat uit op Facebook. Naast de online campagne werd er dit weekend ook op de ouderwetse manier geflyerd. Maar de meeste campagnevoerende partijleden gingen niet aanbellen uit angst voor verdere verspreiding van het coronavirus. Forum voor Democratie was de enige partij die de straat op ging om campagne te voeren. Met een vrijheidskaravaan bezocht de partij Nijmegen en Venlo voor een manifestatie. Toen er meer dan tweehonderd mensen kwamen opdagen, moest burgemeester Hubert Bruls de bijeenkomst voortijdig beëindigen, hoewel deze wel was aangekondigd en aangevraagd. Een bezoeker in Venlo twitterde dat het centrale plein voor het eerst sinds carnaval vorig jaar weer vol stond met de komst van Baudet.
- Forum voor Democratie op zoek naar extra stemmen; Terwijl andere partijen zich nauwelijks op straat vertonen, reist Forum voor Democratie stad en land af. Deze optredens trekken niet alleen de aandacht van kiezers; het Openbaar Ministerie onderzoekt nu ook of het campagneteam van Baudet de coronaregels op grote schaal heeft overtreden. Volgens getuigen werden bij een bezoek aan Urk honderden handen geschud. En dan was er nog de volmachtrel. In een live-uitzending riep Baudet zijn kijkers op om zoveel mogelijk volmachtstemmen te regelen, aangezien kiezers dit jaar niet twee, maar drie volmachtstemmen mogen uitbrengen om de kans op besmetting te verkleinen. "Een persoon kan eigenlijk vier keer stemmen, als je maar die volmachten kunt regelen," zei Baudet, en dat was een enorme kans. Maar het ministerie van Binnenlandse Zaken zei: "Ho, dat is niet de bedoeling en is niet toegestaan." Het campagneteam van Forum leek daar toen al achter te komen, want de suggestie om stemmen te regelen werd snel uit de video van Baudet geknipt. Baudet houdt echter wel openlijk vast aan zijn standpunt dat er een grote kans is op verkiezingsfraude, maar dan door anderen, uiteraard.

The main topic of each text is described by the following labels:

- Main topic 1: Politics;  
- Main topic 2: Politics; 

The following evaluation describes whether the two texts match on a main topic level:
- Topic Evaluation: Yes, the two texts match on a main topic level because both texts touch upon the broader context of Politics as seen by their main Topic. 


Based on the information above please provide a list of comma separated words that indicate the words that best describe the topic of each text. Make sure to only return the comma separated list of words and nothing else in the following format:
[/INST] Contact met de kiezer, Kandidaat-Kamerleden, Online contact, Instagram, Politici voor de camera's, Livestreams op Facebook, Flyeren, Angst voor verspreiding van het coronavirus, Forum voor Democratie, Vrijheidskaravaan, Burgemeester Hubert Bruls, Bezoeker in Venlo, Centrale plein, Baudet; Forum voor Democratie, Optredens, Openbaar Ministerie, Coronaregels overtreden, Bezoek aan Urk, Handen schudden, Volmachtrel, Volmachtstemmen regelen, Ministerie van Binnenlandse Zaken, Verkiezingsfraude
"""

In [24]:
# Templated request for step 2.1. Placeholders: {text1}, {text2} (chain inputs),
# {main_topic} (output of chain_1_1) and {topic_evaluation} (output of chain_2).
main_prompt_2_1 = """
[INST]

I have a document pair of the following texts:
{text1} and {text2}

The main topic of each text is the following: 
{main_topic}

The following evaluation describes whether the two texts match on a main topic level:
{topic_evaluation}

Based on the information above please provide a list of comma separated words that you used to make this evaluation. Make sure to only return the comma separated list of words and nothing else in the following format:
[/INST]
"""

In [25]:
# Full step-2.1 prompt: system message, worked example, then the templated request.
prompt_2_1 = system_prompt_2_1 + example_prompt_2_1 + main_prompt_2_1

# Step-2.1 chain: stores the words behind the topic evaluation under "topic_words".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_2_1 = PromptTemplate(
    input_variables=["text1", "text2", "main_topic", "topic_evaluation"],
    template=prompt_2_1,
)
chain_2_1 = LLMChain(llm=llm, prompt=prompt_template_2_1, output_key="topic_words")


In [59]:
# Topic-level pipeline: label topics -> extract main topics -> evaluate the match ->
# list the words behind the evaluation. Each chain's output key feeds the next prompt.
topic_chain_inputs = ["text1", "text2", "proper_nouns1", "proper_nouns2", 'keywords1', 'keywords2']
topic_chain_outputs = ["topics", "main_topic", "topic_evaluation", "topic_words"]

overall_chain = SequentialChain(
    chains=[chain_1, chain_1_1, chain_2, chain_2_1],
    input_variables=topic_chain_inputs,
    output_variables=topic_chain_outputs,
    verbose=True,
)

# Smoke test: run the topic-level chain on every row of df2 and print what comes back.
# Maps each chain input variable to the dataframe column that supplies it.
column_for_variable = {
    "text1": 'Text1',
    "text2": 'Text2',
    "proper_nouns1": 'proper_nouns1',
    "proper_nouns2": 'proper_nouns2',
    "keywords1": 'keywords1',
    "keywords2": 'keywords2',
}

for row_idx, row in df2.iterrows():
    input_variables = {
        variable: row[column] for variable, column in column_for_variable.items()
    }

    # The chain returns its inputs together with all declared output variables
    generated_text = overall_chain(input_variables)

    print(generated_text)

## Step 3: Create classification label based on evaluation
Provide single label for the match level

* We create a third chain for this task that uses the texts as well as the extracted topics and evaluation as input.
* Labels: topic match, not topic match

In [26]:
# System message for step 3: turns the written topic evaluation into a single label,
# 0 (no match) or 1 (topic match). Combined with example_prompt_3 and main_prompt_3 into prompt_3.
system_prompt_3 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for classifying whether two texts match on a main topic level based on an evaluation provided. 

At each request excplicitly assign one of the two labels below. 
0 - no match
1 - topic match

Make sure you to only return the label and nothing else.
<</SYS>>
"""

In [27]:
# One-shot example for step 3: a positive topic evaluation followed by `[/INST]` and the
# expected label 1.
example_prompt_3 = """

The following evaluation describes the topic match level:
Yes, the two texts match on a main topic level. Both texts touch upon the broader context of Politics. 
Based on this information, please assign either '0' for no match or '1' for topic match'. Return 0 for no match, and 1 for match. Make sure to only return the label and nothing more in the following format:

[/INST]: 1 
"""

In [28]:
# Templated request for step 3. The only placeholder is {topic_evaluation}, which is the
# output key of chain_2.
main_prompt_3 = """
[INST]

The following evaluation describes the topic match level:
{topic_evaluation}

Based on this information, please assign either '0' for no match or '1' for topic match'. Return 0 for no match, and 1 for match. Make sure to only return the label and nothing more in the following format:

[/INST] 
"""

In [29]:
# Full step-3 prompt: system message, worked example, then the templated request.
prompt_3 = "".join((system_prompt_3, example_prompt_3, main_prompt_3))

In [30]:
# Step-3 chain: maps the topic evaluation to a 0/1 label stored under "match_topic".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_3 = PromptTemplate(input_variables=["topic_evaluation"], template=prompt_3)
chain_3 = LLMChain(llm=llm, prompt=prompt_template_3, output_key="match_topic")


## Step 4: Identify news events
* we ask the model to identify the news event described in each text
* input data remains the same
* this is in preparation of assessing news event level matching similar to topic level matching

In [31]:
# System message for step 4 (news-event identification): defines what a news event is and
# limits the answer to 100 tokens. Combined with example_prompt_4 and main_prompt_4 into prompt_4.
# NOTE(review): the last instruction contains a garbled phrase ("news evene tidentiief");
# presumably "news event identified" was meant — confirm before changing the prompt text.
system_prompt_4 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for idenitifying the news event described in a pair of documents. 
News events are specific events that lead to news coverage, such as a specific debate on a specific day in a specific parliament, a specific accident, or a specific football match. They can be covered by one or more articles in one or more outlets, but relate to one specific and identifiable event and are thus much more fine-grained than news topics, issues, or news categories.
News events can span over multiple days but not more than 10 days. Therefore articles that cover the same news event are published within the same few hours and in the course of a few days. 

Provide your answer as an explanation in maximum 100 tokens. Make sure you to only return the news evene tidentiief and nothing else.
<</SYS>>
"""

In [32]:
# One-shot example for step 4: two Dutch articles with keywords, proper nouns and topic
# labels, followed by `[/INST]` and the expected "Event 1: ...; Event 2: ..." answer.
example_prompt_4 = """
I have a document pair of the following texts:
- Contact met de kiezer; Geen flyerende lijsttrekkers in windjacks op markten deze keer. Kandidaat-Kamerleden zoeken noodgedwongen hun contact met de kiezer online. Zoals ChristenUnie-lijsttrekker Gert-Jan Segers, hierboven afgebeeld terwijl hij vragen beantwoordt die kiezers hem op online platform Instagram stellen. Anders dan praten met de burgers zelf, praten de politici nu tegen camera's. Populair zijn ook livesessies op Facebook. Zo ging Mark Rutte dit weekend in gesprek met horeca-ondernemers en zond de VVD dat uit op Facebook. Naast de online campagne, werd er dit weekend ook ouderwets geflyerd. Maar aanbellen, dat deden de meeste campagnevoerende partijleden niet, uit angst voor verdere verspreiding van het coronavirus. Forum voor Democratie ging er als enige partij wel op uit om campagne te voeren. Met een vrijheidskaravaan deed de partij Nijmegen en Venlo aan voor een manifestatie. Toen er meer dan tweehonderd mensen langskwamen, moest burgemeester Hubert Bruls de bijeenkomst, die wel aangekondigd en aangevraagd was, voortijdig afbreken. Een bezoeker in Venlo twitterde dat met de komst van Baudet het centrale plein voor het eerst sinds carnaval vorig jaar weer vol stond.
- Forum voor Democratie Jacht op extra stemmen; Waar andere partijen zich nauwelijks op straat wagen toert Forum voor Democratie stad en land af. Die optredens trekken niet alleen de aandacht van kiezers het Openbaar Ministerie kijkt inmiddels of Baudets campagneteam niet op grote schaal de coronaregels geschonden heeft. Zo werden bij een bezoek aan Urk volgens getuigen honderden handen geschud. En dan was er nog de volmacht-rel. In een live-uitzending riep Baudet donderdag zijn kijkers op zoveel mogelijk stemmen per volmacht te regelen. Om de besmettingskansen te verkleinen mogen kiezers dit jaar niet twee maar drie volmachtsstemmen uitbrengen. Een persoon kan vier keer stemmen eigenlijk, als je maar die volmachten kunt regelen, aldus Baudet, en dat was een enorme kans. Ho, zei het ministerie van Binnenlandse Zaken dat is niet de bedoeling en mag helemaal niet. Daar leek het campagneteam van Forum toen al achter gekomen de suggestie om stemmen te regelen was door de partij schielijk uit het filmpje van Baudet geknipt. Aan een ander standpunt houdt Baudet wel openlijk vast de grote kans op verkiezingsfraude. Door anderen, uiteraard.

The following keywords appear in each text: 'livesessies', 'vrijheidskaravaan', 'flyerende', 'facebook', 'windjacks'; besmettingskansen', 'volmachtsstemmen', 'schielijk', 'volmachten', 'baudets'
The following proper nouns appear in each text: Gert-Jan Segers, Mark Rutte, Forum voor Democratie, Hubert Bruls, Baudet; Forum voor Democratie, Forum voor Democratie, Ministerie kijkt, Urk, Baudet, Baudet, Baudet, Baudet

The topic of each text is the following:
Main topic 1: Politics; Subtopic: Elections and campaigns; \n
Main topic 2: Politics; Subtopic: Elections, campaigns and fraud \n

Based on the information above, please identify the news events that describe each document. Make sure to only return the news events and nothing more in the following format:

[/INST] Event 1: Most political parties are shifting their campaign activity stategies due to the COVID-19 pandemic; Event 2: Forum voor Democratie party's campaign activities violate COVID-19 regulations while other parties have more pandemic-proof startegies. 

"""

In [33]:
# Templated request for step 4. Placeholders: {text1}, {text2}, {keywords1}, {keywords2},
# {proper_nouns1}, {proper_nouns2} and {topics}.
# The request opens with `[INST]` so that the closing `[/INST]` has a matching opening
# tag, as in the main prompts of the earlier steps.
main_prompt_4 = """
[INST]
I have a document pair of the following texts:
{text1} and {text2}

The following keywords appear in each text: {keywords1} and {keywords2}
The following proper nouns appear in each text: {proper_nouns1}, {proper_nouns2}

The topic of each text is the following:
{topics}

Based on the information above, please identify the news events that describe each document. Make sure to only return the news events and nothing more in the following format:
[/INST] 

"""

In [34]:
# Full step-4 prompt: system message, worked example, then the templated request.
prompt_4 = "".join((system_prompt_4, example_prompt_4, main_prompt_4))

In [35]:
# Step-4 chain: identifies the news event of each text and stores the answer under "news_events".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_4 = PromptTemplate(
    input_variables=["text1", "text2", "proper_nouns1", "proper_nouns2", 'keywords1', 'keywords2', "topics"],
    template=prompt_4,
)
chain_4 = LLMChain(llm=llm, prompt=prompt_template_4, output_key="news_events")

## Step 5: Evaluate news event level match

* we ask the model to compare the news events identified on whether they match 
* input data remains the same plus the dates

In [36]:
# System message for step 5 (news-event match evaluation): repeats the news-event
# definition, ranks event content above date overlap, and limits the answer to 100 tokens.
# Combined with example_prompt_5 and main_prompt_5 into prompt_5.
system_prompt_5 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for evaluating whether two texts pertain to the same news event.
News events are comprised of specific events that lead to news coverage around a news story, such as a specific debate on a specific day in a specific parliament, a specific accident, or a specific football match. \n
They can be covered by one or more articles in one or more outlets, but relate to one specific and identifiable event and are thus much more fine-grained than news topics, issues, or news categories.\n
News events can span over multiple days but not more than 10 days. Therefore articles that cover the same news event are published very close in time, a matter of hours or maximum a few days. 
Different news events can also be published on the same date or on a very close date. \n
The most important criteria for determining whether the two texts pertain to the same news event are the events mentioned in the text. The date overlap is a secondary objective. \n

An event within a news event must refer to a specific event or related developments around the event. A news event can include various articles, reports, and updates from news outlets, all contributing to the coverage of that specific event or issue and its sorrounding aspects. \n
For example, the news event might revolve around the presidential election of a specific year, detailing campaign events, candidate profiles, polling data, and key issues.
Provide your answer as an explanation in maximum 100 tokens. Make sure to only return the evaluation and nothing else.
<</SYS>>
"""

In [37]:
# One-shot example for step 5: two event descriptions and their publishing dates, followed
# by `[/INST]` and the expected "Event Evaluation: ..." answer.
example_prompt_5 = """

The news events of each text is the following:
Event 1: Most political parties are shifting their campaign activity stategies due to the COVID-19 pandemic.\n
Event 2: Forum voor Democratie party's campaign activities violate COVID-19 regulations while other parties have more pandemic-proof startegies. \n


The pubishing dates of the texts is the following:\n
date1: 01/03/2021; date2: 01/03/2021  \n

Based on the information above, please write a short evaluation about whether the two texts match on a news event level. Make sure to only return the evaluation and nothing more in the following format:\n

[/INST] Event Evaluation: Both texts focus on one particular news event, the election campaign and party campaign activities amid the COVID-19 pandemic which is distintive event. Both texts discuss aspects of the same election campaign, political parties and campaign strategies during the pandemic indicating that they pertain to the same news event.
The texts were also published at a similar time and date which further indicates that they belong to the same news event. 
"""

In [38]:
# Templated request for step 5. Placeholders: {news_events} (output of chain_4), {date1}
# and {date2}.
# The request opens with `[INST]` so that the closing `[/INST]` has a matching opening
# tag, as in the main prompts of the earlier steps.
main_prompt_5 = """
[INST]
The news events of each text is the following:
{news_events}

The pubishing dates of the texts is the following:
{date1} and {date2}

Based on the information above, please write a short evaluation about whether the two texts match on a news event level. Make sure to only return the evaluation and nothing more in the following format:
Event Evaluation:
[/INST] 

"""

In [39]:
# Full step-5 prompt: system message, worked example, then the templated request.
prompt_5 = "".join((system_prompt_5, example_prompt_5, main_prompt_5))

In [40]:
# Step-5 chain: evaluates the news-event match and stores the answer under "event_evaluation".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_5 = PromptTemplate(input_variables=["news_events", "date1", "date2"], template=prompt_5)
chain_5 = LLMChain(llm=llm, prompt=prompt_template_5, output_key="event_evaluation")


## Step 6: Create classification label based on evaluation 

Similar to what we do for the topic level match

In [41]:
# System message for step 6: turns the written news-event evaluation into a single label,
# 0 (no match) or 1 (event match). Combined with example_prompt_6 and main_prompt_6 into prompt_6.
system_prompt_6 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for classifying whether two texts match on a news events level based on an evaluation provided. 

At each request excplicitly assign one of the two labels below. 
0 - no match
1 - event match

Make sure you to only return the label and nothing else.
<</SYS>>
"""

In [48]:
# One-shot example for step 6: a positive news-event evaluation followed by `[/INST]` and
# the expected label 1.
example_prompt_6 = """

The following evaluation describes the news event match level:

Both texts focus on one particular news event, the election campaign which is distintive event. Both texts discuss aspects of the same election campaign.\n
The texts were also published at a similar time date which further indicates that they belong to the same news event. 

Based on this information, please assign either '0' for no match or '1' for news event match. Return 0 for no match, and 1 for match. Make sure to only return the label and nothing more in the following format:

[/INST]: 1 
"""

In [49]:
# Templated request for step 6. The only placeholder is {event_evaluation}, which is the
# output key of chain_5.
main_prompt_6 = """
[INST]

The following evaluation describes the news event match level:
{event_evaluation}

Based on this information, please assign either '0' for no match or '1' for news event match'. Return 0 for no match, and 1 for match. Make sure to only return the label and nothing more in the following format:
[/INST] 
"""

In [50]:
# Full step-6 prompt: system message, worked example, then the templated request.
prompt_6 = "".join((system_prompt_6, example_prompt_6, main_prompt_6))

In [51]:
# Step-6 chain: maps the news-event evaluation to a 0/1 label stored under "match_event".
# PromptTemplate only receives the template and its input variables; the former
# `batch_size` / `max_iterations` keyword arguments are not PromptTemplate fields and were dropped.
prompt_template_6 = PromptTemplate(input_variables=["event_evaluation"], template=prompt_6)
chain_6 = LLMChain(llm=llm, prompt=prompt_template_6, output_key="match_event")


## Run the overall chain and save the results into the final df 


In [45]:
# Same helper as in the "Read in data file" section; redefined here so this section can
# be run on its own.
def navigate_up(current_directory, levels):
    """Return the ancestor directory that lies `levels` steps above `current_directory`.

    Each step applies `os.path.dirname` once; with `levels` equal to 0 the path is
    returned unchanged.
    """
    ancestor = current_directory
    for _hop in range(levels):
        ancestor = os.path.dirname(ancestor)
    return ancestor

# Start from the kernel's current working directory; every path below is
# derived from it, so the notebook must be started from its own folder.
current_directory = os.getcwd()

# Number of folder levels between the working directory and the folder that
# contains 'newspaper_data' (4 levels in this case)
levels_to_navigate = 4

# Walk up 'levels_to_navigate' folders; parent_directory is reused later when
# the results are saved.
parent_directory = navigate_up(current_directory, levels_to_navigate)

# Path to the input file: <parent_directory>/newspaper_data/final_2.csv
file_path = os.path.join(parent_directory, 'newspaper_data', 'final_2.csv')

# Load the text pairs into a DataFrame

df = pd.read_csv(file_path)

In [46]:
# Show the loaded frame (the notebook renders a truncated preview) to check the
# columns that the chain inputs are read from.
df

Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,ID1,ID2,proper_nouns1,proper_nouns2,keywords1,keywords2
0,0.646155,Geweld bij antilockdowndemo in Dublin; Bij een...,Esther Ouwehand had niets met de plek waar ze ...,medium,2021-02-27 20:30:26,2021-03-01 00:00:00,NOS liveblog,Algemeen Dagblad,3287531,3286186,"Geweld, Dublin, Leo Varadkar","Esther Ouwehand, Vinex, Esther Ouwehand, Uranu...","['ongeregeldheden', 'antilockdowndemo', 'wapen...","['hoornespolder', 'neptunus', 'rijtjeswoningen..."
1,0.760930,Stelling 5: gevaccineerde burgers moeten als e...,Sportscholen demonstratief open: 'Bewegen is n...,high,2021-02-28 23:18:22,2021-03-01 00:00:00,NOS liveblog,Trouw,3287467,3285332,"Wilders, Marijnissen, Marijnissen, Klaver van ...","Tino Hoogendijk Sport, J","['sneltesten', 'gevaccineerden', 'gevaccineerd...","['instructeur', 'hometrainers', 'housebeat', '..."
2,0.820325,Vraag van de eigenaresse van een couscousbar a...,De uitzending van 1 maart: Gasvrij duurder dan...,high,2021-02-28 22:43:49,2021-03-01 12:06:47,NOS liveblog,Nieuwsuur,3287471,6290579,"Wilders, Nadia, Wilders, Wilders bestrijdt, Na...","Friese Garijp, Noord-Holland Noord, Jan Nieuwe...","['onwenselijk', 'couscousbar', 'couscous', 'ne...","['nieuwenburg', 'gasvrij', 'besmettingen', 'aa..."
3,0.774437,Stelling 4: de rekening van de coronacrisis mo...,Helft potentiële Forumstemmers vermoedt wereld...,high,2021-02-28 22:48:57,2021-03-01 00:00:00,NOS liveblog,De Volkskrant,3287470,3285229,"Marijnissen, CDA, VVD","Forum voor Democratie, Ipsos, Baudet, Urk","['inkomensongelijkheid', 'marijnissen', 'belas...","['ipsos', 'forumstemmers', 'kiesgerechtigden',..."
4,0.731456,Stelling 2: minimaal 10 procent van de bewind...,CPB-doorrekening: risico op maken van uitglije...,high,2021-02-28 22:03:24,2021-03-01 00:00:00,NOS liveblog,Het Financieele Dagblad,3287477,3290566,"GroenLinks, SP, D66, Wilders, Zwarte Piet, Fri...","Is de invloed, CPB, Jean Dohmen, Centraal Plan...","['ronald', 'hilariteit', 'zit', 'westerse', 'm...","['doorrekeningen', 'uitglijer', 'berekeningen'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,0.774394,Gods politiek; Stevo Akkerman Zonder enige waa...,Milieuclubs niet echt blij met doorrekeningen;...,high,2021-03-01 00:00:00,2021-03-01 12:57:21,Trouw,NOS liveblog,3285316,6290507,"Stevo Akkerman Zonder, ChristenUnie, GPV, Albe...","Milieuclubs, Kamers, CDA, GroenLinks, D66, Pvd...","['verkiezingsaffiche', 'albertus', 'heilloze',...","['greenpeace', 'nagestreefde', 'doorrekeningen..."
491,0.844178,Loonkloof is nog altijd niet gedicht; Stond he...,"De doorrekening van de verkiezingsprogramma's,...",high,2021-03-01 00:00:00,2021-03-01 07:12:10,Algemeen Dagblad,NOS nieuws,3286188,6290567,"Loonkloof, VVD, CDA, D66, GroenLinks, euro, SP",Planbureau,"['loonkloof', 'eindsalaris', 'salarissen', 'st...","['klimaatdoelen', 'doorrekeningen', 'verkiezin..."
492,0.682143,Laatste coronanieuws: 4729 nieuwe besmettingen...,"Manifestatie FvD afgebroken, aanhangers bekeur...",medium,2021-02-28 00:00:00,2021-02-28 13:43:06,Het Financieele Dagblad,NOS liveblog,3290679,3287500,"RIVM, RIVM, Pati, RIVM, RIVM, RIVM mede bepaald","Forum voor Democratie, Thierry Baudet","['meldt', '4605', 'coronabesmettingen', 'besme...","['meldt', 'tegendemonstranten', 'opruiing', 'b..."
493,0.845664,Een boer versus D66-leider Kaag over de halver...,Scholen vragen zich af: waar blijven de snelte...,high,2021-02-28 21:41:40,2021-03-01 00:00:00,NOS liveblog,Trouw,3287479,3285335,D66,"CNV, Kleuters, CNV, AOb, Algemene Vereniging S...","['stikstofreductie', 'veestapel', 'langsgaan',...","['sneltesten', 'sneltests', 'amsterdamse', 'sc..."


In [None]:
import time


# Disable pandas' chained-assignment warning for the column writes at the end
# of this cell (global option; not recommended in general).
pd.options.mode.chained_assignment = None

# One list per result column; each gets exactly one entry per processed row.
topics = []
topic_eval = []
match_topic = []
topic_words = []
news_events = []
event_eval = []
match_event = []

# Combine the individual step chains into one sequential chain. The chain does
# not depend on the row being processed, so it is built once here instead of
# being re-created on every iteration.
overall_chain = SequentialChain(
    chains=[chain_1, chain_1_1, chain_2, chain_2_1, chain_3, chain_4, chain_5, chain_6],
    input_variables=[
        "text1", "text2",
        "proper_nouns1", "proper_nouns2",
        'keywords1', 'keywords2',
        'date1', 'date2',
    ],
    output_variables=[
        "topics", "main_topic", "topic_evaluation", "topic_words",
        "match_topic", "news_events", "event_evaluation", "match_event",
    ],
    verbose=True,
)

# Run the chain on every text pair of the DataFrame
for index, row in df.iterrows():
    # Inputs for the chain; the two texts are cast to str before being
    # inserted into the prompts, the other fields are passed as stored.
    input_variables = {
        "text1": str(row['Text1']),
        "text2": str(row['Text2']),
        "proper_nouns1": row['proper_nouns1'],
        "proper_nouns2": row['proper_nouns2'],
        "keywords1": row['keywords1'],
        "keywords2": row['keywords2'],
        "date1": row['Date1'],
        "date2": row['Date2']
    }

    # Time the chain call for this row
    start_time = time.time()
    results = overall_chain(input_variables)
    end_time = time.time()

    # Append results to respective lists ("main_topic" is returned by the
    # chain but not stored).
    topics.append(results['topics'].strip())
    match_topic.append(results['match_topic'].strip())
    topic_words.append(results['topic_words'].strip())
    topic_eval.append(results['topic_evaluation'].strip())
    news_events.append(results['news_events'].strip())
    event_eval.append(results['event_evaluation'].strip())
    match_event.append(results['match_event'].strip())

    # Report how long this row took
    row_processing_time = end_time - start_time
    print(f"Processed row {index} in {row_processing_time:.2f} seconds")

# Add the collected results as new columns; this runs only after every row
# has been processed, so the lists have one entry per DataFrame row.
df.loc[:,'Topic'] = topics
df.loc[:,'Topic_eval'] = topic_eval
df.loc[:,'Topic_match'] = match_topic
df.loc[:,'Topic_words'] = topic_words
df.loc[:,'News_events'] = news_events
df.loc[:,'Event_eval'] = event_eval
df.loc[:,'Event_match'] = match_event



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 0 in 15.56 seconds


[1m> Entering new SequentialChain chain...[0m





[1m> Finished chain.[0m
Processed row 1 in 13.28 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 2 in 18.72 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 3 in 17.16 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 4 in 17.09 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 5 in 15.01 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 6 in 17.56 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 7 in 17.41 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 8 in 16.04 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m
Processed row 9 in 15.49 seconds


[1m> Entering new SequentialChain chain...[0m

[1m> Fin

In [None]:
# Output folder: 'initial_out' inside parent_directory (the folder computed in
# the data-loading cell by walking up from the working directory)
save_folder_path = os.path.join(parent_directory, 'initial_out')

# Create the folder if it doesn't exist (no error when it is already there)
os.makedirs(save_folder_path, exist_ok=True)

# Full path of the result file
save_file_path = os.path.join(save_folder_path, '7b_assisted.csv')
# Write the DataFrame, including the new result columns, without the index
df.to_csv(save_file_path, index=False)

# Quick check: list the distinct values stored in the 'Topic_words' column
df['Topic_words'].unique()