In [1]:
# extracting sentences
import fitz  # PyMuPDF
import nltk
import random
import glob
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

# pair generation
import openai
import json
from category_types import *
import time

### **Extract sentences from documents**

In [None]:
# Download the Punkt sentence-tokenizer data needed by nltk.tokenize.sent_tokenize.
# Older NLTK releases load the pickled 'punkt' models, while newer releases look
# for the 'punkt_tab' resource instead (see the NLTK data documentation). Fetching
# both keeps the notebook runnable on either; a resource that the installed NLTK
# does not know about is reported by the downloader and skipped.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Break the content of every PDF in the folder into individual sentences.
folder_path = "/home/ssever/ContraDoc/data/PDFs"

pdf_files = glob.glob(f"{folder_path}/*.pdf")

# One list of sentences per PDF; a later cell flattens this nested list.
all_sentences = []

for pdf_file in pdf_files:
    doc = fitz.open(pdf_file)
    try:
        # Join the page texts once instead of growing a string page by page.
        full_text = "".join(page.get_text() for page in doc)
    finally:
        # Release the document handle even when text extraction fails.
        doc.close()

    sentences = nltk.tokenize.sent_tokenize(full_text)

    all_sentences.append(sentences)


In [None]:
# Flatten the per-document sentence lists and turn line breaks into spaces.
# Extracted PDF text typically carries a "\n" at the end of each visual line, so
# deleting the newline outright would fuse the last word of one line with the
# first word of the next; a space keeps the words apart.
flattened_and_cleaned_list = [
    sentence.replace("\n", " ")
    for document_sentences in all_sentences
    for sentence in document_sentences
]
print("Number of sentences:", len(flattened_and_cleaned_list))

In [None]:
# Persist every cleaned sentence to a plain-text file, one sentence per line.
filepath = "/home/ssever/ContraDoc/data/text_files/all_sentences"

# Mode 'w' truncates any existing file; each sentence gets its own trailing '\n'.
with open(filepath, 'w') as file:
    file.writelines(sentence + '\n' for sentence in flattened_and_cleaned_list)

print(f"Sentences have been written to {filepath}.")

In [None]:
# Read the stored sentences back, one per line.
# The context manager closes the file even if reading fails.
with open("/home/ssever/ContraDoc/data/text_files/all_sentences", "r") as my_file:
    all_sentences = my_file.read().split('\n')

# A file whose last line ends with '\n' leaves one empty string at the end of the
# split; drop it so the count shown below reflects real sentences only.
if all_sentences and all_sentences[-1] == "":
    all_sentences.pop()

len(all_sentences)

In [None]:
# Keep only sentences whose token count lies between 10 and 45 (both inclusive).
# The chained comparison tokenizes each sentence once instead of twice.
filtered_sentences = [
    sentence
    for sentence in all_sentences
    if 10 <= len(nltk.word_tokenize(sentence)) <= 45
]
print("Number of filtered sentences:", len(filtered_sentences))

In [None]:
# Draw up to 3000 distinct random sentences for pair generation.
# random.sample raises ValueError when asked for more items than the population
# holds, so the request is capped at the number of sentences that passed the filter.
random_sentences = random.sample(filtered_sentences, min(3000, len(filtered_sentences)))

In [None]:
# Locate the sentence with the fewest tokens among the filtered sentences.
# min() returns the first minimal entry, which matches a strict "<" scan from the
# start of the list; the default covers an empty list of sentences.
min_tokens, sentence_with_least_tokens = min(
    ((len(nltk.word_tokenize(candidate)), candidate) for candidate in filtered_sentences),
    key=lambda counted: counted[0],
    default=(float('inf'), ""),
)

In [None]:
# Report the shortest sentence: the value printed is the minimum token count,
# so the label must say "fewest", not "most".
print(f"Sentence with the fewest tokens: \"{sentence_with_least_tokens}\"")
print(f"Number of tokens: {min_tokens}")

### **Create pairs for training set**

In [3]:
# Categories that the pair-generation loop cycles through, in this order.
# NOTE(review): none of these names is defined in this notebook — presumably they
# are provided by the star import of `category_types`; confirm.
category_types = [
    antonym,
    negation,
    numeric,
    factive_embedded_verb,
    factive_antonym,
    structure,
    lexical,
    temporal,
    wk,
]

In [None]:
# Reserve the leading part of the random sample as premises for pair generation.
premise_count = 1500
train_premises = random_sentences[:premise_count]

In [None]:
# Configure the OpenAI client; the key is read from the API_KEY environment variable.
api_key = os.getenv("API_KEY")
openai.api_key = api_key

# Model and sampling settings passed to every completion request.
model = 'gpt-4-turbo-preview'
max_tokens, temperature = 1024, 1

In [None]:
"""Main cell: Contains prompt for pair generation and thus generates all pairs for the NLI dataset"""

# One entry per premise; each entry is a single-element list holding the raw model reply.
all_responses = []

num_index = len(category_types)
index = 0

for premise in train_premises:
    res = openai.ChatCompletion.create(
              model=model,
              max_tokens=max_tokens,
              temperature = temperature,
              messages=[{"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
              in Natural Language Processing. You are especially aware of the work by Marneffe et al., classifying\
              different types of contradictions, such as antonyms, negations, numerical mismatches, factive, structural, lexical, and world knowledge contradictions. To this end,\
              a contradiction is defined as a mismatch between two statements, such that they cannot possibly both be true.\
              It is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated. The premise is provided,\
              you have to create a hypothesis of one of the contradiction types for this premise."},
              {"role": "user", "content": f"Please generate one contradictory hypothesis for a {premise}, based on {category_types[index].description}. The contradictions\
              should be original and reasonably different from each other.\
              Format your response in the following way: {category_types[index].name} P: [PREMISE]. H: [HYPOTHESIS]. Make sure to include {category_types[index].name}"},
              {"role": "assistant", "content": category_types[index].description}],
            )

    # Keep every reply: re-creating the list inside the loop would leave only the
    # reply for the last premise once the loop has finished.
    all_responses.append([res["choices"][0]["message"]["content"]])
    # Advance round-robin through the categories; the modulo keeps `index` a valid
    # position in `category_types` after the loop ends as well.
    index = (index + 1) % num_index
    # Pause between requests (presumably to stay below the API rate limit).
    time.sleep(2)

# The CSV-export cell reads its input from `response`, so expose the complete
# collection (a list of single-reply lists) under that name.
response = all_responses

In [None]:
# Reference copies of the prompt variants (message lists in the chat format used by
# the pair-generation loop). Running this cell only builds the four lists; they are
# not assigned to any name. The f-strings read `premise`, `index` and `category_types`,
# so those must already exist in the kernel.
# Each "\" at a line end continues the string literal on the next line, so the
# leading whitespace of the following line becomes part of the prompt text.

# contradiction prompt:
# NOTE(review): "stated.The" below lacks a space after the full stop — confirm whether intended.

[{"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
in Natural Language Processing. You are especially aware of the work by Marneffe et al., classifying\
different types of contradictions, such as antonyms, negations, numerical mismatches, factive, structural, lexical, and world knowledge contradictions. To this end,\
a contradiction is defined as a mismatch between two statements, such that they cannot possibly both be true.\
It is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated.The premise is provided,\
you have to create a hypothesis of one of the contradiction types for this premise."},
{"role": "user", "content": f"Please generate one contradictory hypothesis for a {premise}, based on {category_types[index].description}. The contradictions\
should be original and reasonably different from each other.\
Format your response in the following way: {category_types[index].name} P: [PREMISE]. H: [HYPOTHESIS]. Make sure to include {category_types[index].name}"},
{"role": "assistant", "content": category_types[index].description}]

# entailment, neutral prompt:

[{"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
in Natural Language Processing. You are aware of the work of classifying entailments and neutral pairs of statements. To this end,\
an entailment is defined in that two statements are entailed if the truth of the second statement follows from the truth of the first statement.\
Statements of neutral pairs do neither entail nor contradict each other.\
In the case of entailment it is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated.\
The Premise is provided, you have to create a hypothesis for this premise."},
{"role": "user", "content": f"Please generate one hypothesis for a {premise}, based on {category_types[index].description}. The hypotheses\
should be original and reasonably different from each other.\
Format your response in the following way: {category_types[index].name} P: [PREMISE]. H: [HYPOTHESIS]. Make sure to include {category_types[index].name}"},
{"role": "assistant", "content": category_types[index].description}]

# numeric mismatch prompt:

[{"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
 in Natural Language Processing. You are especially aware of contradictions, such as numerical mismatches. To this end, A contradiction based on a numerical mismatch means that a contradiction arises\
 between two statements (Premise and Hypothesis) because there are mismatching numbers in premise and hypothesis. The contradiction only changes the numerical values. Don't change anything else in the text.\
 The premise is provided, you have to create a hypothesis for a numerical mismatch for this premise."},
 {"role": "user", "content": f"Please generate numerical mismatch hypothesis for a {premise}, based on {category_types[index].description}.\
 Format your response in the following way: {category_types[index].name} P: [PREMISE]. H: [HYPOTHESIS]. Make sure to include {category_types[index].name}"},
 {"role": "assistant", "content": category_types[index].description}]

# structure prompt:
# NOTE(review): the system message says both "Don't change the subject of the phrase."
# and "Only change the object or the subject of the phrase." — these instructions
# conflict; confirm which one is intended.

[{"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
 in Natural Language Processing. You are especially aware of structural contradictions. To this end, A structural contradiction means that a contradiction arises\
 between two statements (Premise and Hypothesis) because there is a mismatch in the sentence structure. The contradiction only changes the sentence structure.\
 Don't change or add anything to the verb of the sentence. Don't change the subject of the phrase. Only change the object or the subject of the phrase.\
 The premise is provided, you have to create a hypothesis for a structural mismatch for this premise."},
 {"role": "user", "content": f"Please generate a structural mismatch hypothesis for a {premise}, based on {category_types[index].description}. An example of a structural mismatch is given in {category_types[index].instances}\
 Format your response in the following way: {category_types[index].name} P: [PREMISE]. H: [HYPOTHESIS]. Make sure to include {category_types[index].name}"},
 {"role": "assistant", "content": category_types[index].description}]

#### **Store pairs in csv file**

In [None]:
import csv

# Split each generated reply into premise, hypothesis and label.
# Expected reply shape (requested by the prompt): "<label> P: <premise> H: <hypothesis>".

# NOTE(review): only the first 167 parsed rows are written to the file — confirm
# that this cap is still intended.
max_rows = 167

split_data = []
skipped_replies = 0
for item in response:
    # Accept both a bare reply string and a list of reply strings per entry.
    replies = [item] if isinstance(item, str) else item
    for reply in replies:
        if reply.startswith('P'):
            # No label in front of the premise marker: such replies are not exported.
            skipped_replies += 1
            continue
        label, premise_marker, remainder = reply.partition(' P: ')
        premise, hypothesis_marker, hypothesis = remainder.partition(' H: ')
        if not (premise_marker and hypothesis_marker):
            # The reply does not follow the requested format; skip it instead of
            # failing with an IndexError on the missing part.
            skipped_replies += 1
            continue
        split_data.append([premise, hypothesis, label])

print(f"Parsed {len(split_data)} replies, skipped {skipped_replies}.")

with open('/home/ssever/ContraDoc/data/csv_files/gpt_contradictions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["premise", "hypothesis", "label"])
    writer.writerows(split_data[:max_rows])

##### **Combine contradictions CSV file with entailment and neutral CSV file**

In [None]:
import pandas as pd

# Locations of the two input CSV files and of the merged result.
file1_path = '/home/ssever/ContraDoc/data/csv_files/gpt_contradictions.csv'
file2_path = '/home/ssever/ContraDoc/data/csv_files/gpt_entail_neutral.csv'
combined_csv_path = '/home/ssever/ContraDoc/data/csv_files/combined_data_set.csv'

# Read both files, then stack the rows of the second below the first with a fresh index.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1_path, file2_path))
combined_df = pd.concat([df1, df2], ignore_index=True)

# Write the merged table without the index column.
combined_df.to_csv(combined_csv_path, index=False)

print(f'Combined CSV saved to {combined_csv_path}')

#### **Clean and transform csv file**

In [None]:
# Add unique IDs to dataset

import pandas as pd
import numpy as np
import random
import string

# Function to generate a random alphanumeric id
def generate_unique_id(length=10):
    """Return a random string of `length` ASCII letters and digits.

    The function itself does not guarantee uniqueness across calls; the caller
    has to de-duplicate the generated values.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# CSV is loaded into a DataFrame
df = pd.read_csv('/home/ssever/ContraDoc/data/csv_files/combined_data_set.csv')

# Generate unique ids
# This cell writes back to the file it reads, so on a second run the 'id' column
# is already present and df.insert would raise ValueError; ids are therefore only
# attached when the column is missing.
unique_ids = set()
if 'id' not in df.columns:
    # A set discards duplicates, so keep drawing until there is one id per row.
    while len(unique_ids) < len(df):
        unique_ids.add(generate_unique_id())

    # Insert ids into table as the first column
    df.insert(0, 'id', list(unique_ids))

# make all labels lowercase
df['label'] = df['label'].str.lower()

# align elements to the left (affects only the Styler object, not the CSV written below)
styled_df = df.style.set_properties(**{'text-align': 'left'})
styled_df.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

# Insert into CSV file
df.to_csv('/home/ssever/ContraDoc/data/csv_files/combined_data_set.csv', index=False)

In [None]:
# Change label string values to numeric values

df = pd.read_csv('/home/ssever/ContraDoc/data/csv_files/combined_data_set.csv')

# Strip leading and trailing spaces from the 'label' column
df['label'] = df['label'].str.strip()

# Identify unique words and sort them to maintain consistency.
# Missing labels are left out: sorted() cannot compare NaN with strings.
unique_words = sorted(df['label'].dropna().unique())

# Create a mapping from words to digits
word_to_digit = {word: i for i, word in enumerate(unique_words)}

# Apply the mapping to the 'label' column (rows without a label stay NaN)
df['label_digit'] = df['label'].map(word_to_digit)

# Rename the original 'label' column to 'label_string'; the numeric column becomes 'label'
df = df.rename(columns={'label': 'label_string', 'label_digit': 'label'})

# Adjusting the column order, ensuring 'label_string' is positioned next to 'label'
leading_columns = ['id', 'premise', 'hypothesis', 'label', 'label_string']
columns_order = leading_columns + [col for col in df.columns if col not in leading_columns]
# Apply the order; computing the list alone leaves the DataFrame unchanged.
df = df[columns_order]

# Shuffle the DataFrame
shuffled_df = df.sample(frac=1).reset_index(drop=True)

# Save the modified DataFrame back to a CSV
shuffled_df.to_csv('/home/ssever/ContraDoc/data/csv_files/nli_data_set.csv', index=False)