In [1]:
# Loader
from langchain_community.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader

# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Store
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma

# Tokenization
from transformers import AutoTokenizer

# Language model
from transformers import RobertaForSequenceClassification

# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset, TensorDataset
from torch.nn.functional import softmax

# Additional libraries
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import time
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

#### **Load and split new document**

In [2]:
# Load the press release PDF; the result of .load() is kept under the name `loader`
loader = PyPDFLoader(
    "/home/ssever/contradiction-detection/data/Inject/Press_release_q1_2024.pdf"
).load()

# Recursive character splitter configured with chunk_size 400 and chunk_overlap 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    add_start_index=False,
)
split_text = text_splitter.split_documents(loader)

In [3]:
# Keep only the text (page_content) of every split document
chunks = [piece.page_content for piece in split_text]
print(f"Number of chunks in new document: {len(chunks)}")

Number of chunks in new document: 32


#### **Similarity search**

In [4]:
# Embedding function handed to Chroma for the persisted knowledge-base store
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L12-v2")
db = Chroma(
    persist_directory="/home/ssever/contradiction-detection/src/vectorization/vector_store/kb_langchain",
    embedding_function=embedding_function,
)

In [5]:
# Number of knowledge-base hits requested for every chunk of the new document
k = 2

# Hits are appended chunk by chunk, so the hits for chunk i start at position k * i
# (as long as every query returns k results)
knowledge_chunks = []
for chunk in chunks:
    docs = db.similarity_search_with_score(chunk, k=k)
    knowledge_chunks.extend(hit.page_content for hit, _score in docs)

(f"Number of similar chunks in knowledge base: {len(knowledge_chunks)}")

'Number of similar chunks in knowledge base: 64'

#### **Create and clean pairs**

In [6]:
# Cleaning each premise
# Cleaning each premise: the substitutions below are applied in the listed order
premise_rules = (
    (r'\n', ''),      # Remove new lines
    (r'\s+,', ','),   # Remove whitespace before comma
    (r'\s+\.', '.'),  # Remove whitespace before period
)

cleaned_premises = []
for raw_premise in chunks:
    premise_text = raw_premise
    for pattern, replacement in premise_rules:
        premise_text = re.sub(pattern, replacement, premise_text)
    cleaned_premises.append(premise_text)

print(len(cleaned_premises))

32


In [7]:
# Cleaning each hypothesis
def _tidy_hypothesis(text):
    """Remove newlines, then any whitespace directly before a comma, then before a period."""
    without_newlines = re.sub(r'\n', '', text)
    commas_fixed = re.sub(r'\s+,', ',', without_newlines)
    return re.sub(r'\s+\.', '.', commas_fixed)


# Cleaning each hypothesis
cleaned_hypotheses = [_tidy_hypothesis(chunk) for chunk in knowledge_chunks]

print(len(cleaned_hypotheses))

64


In [None]:
#similar_pairs = list(zip(cleaned_premises, cleaned_hypotheses))
#print(f"Number of chunk pairs: {len(similar_pairs)}")

In [8]:
# Result list
# Each premise is paired with its own run of `num_pairs` hypotheses:
# the hypotheses for premise i sit at positions num_pairs * i ... num_pairs * i + num_pairs - 1
num_pairs = k
similar_pairs = [
    (premise, cleaned_hypotheses[num_pairs * position + offset])
    for position, premise in enumerate(cleaned_premises)
    for offset in range(num_pairs)
]

print(f"Number of chunk pairs: {len(similar_pairs)}")

Number of chunk pairs: 64


#### **Classification of pairs**

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device type:', device)
print('There are %d GPU(s) available.' % torch.cuda.device_count())

# get_device_name() raises on a machine without CUDA, so only query it when a GPU exists;
# this keeps the CPU fallback above usable.
if torch.cuda.is_available():
    print('GPU is:', torch.cuda.get_device_name(0))
else:
    print('GPU is: none (running on CPU)')

Device type: cuda
There are 1 GPU(s) available.
GPU is: NVIDIA GeForce RTX 4050 Laptop GPU


In [13]:
# Checkpoint to evaluate (alternatives are kept commented out for quick switching)
model_path = '/home/ssever/contradiction-detection/model/roberta-base.pt'
#model_path = '/home/user123/contradictory-information/model/xlm-roberta-base-snli-mnli-anli-xnli.pt'
#model_path = '/home/user123/contradictory-information/model/roberta-large.pt'

# Tokenizer matching the selected checkpoint
tokenizer_path = 'roberta-base'
#tokenizer_path = 'symanto/xlm-roberta-base-snli-mnli-anli-xnli'
#tokenizer_path = 'roberta-large'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# map_location lets a checkpoint that was saved on a GPU be loaded when `device` is the CPU;
# without it torch.load tries to restore the tensors onto the device they were saved from.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# Newer PyTorch releases default to weights_only=True, which may reject a fully pickled
# model object - confirm against the installed version.
model = torch.load(model_path, map_location=device)
model.to(device)

MAX_LENGTH = 256


def encode_sets(data, tokenizer):
    """Tokenize a batch of inputs and return the encodings as three LongTensors.

    Args:
        data: Batch handed unchanged to ``tokenizer.batch_encode_plus``; the notebook
            passes a list of ``(premise, hypothesis)`` tuples.
        tokenizer: Object exposing ``batch_encode_plus`` whose result offers the
            attributes ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of ``torch.LongTensor``.
        Every sequence is truncated/padded to ``MAX_LENGTH`` by the tokenizer options.
    """
    encoded = tokenizer.batch_encode_plus(
        data,
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    return tuple(
        torch.LongTensor(getattr(encoded, field))
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    )


# Encode similar pairs
input_ids, attention_masks, token_type_ids = encode_sets(similar_pairs, tokenizer)
pairs_tensor = TensorDataset(input_ids, attention_masks, token_type_ids)
pairs_dataloader = DataLoader(pairs_tensor, sampler=SequentialSampler(pairs_tensor), batch_size=len(similar_pairs))

model.eval()

# Logits are collected per batch and concatenated afterwards, so the results stay complete
# even if the batch size is lowered later (previously each batch overwrote the last one).
batch_logits = []
inference_time = 0.0

for batch in pairs_dataloader:

    with torch.no_grad():
        batch_ids, batch_masks, batch_types = (batch[0].to(device),
                                               batch[1].to(device),
                                               batch[2].to(device))
        # CUDA kernels run asynchronously: wait for pending work before and after the
        # forward pass, otherwise the timer mostly measures the kernel launch.
        if device.type == 'cuda':
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        outputs = model(batch_ids, token_type_ids=batch_types, attention_mask=batch_masks)
        if device.type == 'cuda':
            torch.cuda.synchronize()
        end_time = time.perf_counter()

    inference_time += end_time - start_time
    batch_logits.append(outputs.logits.detach())

# One row of logits per pair, in the order of `similar_pairs` (SequentialSampler)
logits = torch.cat(batch_logits, dim=0)
model_preds = logits.cpu().numpy()

# Total time spent in the forward passes
print(f"\nInference time: {inference_time:.4f} seconds\n")
predictions = np.argmax(model_preds, axis=1)

# target names according to 3 label or 11 label model
# The check uses the checkpoint file name instead of one absolute path, so the 3-label
# names are also selected when the XLM-R checkpoint is stored in a different directory
# (the commented-out alternative for model_path points to another directory).
if model_path.endswith('xlm-roberta-base-snli-mnli-anli-xnli.pt'):
    target_names = [
        "entailment",
        "neutral",
        "contradiction",
    ]
else:
    target_names = [
        "antonymity",
        "entailment",
        "factive_antonymity",
        "factive_embedding_verb",
        "lexical",
        "negation",
        "neutral",
        "numeric",
        "structure",
        "temporal",
        "worldknowledge",
    ]
    
# Tally the predicted class indices and key the tally by label name
count = {target_names[class_id]: hits for class_id, hits in Counter(predictions).items()}

# Printing the count of each predicted label
print(f"Predictions:\n{count}")

# Number of classified pairs
total_count = sum(count.values())

# Percentage of pairs labelled entailment (0 when the label was never predicted)
entailment_share = (count["entailment"] / total_count) * 100 if 'entailment' in count else 0

# Percentage of pairs labelled neutral (0 when the label was never predicted)
neutral_share = (count["neutral"] / total_count) * 100 if 'neutral' in count else 0

# Every label other than entailment/neutral counts as a contradiction
keys = {'entailment', 'neutral'}
contradiction_keys = set(count.keys()) - keys
if not contradiction_keys:
    contradiction_share = 0
else:
    contradiction_labels = [label for label in count if label not in keys]
    contradiction_count = sum(count[label] for label in contradiction_labels)
    contradiction_share = (contradiction_count / total_count) * 100

print(f"\nDistribution:\nEntailment: {entailment_share:.2f}%, Neutral: {neutral_share:.2f}%, Contradictions: {contradiction_share:.2f}%")


Inference time: 0.2118 seconds

Predictions:
{'entailment': 25, 'numeric': 18, 'temporal': 7, 'worldknowledge': 3, 'antonymity': 4, 'lexical': 1, 'structure': 6}

Distribution:
Entailment: 39.06%, Neutral: 0.00%, Contradictions: 60.94%


In [None]:
# Highest softmax probability per pair, rounded to two decimals
class_probabilities = softmax(logits, dim=1)
probabilities = [round(max(row), 2) for row in class_probabilities.tolist()]

# Label name for every predicted class index
labels = [target_names[number] for number in predictions]

# One [label, premise, hypothesis] row per pair; the probability is computed above
# but is not part of the printed rows
classifications = [
    [label, premise, hypothesis]
    for (premise, hypothesis), label, _probability in zip(similar_pairs, labels, probabilities)
]
for number, pair in enumerate(classifications):
    print(number, pair)

In [None]:
# Hand-written contradiction examples as [premise, hypothesis, label] rows.
# The original cell prefixed every row with a bare yes/no marker (e.g. [yes'...']), which is
# not valid Python and made the cell fail with a SyntaxError. The marker is preserved as a
# trailing comment on each row.
# NOTE(review): the notebook does not say what yes/no stands for - presumably whether the
# pipeline flagged the pair; confirm with the author.
manual_contradictions = [
    ['The KION Group started the financial year 2024 with revenue growth and improved profitability.',
     'The KION Group just finished the financial year 2024 and started 2025 with revenue growth and improved profitability.',
     'temporal'],  # yes
    ['Group revenue in the first quarter of 2024 grew by 2.8 percent to € 2.859 billion yearon-year (Q1 2023: € 2.781 billion).',
     'Group revenue in the first quarter of 2024 grew by 5.4 percent to € 2.859 billion yearon-year (Q1 2023: € 2.781 billion).',
     'numeric'],  # yes
    ['In the Industrial Trucks & Services segment, revenue increased by 7.4 percent to € 2.153 billion (Q1 2023: € 2.005 billion), mainly due to the positive geographic and product mix as well as higher production output and sales prices. The service business also grew.',
     'In the Industrial Trucks & Services segment, revenue decreased by 7.4 percent to € 2.153 billion (Q1 2023: € 2.005 billion), mainly due to the negative geographic and product mix as well as higher production output and sales prices.',
     'antonymity'],  # yes
    ['The Supply Chain Solutions segment benefited from increased demand from pure ecommerce providers, general merchandise, and food retailers, but order intake in the project business remained subdued in the first three months impacted by customers’ hesitancy to sign new contracts due to macroeconomic uncertainties.',
     'The Supply Chain Solutions segment did not benefit from increased demand from pure e-commerce providers, general merchandise, and food retailers, but order intake in the project business remained subdued in the first three months impacted by customers’ hesitancy to sign new contracts due to macroeconomic uncertainties.',
     'negation'],  # no
    ['This was mainly due to the continued stability of material purchase prices, increased productivity as a result of improved material availability and revenue growth.',
     'This was to a small degree due to the continued stability of material purchase prices, increased productivity as a result of improved material availability and revenue growth.',
     'factive_antonymity'],  # yes
    ['The KION Group is one of the world’s leading providers of industrial trucks and supply chain solutions.',
     'The KION Group has removed industrial trucks from its product portfolio and focuses on supply chain solutions.',
     'worldknowledge'],  # yes
    ['Based on revenue for the year 2022, the KION Group is the leading overseas manufacturer in China, and including domestic manufacturers, the third-largest supplier there.',
     'Based on revenue for the year 2022, the KION Group is the leading overseas manufacturer in China, and including domestic manufacturers, the second-largest supplier there.',
     'structure'],  # yes
    ['The KION Group is also one of the world’s leading warehouse automation providers, based on 2022 revenue.',
     'The KION Group has lost its leading position as a warehouse automation provider, based on 2022 revenue.',
     'factive_embedding_verb'],  # yes
    ['At the end of 2023, more than 1.8 million industrial trucks of the KION Group were in use by customers from all manner of sectors and of varying sizes on six continents.',
     'At the end of 2023, more than 1.8 million industrial trucks of the KION Group were in use by customers from a small number of sectors and of same sizes on six continents.',
     'lexical'],  # yes
]

# [hypothesis, label] for every manual example (the premise is dropped)
reverse = [[hypothesis, label] for [premise, hypothesis, label] in manual_contradictions]

In [None]:
# Display the [hypothesis, label] rows derived from manual_contradictions
reverse

In [None]:
# Hard-coded rows in the order [label, premise, hypothesis]; a later cell moves the label
# to the end of every row. Two rows appear twice in this list (the 'structure' row and the
# 'antonymity' row).
# NOTE(review): the texts look like raw PDF chunks paired with the label the model assigned -
# confirm the provenance with the author.
my_contradictions = [['temporal', '\uf0a7 Adj. EBIT  margin of  7.9 percent  (Q1 2023: 5.6 percent)  \uf0a7 Positive  free cash flow of € 65.7 million  (Q1 2023: € 104.9 million ) \uf0a7 Full year 2024 outlook confirmed   Frankfurt am Main, April 25th,  2024  – The KION Group started the financial year 2024 with revenue  growth and improved  profitability. The adjusted EBIT margin of 7.9', '▪ Adj. EBIT  improves  to € 226.7  million  (Q1 2023:  € 156.0  million)  ▪ Adj. EBIT margin  of 7.9 percent  (Q1 2023:  5.6 percent)  ▪ Positive  free cash  flow of € 65.7 million  (Q1 2023:  € 104.9  million)  ▪ Full year 2024  outlook  confirmed   Frankfurt am Main, April 25th, 2024 – The KION Group just finished  the financial year'],

['structure', 'Group revenue in the first quarter of 2024 grew by 2.8 percent to € 2.859 billion year -on-year (Q1 2023: € 2.781 billion). In the Industrial Trucks & Services  segment, revenue increased by 7.4 percent  to € 2.153 billion ( Q1 2023: € 2.005 billion), mainly due to the positive geographic and product mix as well as  higher  production output  and', 'a strong  foundation  to deliver  our full year  guidance."  Group revenue in the first quarter of 2024 grew by 5.4 percent  to € 2.859 billion year - on-year ( Q1 2023: € 2.781 billion).  In the Industrial Trucks & Services segment,  revenue decreased  by 7.4 percent to € 2.153 billion ( Q1 2023: € 2.005 billion), mainly'],

['structure', 'Group revenue in the first quarter of 2024 grew by 2.8 percent to € 2.859 billion year -on-year (Q1 2023: € 2.781 billion). In the Industrial Trucks & Services  segment, revenue increased by 7.4 percent  to € 2.153 billion ( Q1 2023: € 2.005 billion), mainly due to the positive geographic and product mix as well as  higher  production output  and', 'a strong  foundation  to deliver  our full year  guidance."  Group revenue in the first quarter of 2024 grew by 5.4 percent  to € 2.859 billion year - on-year ( Q1 2023: € 2.781 billion).  In the Industrial Trucks & Services segment,  revenue decreased  by 7.4 percent to € 2.153 billion ( Q1 2023: € 2.005 billion), mainly'],

['negation', 'the lower  project business order intake of the previous  quarters.   The Supply Chain Solutions segment benefited from increased  demand from pure e -commerce providers, general merchandise,  and food retailers, but order intake in the project business  remained subdued in the first three months impacted by customers’', 'Solutions  segment  declined by 8.1 percent to € 718.9 million ( Q1 2023: € 782.5 million) due to  the lower  project  business  order  intake  of the previous  quarters.  The Supply Chain Solutions segment did not benefit  from increased demand from pure e - commerce providers, general merchandise, and food retailers, but order'],

['entailment', 'Page 2 development of the previous quarters and achieved a double- digit adjusted EBIT margin of 11.1 percent  (Q1 2023: 8.8  percent ) with an adjusted EBIT of € 239.7 million (Q1 2023: € 176.6 million). This was mainly due to the continued stability of material purchase prices, increased productivity as a result of improved  material availability and revenue growth.', 'Page  2   development of the previous quarters and achieved a double -digit adjusted EBIT  margin  of 11.1 percent  (Q1 2023:  8.8 percent)  with an adjusted  EBIT  of € 239.7  million  (Q1 2023: € 176.6 million). This was  to a small degree  due to the continued stability of material  purchase prices, increased productivity as a result of improved material'],

['entailment', 'Free Cash Flow  715.2   550 – 670  –  –  –  – ROCE   7.7%   7.4% – 8.8%   –  –  –  –  1 Disclosures for the Industrial Trucks & Services and Supply Chain Solutions segments also include intra -group cross -segment revenue and effects on EBIT.     The Company  The KION Group is one of the world’s leading providers of industrial trucks and supply', 'Free Cash Flow  ROCE   KION  Group  Industrial Trucks  & Services  Supply Chain  Solutions  1 Disclosures for the Industrial Trucks & Services and Supply Chain Solutions segments also include intra -group cross -segment revenue and  effects  on EBIT.          The Company   The KION Group has removed industrial trucks from its product portfolio and focuses'],

['antonymity', 'the KION Group is the leading overseas manufacturer in China, and including domestic manufacturers, the third- largest supplier there. The KION Group is also one of the world’s leading warehouse automation providers, based on 2022  revenue. At the end of 2023, more than 1. 8 million industrial trucks of the KION Group were in', 'the KION Group is the leading overseas manufacturer in China, and including domestic  manufacturers, the second -largest  supplier there. The KION Group has lost its leading position as a warehouse automation provider, based on 2022 revenue.  At the end of 2023, more than 1.8 million industrial trucks of the KION Group were in'],

['antonymity', 'the KION Group is the leading overseas manufacturer in China, and including domestic manufacturers, the third- largest supplier there. The KION Group is also one of the world’s leading warehouse automation providers, based on 2022  revenue. At the end of 2023, more than 1. 8 million industrial trucks of the KION Group were in', 'the KION Group is the leading overseas manufacturer in China, and including domestic  manufacturers, the second -largest  supplier there. The KION Group has lost its leading position as a warehouse automation provider, based on 2022 revenue.  At the end of 2023, more than 1.8 million industrial trucks of the KION Group were in'],

['entailment', 'At the end of 2023, more than 1. 8 million industrial trucks of the KION Group were in use by customers from all manner of sectors and of varying sizes on six continents. The group currently has more than  42,000 employees and generated revenue of approx. € 11.4 billion in the 2023 financial year.', 'At the end of 2023, more than 1.8 million industrial trucks of the KION Group were in  use by customers from a  small number  of sectors and of same  sizes on six continents.  The group currently has more than 42,000 employees and generated revenue of  approx.  € 11.4 billion  in the 2023  financial  year.']
]

In [None]:
# Move the label from the front of every row to the end:
# [label, premise, hypothesis] -> [premise, hypothesis, label].
# NOTE: the cell rebinds `my_contradictions`, so running it a second time without first
# re-running the cell that defines the list rotates the rows again.
my_contradictions = [
    [first_text, second_text, tag]
    for tag, first_text, second_text in my_contradictions
]

#### **Store results in persistent variable**

In [None]:
# Bind the reordered rows to the name under which they are stored
langchain_chunks = my_contradictions
# IPython %store magic: persists the variable so it can be restored in another
# kernel session with `%store -r langchain_chunks`
%store langchain_chunks

#### **Evaluation graph**

In [None]:
# Data
categories = ['A', 'B', 'C', 'D', 'E']
values = [6, 2, 0, 6, 5]

# Creating the bar diagram
plt.figure(figsize=(6, 3))
plt.bar(categories, values, color='blue')
plt.title('Bar Diagram')
plt.xlabel('Labels')
plt.ylabel('Values')

# Show the plot
plt.show()