In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# import category types
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '/home/ssever/ContraDoc/src/data_generation')
from category_types import *

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device type:', device)
print('There are %d GPU(s) available.' % torch.cuda.device_count())
# Asking for a device name raises when no CUDA device is present, so the
# name is only reported when the GPU was actually selected above.
if device.type == 'cuda':
    print('GPU is:', torch.cuda.get_device_name(0))

In [None]:
# Hugging Face authentication for the Llama 3 download

#from huggingface_hub import login
#login()

In [None]:
# Checkpoint identifier shared by the tokenizer and the model weights.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Weights are loaded in bfloat16 and placed according to `device`.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
#%store -r langchain_chunks
#%store -r langchain_sentences
#%store -r custom_chunks
%store -r custom_sentences

# Category names; their order fixes which description each one is paired with below.
target_names = [
    "antonym", "entailment", "factive_antonym", "factive_embedding_verb",
    "lexical", "negation", "neutral", "numeric", "structure", "temporal",
    "worldknowledge",
]

# Map every category name to the description text of the matching object
# from category_types. The tuple is positionally aligned with target_names
# (note: "factive_embedding_verb" pairs with factive_embedded_verb, and
# "worldknowledge" with wk).
labels = dict(zip(target_names, (
    antonym.description,
    entailment.description,
    factive_antonym.description,
    factive_embedded_verb.description,
    lexical.description,
    negation.description,
    neutral.description,
    numeric.description,
    structure.description,
    temporal.description,
    wk.description,
)))

# Data set to (re-)classify. The alternatives langchain_chunks,
# langchain_sentences and custom_chunks can be swapped in here once they
# have been restored with the matching %store -r line.
statement_pair = custom_sentences

# Re-classify every statement pair with the chat model and collect the raw
# label strings, one per pair, in the same order as statement_pair.
predictions = []

# Generation stops at either the tokenizer's EOS token or <|eot_id|>.
# These ids do not depend on the pair, so they are resolved once here
# instead of on every iteration.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

for pair in statement_pair:

    # Each item is indexed as [premise, hypothesis, current label]; the
    # current label selects the description injected into the system prompt.
    label_description = labels[pair[2]]

    messages = [
        {"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
        in Natural Language Processing. You are especially aware of the work by Marneffe et al., classifying\
        different types of contradictions, such as antonyms, negations, numerics, factive, structural, lexical, and world knowledge contradictions. To this end,\
        a contradiction is defined as a mismatch between two statements, such that they cannot possibly both be true. Furthermore, you are aware of the work of classifying entailments and\
        neutral pairs of statements. To this end, an entailment is defined in that two statements are entailed if the truth of the second statement follows from the truth of the first statement.\
        Statements of neutral pairs do neither entail nor contradict each other.\
        It is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated. Premise, Hypothesis and the classification are provided in the following\
        format ['Premise', 'Hypothesis', 'classification'].\
        You have to check if the pair of statements is labeled with the correct classifcation, that is with the correct contradiction type or correct entailment or correct neutral.\
         A more detailed description of the current classification \
        is given as {label_description}. Furthermore, Revenue, EBIT, EBIT margin and EBITDA are different types of financial measurements, which is why sentence pairs that\
        have different values but use different measures generally don't contradict each other. If you see different years in Premise and Hypothesis, especially within brackets \
        that means that the information in Premise refers to a different time than the one in the Hypothesis, thus not a contradiction. If either Premise or Hypothesis contains only something\
        like \uf0a7 Adj. then that is a neutral pair".format(label_description=label_description)},
        {"role": "user", "content": "Check the statement pair: {statement_pair} and output the correct classification. Pairs which are duplicates are to be seen as entailments.\
         If it is a contradiction, only give the contradiction type. Don't add the word contradiction after the type.\
        Give no explanation, only the classification in its pure string form, don't add brackets or anything else to the string.".format(statement_pair=pair)},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # One unpadded prompt per call, so every position is attended to.
    # ones_like allocates on the device of input_ids, which keeps the mask
    # next to the inputs without a separate transfer.
    attention_mask = torch.ones_like(input_ids)

    # NOTE(review): temperature/top_p only influence the output when sampling
    # is enabled (do_sample=True here or in the model's generation config) -
    # confirm which applies for this checkpoint.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=512,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.01,
        top_p=0.5,
    )

    # Drop the echoed prompt tokens; only the newly generated tail is the answer.
    response = outputs[0][input_ids.shape[-1]:]
    # Strip surrounding whitespace so the stored label is the bare string
    # that later cells lower-case and compare.
    decoded_repsonse = tokenizer.decode(response, skip_special_tokens=True).strip()

    predictions.append(decoded_repsonse)

In [None]:
# Keep only the two statements of every triple; the previous label is discarded.
new_pairs = [[first, second] for first, second, _previous_label in statement_pair]

# Attach the model's answer (lower-cased) to its statement pair, by position.
classifications = [
    [first, second, predicted.lower()]
    for (first, second), predicted in zip(new_pairs, predictions)
]

# Disabled variants that split the (pair, label) tuples into entailment/neutral vs. contradiction types:
#entailment_neutral = [[premise, hypothesis, label.lower()] for ([premise, hypothesis], label) in classifications if label.lower() == 'entailment' or label.lower() == 'neutral']
#len(entailment_neutral)
#contradictions = [[premise, hypothesis, label.lower()] for ([premise, hypothesis], label) in classifications if label.lower() != 'entailment' and label.lower() != 'neutral']
#len(contradictions)

# Displayed as the cell output.
classifications

### **Backup**

In [None]:
# Re-classification prompt (backup copy).
# Builds the two-message chat prompt that asks the model to verify an existing
# label. This cell defines neither `label_description` nor `pair`; both are
# read from the kernel namespace, so it only runs after they have been set
# (for example by the classification loop earlier in the notebook).
messages = [
        # System message: task framing plus the description of the current label.
        {"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
        in Natural Language Processing. You are especially aware of the work by Marneffe et al., classifying\
        different types of contradictions, such as antonyms, negations, numerics, factive, structural, lexical, and world knowledge contradictions. To this end,\
        a contradiction is defined as a mismatch between two statements, such that they cannot possibly both be true. Furthermore, you are aware of the work of classifying entailments and\
        neutral pairs of statements. To this end, an entailment is defined in that two statements are entailed if the truth of the second statement follows from the truth of the first statement.\
        Statements of neutral pairs do neither entail nor contradict each other.\
        It is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated. Premise, Hypothesis and the classification are provided in the following\
        format ['Premise', 'Hypothesis', 'classification'].\
        You have to check if the pair of statements is labeled with the correct classifcation, that is with the correct contradiction type or correct entailment or correct neutral.\
         A more detailed description of the current classification \
        is given as {label_description}. Furthermore, Revenue, EBIT, EBIT margin and EBITDA are different types of financial measurements, which is why sentence pairs that\
        have different values but use different measures generally don't contradict each other. If you see different years in Premise and Hypothesis, especially within brackets \
        that means that the information in Premise refers to a different time than the one in the Hypothesis, thus not a contradiction. If either Premise or Hypothesis contains only something\
        like \uf0a7 Adj. then that is a neutral pair".format(label_description=label_description)},
        # User message: the pair itself (formatted via str.format) and the output-format rules.
        {"role": "user", "content": "Check the statement pair: {statement_pair} and output the correct classification. Pairs which are duplicates are to be seen as entailments.\
         If it is a contradiction, only give the contradiction type. Don't add the word contradiction after the type.\
        Give no explanation, only the classification in its pure string form, don't add brackets or anything else to the string.".format(statement_pair=pair)},
    ]


In [None]:
# Classification prompt (backup copy).
# Builds the two-message chat prompt that asks the model to assign a label to
# a ['Premise', 'Hypothesis'] pair from scratch. The system message is a fixed
# string (no label description is inserted); only `pair` is read from the
# kernel namespace and must already exist when this cell runs.
messages = [
        # System message: task framing and the domain-specific rules.
        {"role": "system", "content": "You are an expert on semantics and linguistics, with a profound knowledge\
        in Natural Language Processing. You are especially aware of the work by Marneffe et al., classifying\
        different types of contradictions, such as antonyms, negations, numerics, factive, structural, lexical, and world knowledge contradictions. To this end,\
        a contradiction is defined as a mismatch between two statements, such that they cannot possibly both be true. Furthermore, you are aware of the work of classifying entailments and\
        neutral pairs of statements. To this end, an entailment is defined in that two statements are entailed if the truth of the second statement follows from the truth of the first statement.\
        Statements of neutral pairs do neither entail nor contradict each other.\
        It is assumed, that both statements refer to the same fact or event, even if this is not explicitly stated. Premise and Hypothesis are provided in the following\
        format ['Premise', 'Hypothesis'].\
        You have to determine the correct label, that is the correct contradiction type or correct entailment or correct neutral.\
        Furthermore, Revenue, EBIT, EBIT margin and EBITDA are different types of financial measurements, which is why sentence pairs that\
        have different values but use different measures generally don't contradict each other. If you see different years in Premise and Hypothesis, especially within brackets \
        that means that the information in Premise refers to a different time than the one in the Hypothesis, thus not a contradiction. If either Premise or Hypothesis contains only something\
        like \uf0a7 Adj. then that is a neutral pair"},
        # User message: the pair itself (formatted via str.format) and the output-format rules.
        {"role": "user", "content": "Check the statement pair: {statement_pair} and output the correct classification. Pairs which are duplicates are to be seen as entailments.\
         If it is a contradiction, only give the contradiction type. Don't add the word contradiction after the type.\
        Give no explanation, only the classification in its pure string form, don't add brackets or anything else to the string.".format(statement_pair=pair)},
    ]