In [30]:
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from openai import OpenAI
from eval_veracity_prediction import print_evaluation_results
from transformers import AutoTokenizer, AutoModel
import torch

In [31]:
with open("../openAI_apikey.json", "r") as fp:
    openai_api_key = json.load(fp)["openAI_api_key"]
client = OpenAI(api_key=openai_api_key)

#### Read and preprocess quantemp

In [32]:
with open("../data/quantemp/test_claims_quantemp.json", "r") as fp:
    data = json.load(fp)
data = pd.DataFrame(data)
data = data[["query_id", "claim", "taxonomy_label", "doc",  "label"]]

In [33]:
def generate_embeddings(data, embedding_model, with_taxonomy = False, with_evidence = False, use_openai = False,):
    embedded_list = []
    label_list = []
    for idx in tqdm(range(len(data))):
        sample = data.iloc[idx]
        if with_taxonomy == True and with_evidence == True: 
            sentence = f"""
                        [Claim]: {sample["claim"]}
                        [Taxonomy]: {sample["taxonomy_label"]}
                        [Evidence]: {sample["doc"]}
                        """
        elif with_taxonomy == True: 
            sentence = f"""
                        [Claim]: {sample["claim"]}
                        [Taxonomy]: {sample["taxonomy_label"]}
                        """
        elif with_evidence == True: 
            sentence = f"""
                        [Claim]: {sample["claim"]}
                        [Evidence]: {sample["doc"]}
                        """
        else:
            sentence = f"""
                        [Claim]: {sample["claim"]}
                        """

        sentence = sentence.replace("\n", " ")
        if use_openai == True: 
            embedded_claim = client.embeddings.create(input = [sentence], model="text-embedding-3-small", dimensions = 512 ).data[0].embedding
        elif use_openai == False:  
            embedded_claim = embedding_model.encode(sentence)
        label_list.append(sample["label"])
        embedded_list.append(np.array(embedded_claim))
    
    return embedded_list, label_list

In [34]:
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [35]:
data.head()

Unnamed: 0,query_id,claim,taxonomy_label,doc,label
0,0,"""The non-partisan Congressional Budget Office ...",statistical,Republican U.S House candidate Roger Williams ...,Conflicting
1,1,"""More than 50 percent of immigrants from (El S...",statistical,The crisis at the border brought on by thousan...,True
2,2,UK government banned Covid vaccine for childre...,temporal,"""BREAKING: Children in the UK aged 5-11 will n...",False
3,3,"""[In 2014-2015] coverage for the rotavirus vac...",statistical,In its annual performance plan the Free State ...,False
4,4,"In September 2021, the U.K. government announc...",temporal,"In September 2021, several news outlets and we...",True


In [36]:
embedded_list, label_list = generate_embeddings(data, embedding_model, with_taxonomy=False, with_evidence=False, use_openai=True)
# embedded_list, label_list = np.load("../data/openAI_embedding/small_taxonomy.npy"), np.load("../data/openAI_embedding/label_list.npy")
taxonomy_list = data["taxonomy_label"].to_list()
embedded_list = np.array(embedded_list)
print(f"Embedded list shape: {embedded_list.shape}")
tsne = TSNE(n_components=2, random_state=42)
pca = PCA(n_components=2, random_state=42)
tnse_embeddings = tsne.fit_transform(embedded_list)
pca_embeddings = pca.fit_transform(embedded_list)

100%|██████████| 2495/2495 [14:08<00:00,  2.94it/s]


Embedded list shape: (2495, 512)


In [37]:
np.save("../data/openAI_embedding/small_claim.npy", embedded_list)
# np.save("../data/openAI_embedding/label_list.npy", label_list)

In [50]:
# def visualize_results(label_list, taxonomy_list, reduced_embeddings, embedding_type = "t-SNE"):

#     class_names = ["True", "False", "Conflicting"]  # Unique class names in the same order

#     # Define a color mapping for each class
#     color_mapping = {"True": "green", "False": "red", "Conflicting": "blue"}

#     # Create the plot
#     plt.figure(figsize=(8, 6))

#     for taxonomy_name in set(taxonomy_list):

#         # Plot each class with its respective color
#         for class_name in class_names:
#             indices = [i for i, label in enumerate(label_list) if (label == class_name and taxonomy_list[i] == taxonomy_name)]
#             plt.scatter(
#                 [reduced_embeddings[i, 0] for i in indices], 
#                 [reduced_embeddings[i, 1] for i in indices], 
#                 label=class_name,
#                 color=color_mapping[class_name]
#             )

#         # Add title and labels
#         plt.title(f"Sentence Embeddings Visualized Using {embedding_type}")
#         plt.xlabel(f"{embedding_type} Dimension 1")
#         plt.ylabel(f"{embedding_type} Dimension 2")
#         plt.legend()
#         plt.savefig(f"{embedding_type}_{taxonomy_name}.png")
#         plt.grid(True)
#         plt.close()

In [53]:
def visualize_results(label_list, taxonomy_list, reduced_embeddings, embedding_type="t-SNE", plot_type="both"):
    """
    Visualize embeddings with color for class_name and/or marker for taxonomy_name.
    
    Parameters:
    - label_list: List of class names corresponding to embeddings.
    - taxonomy_list: List of taxonomy names corresponding to embeddings.
    - reduced_embeddings: 2D numpy array or list of reduced embeddings (x, y).
    - embedding_type: String indicating embedding method (e.g., "t-SNE").
    - plot_type: String indicating the plot type: "class", "taxonomy", or "both".
    """

    # Define unique class names and taxonomy names
    class_names = ["True", "False", "Conflicting"]
    taxonomy_names = list(set(taxonomy_list))
    
    # Define a color mapping for class_name
    color_mapping_class = {"True": "green", "False": "red", "Conflicting": "blue"}
    marker_mapping_class = {"True": "o", "False": "x", "Conflicting": "s"}
    
    # Define a color and marker mapping for taxonomy_name
    color_mapping_taxonomy = {
        name: color for name, color in zip(taxonomy_names, ['red', 'blue', 'green', 'purple'])
    }
    
    # Create the plot
    plt.figure(figsize=(10, 8))
    
    if plot_type == "class":
        # Plot only by class_name (color-coded)
        for class_name in class_names:
            indices = [i for i, label in enumerate(label_list) if label == class_name]
            plt.scatter(
                [reduced_embeddings[i, 0] for i in indices],
                [reduced_embeddings[i, 1] for i in indices],
                color=color_mapping_class[class_name],
                label=class_name,
                marker=marker_mapping_class[class_name]
            )
    
    elif plot_type == "taxonomy":
        # Plot only by taxonomy_name (color-coded and marker-coded)
        for taxonomy_name in taxonomy_names:
            indices = [i for i, taxonomy in enumerate(taxonomy_list) if taxonomy == taxonomy_name]
            plt.scatter(
                [reduced_embeddings[i, 0] for i in indices],
                [reduced_embeddings[i, 1] for i in indices],
                color=color_mapping_taxonomy[taxonomy_name],
                label=taxonomy_name
            )
    
    elif plot_type == "both":
        # Plot by both class_name (color-coded) and taxonomy_name (marker-coded)
        for taxonomy_name in taxonomy_names:
            for class_name in class_names:
                indices = [
                    i for i, label in enumerate(label_list) 
                    if label == class_name and taxonomy_list[i] == taxonomy_name
                ]
                facecolors = "none" if marker_mapping_class[class_name] == "s" else color_mapping_taxonomy[taxonomy_name]
                plt.scatter(
                    [reduced_embeddings[i, 0] for i in indices],
                    [reduced_embeddings[i, 1] for i in indices],
                    edgecolors=color_mapping_taxonomy[taxonomy_name],
                    marker=marker_mapping_class[class_name],
                    facecolors=facecolors,
                    label=f"{taxonomy_name} - {class_name}"
                )
    
    else:
        raise ValueError("Invalid plot_type. Choose from 'class', 'taxonomy', or 'both'.")
    
    # Add title and labels
    plt.title(f"Sentence Embeddings Visualized Using {embedding_type}")
    plt.xlabel(f"{embedding_type} Dimension 1")
    plt.ylabel(f"{embedding_type} Dimension 2")
    plt.legend(loc="best", fontsize='small', bbox_to_anchor=(1.05, 1))
    plt.grid(True)
    plt.tight_layout()
    
    # Save and show the plot
    plt.savefig(f"{embedding_type}_{plot_type}_visualization.png")
    plt.close()


#### Visualize the TSNE results

In [51]:
visualize_results(label_list, taxonomy_list, tnse_embeddings, embedding_type="t-SNE")

#### Try it with PCA algorithm

In [54]:
visualize_results(label_list, taxonomy_list, pca_embeddings, embedding_type="PCA")

  plt.scatter(
  plt.scatter(
  plt.scatter(
  plt.scatter(


#### Try training a classifier on top of them without dimensionality reduction

In [12]:
X, y = embedded_list, np.array(label_list)
# enc = OneHotEncoder(sparse_output = False)
# y_transformed = enc.fit_transform(y.reshape(y.shape[0], 1))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(embedded_list, y, test_size=0.2)
print(f"Shape of X_train, X_test, y_train, and y_test are: {X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}")

Shape of X_train, X_test, y_train, and y_test are: (1996, 512), (499, 512), (1996,), (499,)


In [14]:
classfier = RandomForestClassifier()
classfier.fit(X_train, y_train)
y_pred = classfier.predict(X_test)
print_evaluation_results(y_pred, y_test)

              precision    recall  f1-score   support

       false     0.6534    0.9394    0.7707       297
        true     0.3529    0.0714    0.1188        84
 conflicting     0.3818    0.1780    0.2428       118

    accuracy                         0.6132       499
   macro avg     0.4627    0.3963    0.3774       499
weighted avg     0.5386    0.6132    0.5361       499

[[279   2  16]
 [ 60   6  18]
 [ 88   9  21]]



In [15]:
classfier = XGBClassifier()
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
classfier.fit(X_train, y_train_encoded)
y_pred = classfier.predict(X_test)
y_pred_inv = le.inverse_transform(y_pred)
print_evaluation_results(y_pred_inv, y_test)

              precision    recall  f1-score   support

       false     0.6929    0.8889    0.7788       297
        true     0.3704    0.2381    0.2899        84
 conflicting     0.4219    0.2288    0.2967       118

    accuracy                         0.6232       499
   macro avg     0.4951    0.4519    0.4551       499
weighted avg     0.5745    0.6232    0.5825       499

[[264  15  18]
 [ 45  20  19]
 [ 72  19  27]]



#### Exploratory analysis of dependency parser. 
We hypothesize that dependency parsing might be a good candidate to find correlation between sentence complexity and impact of decomposition. 

In [10]:
import spacy
from spacy import displacy

In [None]:
idx = 0
sample = data.iloc[idx]
print(sample["claim"])

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(str(sample["claim"]))

In [None]:
plt.figure(figsize = (10,7))
displacy.render(doc, style = "dep", options = {"compact": True, "distance": 90})

In [17]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
tagger = SequenceTagger.load("ner")  # Loads a pretrained NER model

# Create a sentence
sentence = Sentence(sample["claim"])

# Predict entities (spans)
tagger.predict(sentence)

# Print detected spans
for entity in sentence.get_spans('ner'):
    print(f"Span: {entity.text}, Label: {entity.tag}")

### Extract latent embeddings of the queries and training data query of different complexities from different layers of the LM

In [27]:
# Replace with the specific LLaMA model you are using
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)  # Enable hidden states

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.77it/s]


In [28]:
input_text = ["The day the earth stood still.", "I am a good boy.", "Hello darkness my old friend. I have come to talk to you again",
              ]
# Tokenize the batch (pad to the longest sequence and create tensors)
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(input_text, padding=True,  return_tensors="pt")


# Forward pass
with torch.no_grad():  # No gradient computation required
    outputs = model(**inputs)

# Extract hidden states
hidden_states = outputs.hidden_states  # A tuple containing hidden states for each layer

In [29]:
hidden_states = torch.stack(hidden_states)
print(hidden_states.shape)

torch.Size([33, 3, 15, 4096])
