#### Importing necessary libraries

In [22]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
import torch
import faiss
import ast
from transformers import BertTokenizer, BertModel
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity



### Functions for RAG

In [110]:


# # Example of a free text corpus
# corpus_text = """
# Audi presens can help prepare for and in some cases help prevent collisions.Audi's progress is amplifying your instincts. 
# Get an exceptional offer at your local Audi dealer. As a man speeds down a country road in his Audi Q7, its pre-sense safety technology responds to a deer leaping suddenly in front of his car. 
# Without missing a beat, the vehicle skids to a stop before the hair on the startled man's arms has time to rise. The Audi pre-sense is now available in the Audi A4.
# """

def split_text_into_chunks(text, chunk_size=80):
    """
    Splits the input text into smaller chunks of specified size.

    Args:
    text (str): The text to be split into chunks.
    chunk_size (int, optional): The size of each chunk. Defaults to 80.

    Returns:
    list: A list containing the text chunks.
    
    This function takes a string `text` and splits it into chunks of size `chunk_size`.
    It iterates through the text, appending each chunk of the specified size to the 
    `chunks` list. The result is a list of text chunks, where each chunk is at most 
    `chunk_size` characters long.
    """
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i+chunk_size])
    return chunks

def generate_embeddings(chunks):
    """
    Generates embeddings for each chunk of text using a Sentence-BERT model.

    Args:
    chunks (list): List of strings where each string represents a chunk of text.

    Returns:
    numpy.ndarray: 2D array of embeddings where each row corresponds to the embedding of a chunk.

    This function initializes a Sentence-BERT model ('paraphrase-MiniLM-L6-v2')
    and computes embeddings for each chunk of text in the input list `chunks`. The embeddings
    are computed using the Sentence-BERT model and stored in a list. These embeddings are then
    stacked vertically to form a 2D numpy array. Each row in the array represents the embedding
    vector of a chunk of text.
    """
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Generate embeddings for each chunk
    embeddings = []
    for chunk in chunks:
        chunk_embedding = model.encode(chunk)
        embeddings.append(chunk_embedding)
    
    # Stack embeddings into a numpy array
    embeddings = np.vstack(embeddings)
    return embeddings

def build_faiss_index(embeddings, dimension=384):
    """
    Builds a FAISS index for fast similarity search using the given embeddings.

    Args:
    embeddings (numpy.ndarray): 2D array of embeddings where each row represents the embedding vector of a chunk of text.
    dimension (int, optional): Dimensionality of the embedding vectors. Default is 384.

    Returns:
    faiss.IndexFlatL2: FAISS index object configured with L2 distance metric, populated with the provided embeddings.

    This function initializes a FAISS index with the specified dimensionality and adds the given embeddings to it.
    The FAISS index is optimized for efficient similarity search based on L2 distance metric. It returns the initialized
    FAISS index object ready for use in similarity searches.
    """
    # Build FAISS index
    index = faiss.IndexFlatL2(dimension)  # L2 distance metric
    index.add(embeddings)
    return index

def retrieve_documents(user_question, index, chunks):
    """
    Retrieves relevant documents (chunks of text) based on the user question using a pre-built FAISS index.

    Args:
    user_question (str): User query for retrieving relevant documents.
    index (faiss.IndexFlatL2): Pre-built FAISS index object for similarity search.
    chunks (list): List of text chunks corresponding to the embeddings used for indexing.

    Returns:
    list: List of relevant text chunks retrieved based on the similarity search using FAISS.

    This function encodes the user question using a Sentence-BERT model, then performs a vector search using
    FAISS to retrieve the closest embeddings to the query. It returns the corresponding text chunks that are
    most relevant to the user question based on the similarity scores obtained from FAISS.
    """
    
    # Generate embedding for user question (simulated)
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    query_embedding = model.encode(user_question)
    # print('Query embedding:', query_embedding)
    # Perform vector search using FAISS
    query_embedding = query_embedding.astype(np.float32).reshape(1, -1)  # Convert to 2D array 
    # print('Query embedding:', query_embedding)  

    # print(f"Query embedding shape: {query_embedding.shape}")
    # print(f"Expected dimension (d): {index.d}")

    D, I = index.search(query_embedding, 5)  # Retrieving top 5 closest embeddings
    
    # Retrieve relevant chunks based on FAISS results
    relevant_docs = [chunks[i] for i in I.flatten()]
    # print('relevant docs:', relevant_docs)
    return relevant_docs

def semantic_search(user_question, relevant_docs):
    """
    Performs semantic search to determine if there are relevant documents (chunks of text) that match the user question.

    Args:
    user_question (str): User query to find relevant documents.
    relevant_docs (list): List of relevant text chunks retrieved from the FAISS index.

    Returns:
    str: "Yes" if there are relevant documents with cosine similarity above a threshold, otherwise "No".

    This function encodes the user question and relevant documents using a Sentence-BERT model,
    calculates cosine similarity scores, and determines if any relevant documents match the user question
    based on a predefined similarity threshold. It returns "Yes" if a relevant match is found, otherwise "No".
    """
    # Initialize Sentence-BERT model (example model)
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    
    # Encode user question
    query_embedding = model.encode(user_question, convert_to_tensor=True)
    # Encode relevant documents
    doc_embeddings = model.encode(relevant_docs, convert_to_tensor=True)
    
    # Calculate cosine similarity between user question and relevant documents
    cos_sim_scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)
    # print(cos_sim_scores)
    # Select documents with highest similarity scores
    threshold = 0.25  # Adjust as needed
    # filtered_docs = [relevant_docs[i] for i in range(len(relevant_docs)) if cos_sim_scores[0][i] > threshold]
    if cos_sim_scores.max() > 0.25:  # You can adjust this threshold as needed
        return "Yes"
    else:
        return "No"
    
    

In [111]:
df = pd.read_excel('data/long_form_data1.xlsx')
df.shape

(150, 5)

In [112]:
df['Labels'] = df['Labels'].apply(lambda x: ast.literal_eval(x))


In [113]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


# Predefined set of questions
questions = [
    "Is there a call to go online (e.g., shop online, visit the Web)?",
    "Is there online contact information provided (e.g., URL, website)?",
    "Is there a visual or verbal call to purchase (e.g., buy now, order now)?",
    "Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?",
    "Is there an incentive to buy (e.g., a discount, a coupon, a sale or 'limited time offer')?",
    "Is there offline contact information provided (e.g., phone, mail, store location)?",
    "Is there mention of something free?",
    "Does the ad mention at least one specific product or service (e.g., model, type, item)?",
    "Is there any verbal or visual mention of the price?",
    "Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?",
    "Does the ad show the brand or trademark exactly once at the end of the ad?",
    "Is the ad intended to affect the viewer emotionally, either with positive emotion or negative emotion?",
    "Does the ad give you a positive feeling about the brand?",
    "Does the ad have a story arc, with a beginning and an end?",
    "Does the ad have a reversal of fortune, where something changes for the better or worse?",
    "Does the ad have relatable characters?",
    "Is the ad creative/clever?",
    "Is the ad intended to be funny?",
    "Does this ad provide sensory stimulation?",
    "Is the ad visually pleasing?",
    "Does the ad have cute elements like animals, babies, animated characters, etc?"
]

question_embeddings = []
for question in questions:
    encoded_input = tokenizer(question, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
        embeddings = output.last_hidden_state.mean(dim=1).squeeze().numpy()
    question_embeddings.append(embeddings)
#( Need to remove embeddings from the list to save memory as not using this embeddings)
# Convert to numpy array for consistency
# query_embedding = np.array(question_embeddings)

In [114]:
# Function to process a single corpus
def process_corpus(corpus, questions):
    """
    Processes a corpus of text, performs semantic search for each question, and returns answers.

    Args:
    corpus (str): Full text corpus to process.
    question_embeddings (list): List of embeddings for each question.
    questions (list): List of questions to answer based on the corpus.

    Returns:
    dict: A dictionary where keys are questions and values are answers ("Yes" or "No").

    This function splits the corpus into text chunks, generates embeddings for each chunk,
    builds a FAISS index for efficient retrieval, and performs semantic search to answer
    each question based on the relevant chunks of text. It returns a dictionary of questions
    mapped to their corresponding answers.
    """
    chunks = split_text_into_chunks(corpus)
    embeddings = generate_embeddings(chunks)
    index = build_faiss_index(embeddings)
    results = {}

    for question in questions:
        relevant_docs = retrieve_documents(question, index, chunks)
        answer = semantic_search(question, relevant_docs)
        results[question] = answer

    return results

In [115]:
# query_embedding.reshape(1, -1).astype(np.float32)

In [116]:
# Initialize lists to store evaluation results
speech_list = []
predicted_labels_list = []
recall_list = []
precision_list = []
f1_score_list = []
agreement_percentage_list = []
accuracy_list = []
roc_auc_list = []

In [118]:
# Initializing variable to accumulate total agreements
total_agreement = 0

# Iterating over each row in the dataframe
for index, row in df.iterrows():
    user_input_text = row['Speech'] + row['description']
    labels = row['Labels']

    # Process the corpus to get answers for all questions
    answers = process_corpus(user_input_text, questions)
    print("Operation Done: ", index)
    # Convert answers to binary format for evaluation
    predicted_answers = [1 if answers[question] == "Yes" else 0 for question in questions]

    # Calculate evaluation metrics
    recall = recall_score(labels, predicted_answers, average='binary')
    precision = precision_score(labels, predicted_answers, average='binary')
    f1 = f1_score(labels, predicted_answers, average='binary')
    accuracy = accuracy_score(labels, predicted_answers)
    roc_auc = roc_auc_score(labels, predicted_answers)

    # Append results to lists
    speech_list.append(user_input_text)
    predicted_labels_list.append(predicted_answers)
    recall_list.append(recall)
    precision_list.append(precision)
    f1_score_list.append(f1)
    accuracy_list.append(accuracy)
    roc_auc_list.append(roc_auc)

    # Calculate agreement percentage
    agreement_count = sum([1 for true, pred in zip(labels, predicted_answers) if true == pred])
    total_agreement += agreement_count
    agreement_percentage = (agreement_count / len(labels)) * 100
    agreement_percentage_list.append(agreement_percentage)


Operation Done:  0
Operation Done:  1
Operation Done:  2
Operation Done:  3
Operation Done:  4
Operation Done:  5
Operation Done:  6
Operation Done:  7
Operation Done:  8


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Operation Done:  9
Operation Done:  10
Operation Done:  11
Operation Done:  12
Operation Done:  13
Operation Done:  14
Operation Done:  15
Operation Done:  16
Operation Done:  17
Operation Done:  18
Operation Done:  19
Operation Done:  20
Operation Done:  21
Operation Done:  22
Operation Done:  23
Operation Done:  24
Operation Done:  25
Operation Done:  26
Operation Done:  27
Operation Done:  28
Operation Done:  29
Operation Done:  30
Operation Done:  31
Operation Done:  32
Operation Done:  33
Operation Done:  34
Operation Done:  35
Operation Done:  36
Operation Done:  37
Operation Done:  38
Operation Done:  39
Operation Done:  40
Operation Done:  41
Operation Done:  42
Operation Done:  43
Operation Done:  44
Operation Done:  45
Operation Done:  46
Operation Done:  47
Operation Done:  48
Operation Done:  49
Operation Done:  50
Operation Done:  51
Operation Done:  52
Operation Done:  53
Operation Done:  54
Operation Done:  55
Operation Done:  56
Operation Done:  57
Operation Done:  58
O

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Operation Done:  83
Operation Done:  84
Operation Done:  85
Operation Done:  86
Operation Done:  87
Operation Done:  88
Operation Done:  89
Operation Done:  90
Operation Done:  91
Operation Done:  92
Operation Done:  93
Operation Done:  94
Operation Done:  95
Operation Done:  96
Operation Done:  97
Operation Done:  98
Operation Done:  99
Operation Done:  100
Operation Done:  101
Operation Done:  102
Operation Done:  103
Operation Done:  104
Operation Done:  105
Operation Done:  106
Operation Done:  107
Operation Done:  108
Operation Done:  109
Operation Done:  110
Operation Done:  111
Operation Done:  112
Operation Done:  113
Operation Done:  114
Operation Done:  115
Operation Done:  116
Operation Done:  117
Operation Done:  118
Operation Done:  119
Operation Done:  120
Operation Done:  121
Operation Done:  122
Operation Done:  123
Operation Done:  124
Operation Done:  125
Operation Done:  126
Operation Done:  127
Operation Done:  128
Operation Done:  129
Operation Done:  130
Operation

In [119]:
# Create a new dataframe with results
results_df = pd.DataFrame({
    'Speech': speech_list,
    'Predicted Labels': predicted_labels_list,
    'Recall': recall_list,
    'Precision': precision_list,
    'F1 Score': f1_score_list,
    'Accuracy': accuracy_list,
    'ROC AUC': roc_auc_list,
    'Agreement Percentage': agreement_percentage_list

})


In [122]:
results_df.head()

Unnamed: 0,Speech,Predicted Labels,Recall,Precision,F1 Score,Accuracy,ROC AUC,Agreement Percentage
0,It's another pure gray morning. Don't know wha...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0.0,0.0,0.0,0.47619,0.454545,47.619048
1,The end of civilization is upon us. Hold your ...,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",0.375,0.428571,0.4,0.571429,0.533654,57.142857
2,Audi presens can help prepare for and in some ...,"[0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, ...",0.6,0.6,0.6,0.619048,0.618182,61.904762
3,The new Honda Odyssey has tons of available sm...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.272727,0.6,0.375,0.52381,0.536364,52.380952
4,Hi guys. So this is the all new Chevy Equinox....,"[0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...",0.3,0.375,0.333333,0.428571,0.422727,42.857143


In [121]:
# results_df.to_clipboard()

In [123]:
label_df = pd.DataFrame(results_df['Predicted Labels'].tolist(), columns=questions)


In [124]:
score_df = pd.concat([results_df, label_df], axis=1)
score_df.shape


(150, 29)

In [125]:
score_df.to_excel('results/RAG_approach.xlsx', index=False)