In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from scipy.stats import hmean
from tqdm import tqdm
import pandas as pd
import re
import time
import os
import openai
from openai.error import RateLimitError, APIConnectionError, InvalidRequestError

#### GPT-4 configuration

In [None]:
# Azure OpenAI connection settings for the legacy (pre-1.0) `openai` SDK.
azure_endpoint = "https://rumi-gpt4.openai.azure.com/"
# Read the secret from the environment so it never has to be typed into (and
# saved with) the notebook. Falls back to the previous empty value when unset.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-02-15-preview"

if not api_key:
    # Without a key every request fails; surface that here rather than as a
    # stream of per-request errors later in the notebook.
    print("Warning: AZURE_OPENAI_API_KEY is not set; GPT-4 requests will fail to authenticate.")

# Module-level configuration read by subsequent openai.* calls.
openai.api_type = "azure"
openai.api_base = azure_endpoint
openai.api_key = api_key
openai.api_version = api_version

In [None]:
# Embedding model used to retrieve few-shot examples. `model` and `tokenizer`
# are module-level globals that functions in later cells read directly.
model_name = "jinaai/jina-embeddings-v2-base-en"
# NOTE(review): trust_remote_code=True lets transformers run Python code shipped
# in the model repository — confirm the repo (ideally a pinned revision) is trusted.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate embeddings for a given text
def get_embedding(model, tokenizer, text, max_length=8192):
    """
    Generates an embedding for the given text by mean-pooling the model's last hidden state.

    Padding positions are excluded from the average through the tokenizer's
    attention mask, so the embedding of a text does not depend on how much
    padding surrounds it (this matters as soon as `text` is a list of strings
    of different lengths). For a single, unpadded text the result is the plain
    mean over all tokens.

    Parameters:
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing a `last_hidden_state` tensor.
        tokenizer: Callable that returns a mapping of PyTorch tensors.
        text (str | list[str]): Text to embed.
        max_length (int): Truncation limit forwarded to the tokenizer.

    Returns:
        list: The pooled embedding after `.squeeze().tolist()`; a flat list of
        floats when a single text is passed.
    """
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state
        if "attention_mask" in tokens:
            # Zero out padded positions and divide by the number of real tokens.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # clamp avoids a division by zero for a (degenerate) all-padding row.
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            # No mask available: fall back to averaging every position.
            pooled = hidden.mean(dim=1)
    return pooled.squeeze().tolist()

In [None]:
def clean_text(text):
    """
    Strips every character that is not an ASCII letter, digit or whitespace,
    then collapses whitespace runs to single spaces and trims both ends.
    Case is left untouched.
    """
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

def clean_text_with_tags(text):
    """
    Normalises text for comparison: removes angle-bracket tags (e.g. <ata>),
    drops every character that is not an ASCII letter, digit or whitespace,
    collapses whitespace runs to one space, trims, and lower-cases the result.
    """
    normalised = text
    substitutions = (
        (r'<\/?.*?>', ''),          # tags such as <ata> ... </ata>
        (r'[^A-Za-z0-9\s]', ''),    # punctuation and other symbols
        (r'\s+', ' '),              # whitespace runs -> single space
    )
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    return normalised.strip().lower()

In [None]:
def load_upra_data(datapath, debugging=False):
    """
    Reads the annotated spreadsheet at `datapath` and returns one dict per row.

    Parameters:
        datapath (str): Path of the Excel file to read.
        debugging (bool): When True, only the first five rows are kept.

    Returns:
        list[dict]: Records carrying the prompt-ready "raw_chunk" text, the
        fields copied from the sheet, and the optional doc_id / chunk_id /
        original_index columns (None when the sheet does not have them).
    """
    frame = pd.read_excel(datapath)
    if debugging:
        print("Debugging Mode: Using few document samples only")
        frame = frame.head(5)

    def build_record(row):
        # Per the original annotation: the review sentence carries a single
        # tagged atypical aspect and 'A_prime' a single label. The full review
        # (row['review']) is the alternative source for "raw_chunk"/"review".
        return {
            "raw_chunk": f"U: {row['user_profile']}\n\nR: {row['reformulated_review_sentence']}\n\nOutput:\n\n{row['A_prime']}",
            "user_profile": row['user_profile'],
            "name": row['name'],
            "business_id": row['business_id'],
            "review": row['reformulated_review_sentence'],
            "true_strong_weak": row['true_strong_weak'],
            "abs_true_strong_weak": row['abs_true_strong_weak'],
            "output": row['A_prime'],
            "doc_id": row.get("doc_id", None),
            "chunk_id": row.get("chunk_id", None),
            "original_index": row.get("original_index", None),
        }

    # Let tqdm drive the iteration instead of updating the bar by hand.
    rows_with_progress = tqdm(frame.iterrows(), total=len(frame), desc="Processing user profiles")
    return [build_record(row) for _, row in rows_with_progress]

def precompute_embeddings_and_index(data, model, tokenizer):
    """
    Adds embedding vectors to every record (in place) and groups the records by
    the label their 'output' string ends with.

    Each record gains 'vector_user_profile' and 'vector_abs_true_strong_weak'.
    Records whose 'output' ends with none of the four known labels are embedded
    but left out of the returned index.

    Returns:
        dict[str, list[dict]]: Keys "None", "Low", "Medium", "High".
    """
    # Closing fragment of the 'output' string -> bucket it belongs to.
    suffix_table = (
        ('"None")', "None"),
        ('"Low")', "Low"),
        ('"Medium")', "Medium"),
        ('"High")', "High"),
    )
    buckets = {label: [] for _, label in suffix_table}

    def embed_field(record, field):
        return get_embedding(model, tokenizer, clean_text_with_tags(record[field]))

    for record in data:
        record['vector_user_profile'] = embed_field(record, 'user_profile')
        record['vector_abs_true_strong_weak'] = embed_field(record, 'abs_true_strong_weak')

        output_text = record['output']
        bucket_name = next(
            (label for suffix, label in suffix_table if output_text.endswith(suffix)),
            None,
        )
        if bucket_name is not None:
            buckets[bucket_name].append(record)

    return buckets

def calculate_harmonic_mean_and_top_k(test_query, train_data, top_k=1):
    """
    Retrieves, for every label group, the `top_k` training examples most similar to the test query.

    Similarity is the harmonic mean of two cosine similarities: user profile vs.
    user profile and 'abs_true_strong_weak' vs. 'abs_true_strong_weak'. The test
    query is embedded with the module-level `model` and `tokenizer`; training
    examples must already carry 'vector_user_profile' and
    'vector_abs_true_strong_weak'.

    Parameters:
        test_query (dict): Record with 'user_profile', 'abs_true_strong_weak'
            and 'true_strong_weak'.
        train_data (dict[str, list[dict]]): Label -> candidate examples.
        top_k (int): Number of examples to take per label.

    Returns:
        list[dict]: Selected examples, grouped in `train_data` order and sorted
        best-first within each label. Empty label groups contribute nothing.
    """
    test_user_profile_embedding = get_embedding(model, tokenizer, clean_text_with_tags(test_query['user_profile']))
    test_abs_true_strong_weak_embedding = get_embedding(model, tokenizer, clean_text_with_tags(test_query['abs_true_strong_weak']))
    print(f"Test-True Strong Weak: {test_query['true_strong_weak']}")
    few_shot_examples = []

    for label, examples in train_data.items():
        if not examples:
            continue

        print(f"Processing Label: {label}")
        # The harmonic mean is only defined for non-negative values: depending
        # on the SciPy version a negative cosine similarity raises or yields
        # NaN, and NaN sorts last in argsort, i.e. would be picked as the "best"
        # match. Clipping at 0 makes such candidates score 0 instead.
        user_profile_similarities = cosine_similarity(
            [test_user_profile_embedding],
            [doc['vector_user_profile'] for doc in examples]
        )[0].clip(min=0.0)
        abs_true_strong_weak_similarities = cosine_similarity(
            [test_abs_true_strong_weak_embedding],
            [doc['vector_abs_true_strong_weak'] for doc in examples]
        )[0].clip(min=0.0)
        harmonic_mean_scores = hmean([user_profile_similarities, abs_true_strong_weak_similarities], axis=0)

        # Best-first ordering, then cut. Slicing after the reversal keeps
        # top_k=0 meaning "no examples" ([-0:] would select every example).
        top_indices = harmonic_mean_scores.argsort()[::-1][:max(top_k, 0)]
        top_examples = [examples[i] for i in top_indices]
        few_shot_examples.extend(top_examples)

        print(f"Top {top_k} Examples for Label '{label}':")
        for idx, example in enumerate(top_examples):
            print(f"Example {idx + 1}: {example['raw_chunk']} \n Training-True Strong Weak: {example['true_strong_weak']} (Score: {harmonic_mean_scores[top_indices[idx]]})\n")

    return few_shot_examples

def process_results_to_excel(results, output_file):
    """
    Process the results and save them to an Excel file.

    Each kept entry's 'predicted_output' is split into an 'A_prime' column and
    an 'Explanation' column; the raw 'predicted_output' column is not exported.
    An empty `results` list produces a sheet with headers only.

    Parameters:
        results (list): List of dictionaries with prediction results.
        output_file (str): Path to save the output Excel file.
    """
    required_columns = ['user_profile', 'name', 'business_id', 'review', 'predicted_output']
    corrected_results = []

    # Filter out any entries with unexpected structure
    for r in results:
        if isinstance(r, dict) and all(key in r for key in required_columns):
            corrected_results.append(r)
        else:
            print(f"Skipping incorrect entry: {r}")

    results_df = pd.DataFrame(corrected_results, columns=required_columns)

    # Function to parse predicted_output into A_prime and Explanation
    def parse_predicted_output(result):
        if isinstance(result, str):
            if '\n\nExplanation: ' in result:
                parts = result.split('\n\nExplanation: ', 1)
                return [parts[0], parts[1]]
            else:
                return [result, '']
        elif isinstance(result, list):
            # If it's a list (e.g., multiple A' values), handle
            a_prime = '; '.join(str(item) for item in result)
            return [a_prime, '']
        else:
            # Handle unexpected formats
            return [str(result), '']

    # Build both columns from plain lists. Assigning the result of
    # Series.apply(pd.Series) to two columns fails with "Columns must be same
    # length as key" when there are no rows, which happens whenever every
    # request failed.
    parsed = [parse_predicted_output(value) for value in results_df['predicted_output']]
    export_df = results_df.drop(columns=['predicted_output']).assign(
        A_prime=[pair[0] for pair in parsed],
        Explanation=[pair[1] for pair in parsed],
    )

    export_df.to_excel(output_file, index=False)
    print(f"Results successfully exported to {output_file}")

In [None]:
def load_prompt_params(user_profile, review):
    """
    Builds the query-specific tail of the prompt: the quoted user profile (U),
    the quoted review (R) and an open "Output:" slot for the model to fill.
    """
    # Written line by line so the indentation and trailing whitespace of the
    # prompt are explicit rather than hidden inside a triple-quoted literal.
    segments = [
        "",
        "    ",
        f"    U: '{user_profile}'",
        "    ",
        f"    R: '{review}' ",
        "",
        "    Output:",
        "    ",
    ]
    return "\n".join(segments)

In [None]:
def generate_model_response(prompt, prompt_params, test_instance, max_retries=3, rate_limit_wait=150):
    """
    Sends the prompt to the GPT-4 deployment and packages the reply with the test instance's fields.

    A rate-limited request is retried after waiting; previously the function
    waited and then gave up, so the instance silently vanished from the results.

    Parameters:
        prompt (str): Instruction text, sent as the first user message.
        prompt_params (str): Query-specific text, sent as the second user message.
        test_instance (dict): Record providing 'user_profile', 'name',
            'business_id', 'review' and 'abs_true_strong_weak'.
        max_retries (int): Maximum number of attempts when rate-limited
            (at least one attempt is always made).
        rate_limit_wait (int | float): Seconds to sleep after a rate-limit error.

    Returns:
        dict | None: The instance fields plus 'predicted_output', or None when
        the request could not be completed. Errors are printed, never raised.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": prompt},
        {"role": "user", "content": prompt_params}
    ]

    formatted_messages = "\n".join([f"{message['role']}: {message['content']}" for message in messages])
    print("Messages being sent to GPT:\n", formatted_messages)

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                messages=messages,
                engine="gpt-4",
                temperature=0,
                frequency_penalty=0,
                presence_penalty=0,
                max_tokens=100
            )
            result_content = response['choices'][0]['message']['content']
            return {
                "user_profile": test_instance['user_profile'],
                "name": test_instance['name'],
                "business_id": test_instance['business_id'],
                "review": test_instance['review'],
                "abs_true_strong_weak": test_instance['abs_true_strong_weak'],
                "predicted_output": result_content,
            }
        except RateLimitError as e:
            print(f"Error encountered: {e}")
            # Back off after every rate-limit error, including the last one, so
            # the next instance does not immediately hit the same limit.
            time.sleep(rate_limit_wait)
            if attempt < attempts:
                print(f"Retrying after rate limit (attempt {attempt + 1} of {attempts})...")
        except (InvalidRequestError, APIConnectionError) as e:
            # Not retried: same outcome as before for these error types.
            print(f"Error encountered: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort: one bad instance must not abort the run.
            print("An unexpected error occurred:", e)
            return None
    return None

### Zero-Shot

In [None]:
def load_prompt(prompt_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/mapping/prompt_for_matching_restaurant_review_sentences_to_user_profiles_zs.txt'):
    """
    Reads the zero-shot prompt template and returns its text unchanged.

    Parameters:
        prompt_path (str): Location of the template file. Defaults to the
            original hard-coded path so existing calls keep working.

    Returns:
        str: The file's full contents (no placeholder substitution is done).
    """
    # Explicit encoding so the prompt reads the same regardless of OS locale.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        return file.read()

def zero_shot_inference(data):
    """
    Runs the zero-shot prompt once per record and collects the successful responses.

    Records for which the model call yields nothing are skipped; each kept
    response has its 'predicted_output' printed as it arrives.
    """
    prompt = load_prompt()
    collected = []

    for test_instance in data:
        query_block = load_prompt_params(test_instance['user_profile'], test_instance['review'])
        response = generate_model_response(prompt, query_block, test_instance)
        if not response:
            continue
        print(response['predicted_output'])
        collected.append(response)

    return collected

### Fixed Few-Shot with CoT

In [None]:
def load_prompt_cot(
    prompt_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/mapping/prompt_for_matching_restaurant_review_sentences_to_user_profiles.txt',
    few_shot_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/mapping/few_shot_examples_restaurants.txt',
):
    """
    Reads the prompt template and the fixed few-shot examples, and inserts the examples into the template.

    Parameters:
        prompt_path (str): Template file containing a `{few_shot_examples}`
            placeholder. Defaults to the original hard-coded path.
        few_shot_path (str): File holding the fixed few-shot examples.
            Defaults to the original hard-coded path.

    Returns:
        str: The template with `{few_shot_examples}` replaced via `str.format`.
    """
    # The two reads are independent, so the files are opened one after the
    # other instead of nesting one `with` inside the other.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        prompt = file.read()
    with open(few_shot_path, 'r', encoding='utf-8') as fewshot_file:
        few_shot_examples = fewshot_file.read()

    # NOTE(review): str.format treats every brace in the template as a
    # placeholder; literal braces in the template file must be doubled.
    return prompt.format(few_shot_examples=few_shot_examples)

def few_shot_CoT_inference(data):
    """
    Runs the fixed few-shot (chain-of-thought) prompt once per record and
    collects the successful responses.

    Records for which the model call yields nothing are skipped; each kept
    response has its 'predicted_output' printed as it arrives.
    """
    prompt = load_prompt_cot()
    collected = []

    for test_instance in data:
        query_block = load_prompt_params(test_instance['user_profile'], test_instance['review'])
        response = generate_model_response(prompt, query_block, test_instance)
        if not response:
            continue
        print(response['predicted_output'])
        collected.append(response)

    return collected

### Few-Shot with RAG

In [None]:
def load_prompt_rag(few_shot_examples, prompt_path='/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/mapping/prompt_for_matching_restaurant_review_sentences_to_user_profiles.txt'):
    """
    Reads the prompt template and inserts the retrieved few-shot examples into it.

    Parameters:
        few_shot_examples (str): Already formatted examples to substitute for
            the template's `{few_shot_examples}` placeholder.
        prompt_path (str): Location of the template file. Defaults to the
            original hard-coded path so existing calls keep working.

    Returns:
        str: The template with the placeholder replaced via `str.format`.
    """
    # Explicit encoding so the prompt reads the same regardless of OS locale.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        prompt = file.read()

    # Braces inside `few_shot_examples` are safe (it is a substituted value);
    # literal braces in the template file itself would need to be doubled.
    return prompt.format(few_shot_examples=few_shot_examples)

def loocv_inference(data, model):
    """
    Performs leave-one-out cross-validation (LOOCV) inference with retrieved few-shot examples.

    For each test instance, every record sharing its (normalised) user profile
    OR its business_id is removed from the candidate pool before retrieval, to
    limit leakage between the test instance and its few-shot examples.
    Embeddings are computed once for the whole dataset up front.

    Parameters:
        data (list[dict]): Records as produced by `load_upra_data`; they are
            enriched in place with embedding vectors.
        model: Embedding model forwarded to `precompute_embeddings_and_index`.
            The tokenizer is taken from the module-level `tokenizer`.

    Returns:
        list[dict]: One response per instance for which the model call succeeded.
    """
    system_labels = []

    # Precompute embeddings and index training data before the loop
    print("Precomputing embeddings for the entire dataset...")
    train_data_by_label = precompute_embeddings_and_index(data, model, tokenizer)

    # Normalise the leakage keys once per record. Doing it inside the filter
    # below would re-run the regex clean-up for every (test, train) pair.
    keyed_train_data_by_label = {
        label: [
            (clean_text_with_tags(item['user_profile']), item['business_id'].strip(), item)
            for item in items
        ]
        for label, items in train_data_by_label.items()
    }

    # Iterate over each record to perform LOOCV
    for i, test_instance in enumerate(data):
        print(f"\nProcessing Test Instance {i + 1} of {len(data)}...")

        # Keys identifying the current test instance
        test_profile = clean_text_with_tags(test_instance['user_profile'])
        test_business_id = test_instance['business_id'].strip()

        # Keep only records that differ in BOTH user profile and business_id
        filtered_train_data_by_label = {
            label: [
                item for profile_key, business_key, item in keyed_items
                if profile_key != test_profile and business_key != test_business_id
            ]
            for label, keyed_items in keyed_train_data_by_label.items()
        }

        print(f"Number of records in training set after filtering: {sum(len(v) for v in filtered_train_data_by_label.values())}")
        print(f"Excluding records with profile: {test_profile} and business id: {test_business_id}")

        # Calculate few-shot examples using precomputed and filtered training data
        few_shot_examples = calculate_harmonic_mean_and_top_k(test_instance, filtered_train_data_by_label)
        few_shot_formatted = "\n\n".join(
            [f"Example {k+1}:\n\n{doc['raw_chunk']}" for k, doc in enumerate(few_shot_examples)]
        )

        prompt = load_prompt_rag(few_shot_formatted)
        prompt_params = load_prompt_params(test_instance['user_profile'], test_instance['review'])
        system_label = generate_model_response(prompt, prompt_params, test_instance)

        if system_label:
            print(system_label['predicted_output'])
            system_labels.append(system_label)

    return system_labels

### Zero-Shot Inference

In [None]:
# Zero-shot run: load the annotated restaurant data, query the model once per
# record, and export the parsed predictions to Excel.
# NOTE(review): both paths are absolute and machine-specific.
upra_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/ground_truth_w_split_abstractive_rag.xlsx"
data_to_process = load_upra_data(upra_path, debugging=False)
results_zs = zero_shot_inference(data_to_process)
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/system_user_profiles_restaurants_reviews_match_zs.xlsx"
process_results_to_excel(results_zs, output_file)

### CoT Few-Shot Inference

In [None]:
# Fixed few-shot (CoT) run: reload the same restaurant data, query the model
# once per record, and export the parsed predictions to Excel.
# NOTE(review): both paths are absolute and machine-specific.
upra_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/ground_truth_w_split_abstractive_rag.xlsx"
data_to_process = load_upra_data(upra_path, debugging=False)
results_cot = few_shot_CoT_inference(data_to_process)
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/system_user_profiles_restaurants_reviews_match_cot.xlsx"
process_results_to_excel(results_cot, output_file)

### RAG Leave-one-out CV Inference

In [None]:
# Retrieval-augmented LOOCV run: reload the restaurant data, pick few-shot
# examples per record from the remaining records, and export the predictions.
# NOTE(review): both paths are absolute and machine-specific.
upra_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/ground_truth_w_split_abstractive_rag.xlsx"
data_to_process = load_upra_data(upra_path, debugging=False)
results_rag = loocv_inference(data_to_process, model)
# Written next to the zero-shot and CoT outputs; the previous target sat under
# a "hotels" directory although the data and the file name are for restaurants.
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/mturk/mturk-marketplace-ready/test/system_results/restaurants/system_user_profiles_restaurants_reviews_match_rag.xlsx"
process_results_to_excel(results_rag, output_file)