In [None]:
# Import libraries
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.evaluation import load_evaluator


In [None]:
# System instruction placed as the first chat message of every request
# (read as a module-level global by get_completion).
system_prompt = "Rewrite the following disfluent question into a fluent and understandable question without answering it."

In [None]:
def _extract_question(text, marker='assistant\n\n'):
    '''
    Cut the rewritten question out of a decoded generation.

    Everything up to and including the first occurrence of ``marker`` is
    dropped, then the text is truncated right after the first '?'.
    If the marker is absent the text is kept as is (instead of blindly
    chopping a fixed number of characters), and if there is no '?' the
    remaining text is returned whole (instead of an empty string).
    '''
    _, found, tail = text.partition(marker)
    if found:
        text = tail
    end = text.find('?')
    if end == -1:
        return text
    return text[:end + 1]


def infere(data, out_path):
    '''
    Run the model on every row of a dataframe and store the predictions.

    For each row, the value in the 'disfluent' column is sent to
    get_completion (using the module-level ``model`` and ``tokenizer``) and
    the extracted question is collected. The predictions are written to a new
    'llama' column of ``data`` (the dataframe is modified in place) and the
    dataframe is saved to ``out_path`` as CSV without the index.

    Args:
        data: DataFrame with a 'disfluent' column.
        out_path: Destination path of the CSV file.
    '''
    total = len(data)
    pred = []  # One prediction per row, in row order
    for index, row in data.iterrows():
        print(f"{index}/{total}")  # Progress indicator
        result = get_completion(query=row['disfluent'], model=model, tokenizer=tokenizer)
        pred.append(_extract_question(result))
    data['llama'] = pred

    data.to_csv(out_path, index=False)

def _mean_score(values):
    '''Return the mean of ``values`` rounded to 4 decimals, or NaN when empty.'''
    if not values:
        return float('nan')
    return round(sum(values) / len(values), 4)


def model_eval(data, out_path):
    '''
    Score the predictions against the references and save the results.

    Each prediction is compared with the 'original' column using three
    evaluators loaded through ``load_evaluator``: "embedding_distance",
    "exact_match" (ignoring case, numbers and punctuation) and
    "string_distance". Both strings are stripped before scoring.

    Predictions are read from the 'llama_revised' column when it exists and
    otherwise from 'llama', which is the column written by infere.

    The per-row scores are added to ``data`` in place as 'embed_dist',
    'exact_match' and 'string_distance'; summary figures are printed and the
    dataframe is written to ``out_path`` as CSV without the index.

    Args:
        data: DataFrame with an 'original' column and a prediction column.
        out_path: Destination path of the CSV file.
    '''
    embedding_model = HuggingFaceEmbeddings()
    hf_evaluator = load_evaluator("embedding_distance", embeddings=embedding_model)
    evaluator = load_evaluator("exact_match", ignore_case=True, ignore_numbers=True, ignore_punctuation=True)
    sm_evaluator = load_evaluator("string_distance")

    # Prefer the revised predictions, but fall back to the raw model output so
    # the function also works directly on what infere produced.
    pred_col = 'llama_revised' if 'llama_revised' in data.columns else 'llama'

    dist = []  # Embedding distances
    exact = []  # Exact match scores
    string_match = []  # String distances

    for prediction, reference in zip(data[pred_col], data['original']):
        # Strip once per row and reuse the cleaned strings for all evaluators
        prediction = prediction.strip()
        reference = reference.strip()
        dist.append(hf_evaluator.evaluate_strings(prediction=prediction, reference=reference)['score'])
        exact.append(evaluator.evaluate_strings(prediction=prediction, reference=reference)['score'])
        string_match.append(sm_evaluator.evaluate_strings(prediction=prediction, reference=reference)['score'])

    data['embed_dist'] = dist
    data['exact_match'] = exact
    data['string_distance'] = string_match

    # Exact match is reported as a total, the two distances as averages
    print(f"Exact match: {sum(exact)}")
    print(f"Embedding distance: {_mean_score(dist)}")
    print(f"String distance: {_mean_score(string_match)}")

    data.to_csv(out_path, index=False)

def get_completion(query, model, tokenizer):
    '''
    Generate the model output for a single query.

    A two-message conversation is built (the module-level ``system_prompt``
    as the system message, ``query`` as the user message), rendered to text
    with the tokenizer's chat template, tokenized, and passed to
    ``model.generate`` with sampling enabled and at most 256 new tokens.

    Args:
        query: Text used as the user message.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        Callers in this file expect it to still contain the prompt text and
        cut it off at the assistant header.
    '''
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    # NOTE(review): no add_generation_prompt=True is passed, so the rendered
    # prompt presumably ends after the user turn - confirm this matches how
    # the model was fine-tuned.
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)

    # NOTE(review): chat templates usually insert their own special tokens, so
    # add_special_tokens=True may duplicate the BOS token - verify against the
    # tokenizer before changing, as it alters the model input.
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    # Put the inputs wherever the model's weights are instead of hard-coding
    # "cuda:0": the model may have been loaded on CPU or on another GPU, and
    # mismatched devices make generate() fail.
    model_inputs = encodeds.to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=256, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def json_to_df(data):
    '''
    Build a two-column dataframe from the loaded json mapping.

    Every value of ``data`` must provide the keys 'original' and 'disfluent';
    the resulting dataframe has one row per entry, in the mapping's order,
    with the columns 'original' and 'disfluent'.
    '''
    entries = [data[name] for name in data]

    frame = pd.DataFrame()
    frame['original'] = [entry['original'] for entry in entries]
    frame['disfluent'] = [entry['disfluent'] for entry in entries]
    return frame

In [None]:
# Read the train and dev json files.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default text encoding.
with open("train.json", "r", encoding="utf-8") as f:
    train_json = json.load(f)

with open("dev.json", "r", encoding="utf-8") as f:
    dev_json = json.load(f)

In [None]:
# Turn both loaded json mappings into dataframes
train, dev = (json_to_df(raw) for raw in (train_json, dev_json))

In [None]:
# Model and tokenizer are loaded from the same checkpoint name
model_name = "merged_model_llama"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Run inference on the train split and save the predictions
infere(data=train, out_path="train_pred.csv")

In [None]:
# Score the train split and save the dataframe with the metric columns
model_eval(data=train, out_path="train_pred.csv")

In [None]:
# Run inference on the dev split and save the predictions
infere(data=dev, out_path="dev_pred.csv")

In [None]:
# Score the dev split and save the dataframe with the metric columns
model_eval(data=dev, out_path="dev_pred.csv")