In [1]:
import os
import re
import gc
import json
import torch
import pandas as pd
from getpass import getpass
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama, HuggingFaceHub, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Release cached GPU memory and unreachable objects left over from earlier
# work in this kernel before any model is loaded.
torch.cuda.empty_cache()
gc.collect()


def _require_secret(env_var):
    """Make sure `env_var` is set, prompting for it (without echo) when it is missing.

    Credentials must never be written into the notebook: anything typed here is
    saved with the file and ends up in version control.
    """
    if not os.environ.get(env_var):
        os.environ[env_var] = getpass(f'{env_var}: ')


# SECURITY: earlier revisions of this cell embedded live API keys in the source.
# Those keys should be treated as compromised and revoked with each provider.
for _secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN", "LANGCHAIN_API_KEY"):
    _require_secret(_secret_name)

# Non-secret LangChain tracing configuration.
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_ENDPOINT']="https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT']="LLM_Context_Effects"

In [2]:
# Short model name -> Hugging Face Hub repository id.
model_hf_repo_id_mapping = {
    "mistral_7B": "mistralai/Mistral-7B-Instruct-v0.2",
    "llama2_7B": "meta-llama/Llama-2-7b-chat-hf",
    "llama2_13B": "meta-llama/Llama-2-13b-chat-hf",
    "llama2_70B": "meta-llama/Llama-2-70b-chat-hf",
    "llama3_8B": "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama3_70B": "meta-llama/Meta-Llama-3-70B-Instruct",
}

# Short model name -> Ollama model tag. Every tag carries the "-fp16" suffix,
# so only the family/size part is listed and the suffix is appended once.
model_ollama_id_mapping = {
    short_name: f"{base_tag}-fp16"
    for short_name, base_tag in (
        ("mistral_7B", "mistral:7b-instruct"),
        ("llama2_7B", "llama2:7b-chat"),
        ("llama2_13B", "llama2:13b-chat"),
        ("llama2_70B", "llama2:70b-chat"),
        ("llama3_8B", "llama3:8b-instruct"),
        ("llama3_70B", "llama3:70b-instruct"),
    )
}

# Model name -> backend family used by initialise_models to pick a constructor.
# The OpenAI models come first, followed by every model known to the Hugging
# Face mapping (same order as above).
model_name_type_mapping = {
    **dict.fromkeys(("gpt-3.5-turbo", "gpt-4"), "openai"),
    **dict.fromkeys(model_hf_repo_id_mapping, "open-source"),
}

def initialise_openai_models(model_name, temperature):
    """Create a ChatOpenAI client for `model_name` at the given sampling temperature.

    The API key is read from the OPENAI_API_KEY environment variable and the
    reply length is capped with max_tokens=20.
    """
    return ChatOpenAI(
        model=model_name,
        api_key=os.environ["OPENAI_API_KEY"],
        temperature=temperature,
        max_tokens=20,
    )

def initialise_open_source_models_transformers(model_name, temperature):
    """Load an open-source model through a transformers text-generation pipeline.

    Args:
        model_name: Short model name; must be a key of model_hf_repo_id_mapping.
        temperature: Sampling temperature. A value of 0 or below selects greedy
            decoding instead of sampling.

    Returns:
        A HuggingFacePipeline wrapping the created pipeline.

    Raises:
        KeyError: If `model_name` has no Hugging Face repository mapping, or the
            HUGGINGFACEHUB_API_TOKEN environment variable is not set.
    """
    repo_id = model_hf_repo_id_mapping[model_name]

    generation_kwargs = {"max_new_tokens": 22, "return_full_text": False}
    if temperature is not None and temperature <= 0:
        # Sampling requires a strictly positive temperature; the deterministic
        # equivalent of "temperature 0" is greedy decoding. The sampling-only
        # knobs are omitted so they are not passed alongside do_sample=False.
        generation_kwargs["do_sample"] = False
    else:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_k=50,
            top_p=0.9,
        )

    # Use a pipeline as a high-level helper
    pipe = pipeline("text-generation",
                    model=repo_id,
                    token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
                    device_map="sequential",
                    **generation_kwargs)
    return HuggingFacePipeline(pipeline=pipe)

def initialise_open_source_models_ollama(model_name, temperature):
    """Create an Ollama client for `model_name` served from the local Ollama endpoint.

    The model tag is looked up in model_ollama_id_mapping. The client is printed
    before being returned so its parameters appear in the cell output.
    """
    # NOTE(review): the recorded outputs below end without a closing brace,
    # presumably because generation stops at num_predict=20 - confirm.
    ollama_options = dict(
        base_url='http://localhost:11434',
        model=model_ollama_id_mapping[model_name],
        temperature=temperature,
        num_predict=20,
        format='json',
        num_gpu=-1,
    )
    model = Ollama(**ollama_options)
    print(model)
    return model


def initialise_models(model_name = 'mistral_7B', model_type = 'openai', temperature= 0.0):
    """Return a ready-to-use model for `model_name`, chosen by `model_type`.

    'openai' builds a ChatOpenAI client; any other value is served through
    Ollama.
    """
    # NOTE(review): the defaults pair an open-source model name with the
    # 'openai' type; callers in this notebook always pass both explicitly.
    if model_type != 'openai':
        return initialise_open_source_models_ollama(model_name, temperature)
    return initialise_openai_models(model_name, temperature)


In [3]:
# Sampling temperatures swept for every model.
temperatures = [0.001, 0.5, 1.0, 1.5]

# Country pairs whose similarity is asked about in both directions.
# NOTE(review): these look like the pairs from Tversky's similarity-asymmetry
# studies - confirm and cite the source.
similarity_effect_country_pairs = [
    ('U.S.A.', 'Mexico'),
    ('U.S.S.R.', 'Poland'),
    ('China', 'Albania'),
    ('U.S.A.', 'Israel'),
    ('Japan', 'Philippines'),
    ('U.S.A.', 'Canada'),
    ('U.S.S.R.', 'Israel'),
    ('England', 'Ireland'),
    ('Germany', 'Austria'),
    ('U.S.S.R.', 'France'),
    ('Belgium', 'Luxembourg'),
    ('U.S.A.', 'U.S.S.R.'),
    ('China', 'North Korea'),
    ('India', 'Sri Lanka'),
    ('U.S.A.', 'France'),
    ('U.S.S.R.', 'Cuba'),
    ('England', 'Jordan'),
    ('France', 'Israel'),
    ('U.S.A.', 'Germany'),
    ('U.S.S.R.', 'Syria'),
    ('France', 'Algeria'),
]

# Every question shares the same scale description and answer-shape hint;
# only the middle clause differs between phrasings.
_SCALE_PREAMBLE = "On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, "
_SHAPE_HINT = " Shape: score: int"

_reversed_country_pairs = [(second, first) for first, second in similarity_effect_country_pairs]


def _build_questions(pairs, clause):
    """Map 'A-B' to the full question text for each (A, B) in `pairs`.

    `clause` is a str.format pattern in which {0} is the first country and
    {1} the second.
    """
    return {
        f'{first}-{second}': f"{_SCALE_PREAMBLE}{clause.format(first, second)}?{_SHAPE_HINT}"
        for first, second in pairs
    }


# "_1" dictionaries use the listed order, "_2" dictionaries the reversed order.
questions_order_1 = _build_questions(similarity_effect_country_pairs, "how similar are {0} and {1}")
questions_order_2 = _build_questions(_reversed_country_pairs, "how similar are {0} and {1}")

questions_order_similar_to_1 = _build_questions(similarity_effect_country_pairs, "is {0} similar to {1}")
questions_order_similar_to_2 = _build_questions(_reversed_country_pairs, "is {0} similar to {1}")

# NOTE(review): this phrasing lacks a verb ("... to which A *is* similar to B").
# It is kept verbatim because changing the wording changes the prompts sent to
# the models and therefore the experiment.
questions_order_similar_degree_1 = _build_questions(
    similarity_effect_country_pairs, "assess the degree to which {0} similar to {1}")
questions_order_similar_degree_2 = _build_questions(
    _reversed_country_pairs, "assess the degree to which {0} similar to {1}")

print(questions_order_similar_to_1)


{'U.S.A.-Mexico': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is U.S.A. similar to Mexico? Shape: score: int', 'U.S.S.R.-Poland': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is U.S.S.R. similar to Poland? Shape: score: int', 'China-Albania': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is China similar to Albania? Shape: score: int', 'U.S.A.-Israel': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is U.S.A. similar to Israel? Shape: score: int', 'Japan-Philippines': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is Japan similar to Philippines? Shape: score: int', 'U.S.A.-Canada': 'On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is U.S.A. similar to Canada? Shape: score: int', 'U.S.S.R.-Israel': 'On a scale of 0 to 20, where 0 means no similarity and 

In [4]:
# Prompt used for the first attempt at each country pair.
template = """Answer the following two questions to the best of your knowledge. In each case, your answer should be a well-formed json of shape provided.
Question 1: {text_1}
Question 2: {text_2}
Answer: score_1: int, score_2: int
"""
# Fallback prompt used when the first answer cannot be parsed: same questions
# plus an explicit request for integer scores.
# This used to be a chained assignment (`modified_template = template = ...`),
# which silently replaced `template` too, so both prompts were identical and
# retrying with the "modified" prompt changed nothing.
modified_template = """Answer the following two questions to the best of your knowledge. In each case, your answer should be a well-formed json of shape provided.
                                Question 1: {text_1}
                                Question 2: {text_2}
                                Answer: score_1: int, score_2: int
                                Please provide integer score to both the questions. 
                                """
# Wrap both template strings as chat prompts: `prompt` for the first attempt,
# `modified_prompt` for retries.
prompt, modified_prompt = (
    ChatPromptTemplate.from_template(template_text)
    for template_text in (template, modified_template)
)

In [5]:
# Roster of models used across runs. Kept under its own name so it documents
# the available choices without being silently overwritten by the selection.
candidate_models = [
    'mistral_7B',
    'llama2_7B',
    'llama3_8B',
    'llama2_13B',
    'gpt-3.5-turbo',
    'gpt-4',
]

# Models evaluated in this run.
models = ['llama2_70B']


In [6]:
def parse_numeric_output(raw_output):
    """Return the first run of digits in `raw_output` as a string, or None if there is none."""
    first_number = re.search(r'\d+', raw_output)
    return first_number.group() if first_number else None
def parse_json_to_numeric(raw_output):
    """Extract the integer values of all `score_<n>` fields from a model reply.

    A regular expression is used rather than a JSON parser so that incomplete
    replies (for example one missing its closing brace) still yield scores.
    Whitespace around the colon is optional, keys and values may be quoted or
    unquoted, and a leading minus sign is captured so that callers can reject
    out-of-range values instead of never seeing them.

    Args:
        raw_output: Text returned by the model.

    Returns:
        The scores as a list of ints in order of appearance; empty when no
        score field is found.
    """
    pattern = r'''["']?score_\d+["']?\s*:\s*["']?(-?\d+)'''

    scores = [int(value) for value in re.findall(pattern, raw_output)]

    # Echo what was understood so parsing problems are visible in the cell output.
    print("Integer values extracted from JSON:", scores)
    return scores
    


In [7]:
# Template for one result row; its keys double as the CSV column names.
results_dict_columns = {
    'country_pair': '',
    'prompt_style': 'dual',
    'model_name': '',
    'temperature': '',
    'sim_score_1': [],
    'sim_score_2': [],
    'sim_diff': [],
    'p-values': [],
}

# Where results are accumulated across runs.
file_path = './results/results_dual_prompt_similar_degree.csv'

# Continue from earlier results when the file is present; otherwise start with
# an empty frame that only has the expected columns.
df = (
    pd.read_csv(file_path)
    if os.path.isfile(file_path)
    else pd.DataFrame(columns=results_dict_columns)
)

# Print the DataFrame
# print(df)

In [8]:
def create_chain(prompt, model, order, model_name, temperature):
    """Compose prompt -> model -> string parser and tag the chain with run metadata.

    The metadata records the country-pair order, model name, temperature and
    the fixed prompt style 'dual'.
    """
    run_metadata = {
        'country-pair-order': order,
        'model_name': model_name,
        'temperature': temperature,
        'prompt_style': 'dual',
    }
    pipeline_chain = prompt | model | StrOutputParser()
    return pipeline_chain.with_config({"metadata": run_metadata})

def get_output(prompt, model, order, model_name, temperature, ques_1, ques_2):
    """Ask the model both questions in a single prompt and return its reply as text.

    `ques_1` and `ques_2` fill the `text_1` and `text_2` slots of `prompt`.
    """
    prompt_inputs = {"text_1": ques_1, "text_2": ques_2}
    return create_chain(prompt, model, order, model_name, temperature).invoke(prompt_inputs)

In [9]:

# How many times a pair is re-asked with the modified prompt before giving up.
MAX_PROMPT_RETRIES = 5


def _is_valid_score_pair(scores):
    """Return True when `scores` holds exactly two values, each within the 0-20 scale."""
    return bool(scores) and len(scores) == 2 and all(0 <= score <= 20 for score in scores)


# Rows are collected here and merged into `df` once, instead of re-copying the
# whole frame with pd.concat on every iteration.
new_rows = []
try:
    for model_name in models:
        model_type = model_name_type_mapping[model_name]
        for temperature in temperatures:
            model = initialise_models(model_name, model_type, temperature)
            # Keys are taken from the same dictionaries the questions are read from.
            for order_1, order_2 in zip(questions_order_similar_degree_1, questions_order_similar_degree_2):
                ques_1 = questions_order_similar_degree_1[order_1]
                ques_2 = questions_order_similar_degree_2[order_2]

                output = get_output(prompt, model, order_1, model_name, temperature, ques_1, ques_2)
                print(output)
                parsed_output = parse_json_to_numeric(output)

                # Re-ask with the modified prompt until the reply is usable or the
                # retry budget is spent. A valid reply is accepted on any attempt,
                # including the last one (previously the final retry was always
                # discarded).
                retry_count = 0
                while not _is_valid_score_pair(parsed_output) and retry_count < MAX_PROMPT_RETRIES:
                    print(f'Issue with output parsing for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature} output: {output}. Modifying prompt and calling LLM again ')
                    output = get_output(modified_prompt, model, order_1, model_name, temperature, ques_1, ques_2)
                    print(f'new output with modified prompt is {output}')
                    parsed_output = parse_json_to_numeric(output)
                    retry_count += 1

                if _is_valid_score_pair(parsed_output):
                    sim_score_1, sim_score_2 = parsed_output
                    sim_diff = sim_score_1 - sim_score_2
                else:
                    print('Tried to modify prompt too many times, now giving up')
                    sim_score_1 = sim_score_2 = sim_diff = None
                    print(f' cannot parse output {output} for Model_name: {model_name}, Pair: {order_1}, Temperature: {temperature}')

                print(f'for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}, output1: {sim_score_1}, output2: {sim_score_2}')
                # Build each row from the template so unfilled columns keep
                # their placeholder values.
                new_rows.append({
                    **results_dict_columns,
                    'country_pair': order_1,
                    'model_name': model_name,
                    'temperature': temperature,
                    'sim_score_1': sim_score_1,
                    'sim_score_2': sim_score_2,
                    'sim_diff': sim_diff,
                })
finally:
    # Merge whatever was collected, even if the run is interrupted part-way,
    # so finished pairs are not lost.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)])
                
                

[1mOllama[0m
Params: {'model': 'llama2:70b-chat-fp16', 'format': 'json', 'options': {'mirostat': None, 'mirostat_eta': None, 'mirostat_tau': None, 'num_ctx': None, 'num_gpu': -1, 'num_thread': None, 'num_predict': 20, 'repeat_last_n': None, 'repeat_penalty': None, 'temperature': 0.001, 'stop': None, 'tfs_z': None, 'top_k': None, 'top_p': None}, 'system': None, 'template': None, 'keep_alive': None}
{
"score_1": 10,
"score_2": 15
Integer values extracted from JSON: [10, 15]
for Model_name: llama2_70B, Pair: Mexico-U.S.A., Temperature: 0.001, output1: 10, output2: 15
{
"score_1": 12,
"score_2": 15
Integer values extracted from JSON: [12, 15]
for Model_name: llama2_70B, Pair: Poland-U.S.S.R., Temperature: 0.001, output1: 12, output2: 15
{
"score_1": 8,
"score_2": 12

Integer values extracted from JSON: [8, 12]
for Model_name: llama2_70B, Pair: Albania-China, Temperature: 0.001, output1: 8, output2: 12
{
"score_1": 15,
"score_2": 18
Integer values extracted from JSON: [15, 18]
for Model_n

In [10]:
# Create the results directory first: to_csv does not create missing parent
# directories, and failing here would come only after the whole run finished.
os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
df.to_csv(file_path, index=False, mode='w')

In [11]:
# Drop the last model reference and reclaim memory. Guarded so the cell can be
# re-run, or run when no model was ever created, without raising NameError.
try:
    del model
except NameError:
    pass
else:
    print('model deleted..')
gc.collect()
torch.cuda.empty_cache()

model deleted..
