In [6]:
import os
import re
import gc
import json
import torch
import pandas as pd
from getpass import getpass
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama, HuggingFaceHub, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
torch.cuda.empty_cache()
gc.collect()
# print(GPUtil.showUtilization())
# OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = 'sk-proj-Y2nqeGoOCxTZnITAzuFdT3BlbkFJf6Rm2tmkcssXIov3PMFQ'
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_ENDPOINT']="https://api.smith.langchain.com"
os.environ['LANGCHAIN_API_KEY']="ls__93636df794f14ccba7162354d46779d8"
os.environ['LANGCHAIN_PROJECT']="LLM_Context_Effects"

In [7]:
model_hf_repo_id_mapping = {
    'mistral_7B': "mistralai/Mistral-7B-Instruct-v0.2",
    'llama2_7B':'meta-llama/Llama-2-7b-chat-hf',
    'llama2_13B': 'meta-llama/Llama-2-13b-chat-hf',
    'llama2_70B': 'meta-llama/Llama-2-70b-chat-hf',
    'llama3_8B':'meta-llama/Meta-Llama-3-8B-Instruct',
    'llama3_70B': 'meta-llama/Meta-Llama-3-70B-Instruct'
}

model_ollama_id_mapping = {
    'mistral_7B': "mistral:7b-instruct-fp16",
    'llama2_7B': 'llama2:7b-chat-fp16',
    'llama2_13B': 'llama2:13b-chat-fp16',
    'llama2_70B': 'llama2:70b-chat-fp16',
    'llama3_8B':'llama3:8b-instruct-fp16',
    'llama3_70B': 'llama3:70b-instruct-fp16'
}

model_name_type_mapping={
    'gpt-3.5-turbo': 'openai',
    'gpt-4': 'openai',
    'mistral_7B': 'open-source',
    'llama2_7B': 'open-source',
    'llama2_13B': 'open-source',
    'llama2_70B': 'open-source',
    'llama3_8B': 'open-source',
    'llama3_70B': 'open-source',
}

def initialise_openai_models(model_name, temperature):
    model = ChatOpenAI(model=model_name, api_key=os.environ["OPENAI_API_KEY"], temperature=temperature, max_tokens=200)
    return model

def initialise_open_source_models_transformers(model_name, temperature):
    # Use a pipeline as a high-level helper
    repo_id = model_hf_repo_id_mapping[model_name]
    pipe = pipeline("text-generation",
                    model=repo_id,
                    token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
                    device_map = "sequential", max_new_tokens = 10,
                    do_sample = True,
                    return_full_text = False,
                    temperature = temperature,
                    top_k = 50,
                    top_p = 0.9)
    return  HuggingFacePipeline(pipeline=pipe)

def initialise_open_source_models_ollama(model_name, temperature):
    ollama_id = model_ollama_id_mapping[model_name]
    model = Ollama(base_url='http://localhost:11434',
    model=ollama_id, temperature = temperature, num_predict = 250, format = 'json', num_gpu=-1)
    print(model)
    return model


def initialise_models(model_name = 'mistral_7B', model_type = 'openai', temperature= 0.0):
    if model_type == 'openai':
        return initialise_openai_models(model_name, temperature)
    else:
        return initialise_open_source_models_ollama(model_name, temperature)


In [8]:
template = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided
Text: {text}
"""
modified_template = """Answer the following question to the best of your knowledge.  Your answer should be a json of shape provided.
                                Text: {text}
                                Please provide an integer score.
                     """
template_cot = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided. Also, mention how you arrived at the score.
Text: {text}
Lets think step by step.
"""
prompt = ChatPromptTemplate.from_template(template)
modified_prompt = ChatPromptTemplate.from_template(modified_template)
prompt_cot = ChatPromptTemplate.from_template(template_cot)

In [9]:
temperatures = [0.001, 0.5, 1.0, 1.5]
similarity_effect_country_pairs = [
('U.S.A.', 'Mexico'),
('U.S.S.R.', 'Poland'),
('China', 'Albania'),
('U.S.A.', 'Israel'),
('Japan', 'Philippines'),
('U.S.A.', 'Canada'),
('U.S.S.R.', 'Israel'),
('England', 'Ireland'),
('Germany', 'Austria'),
('U.S.S.R.', 'France'),
('Belgium', 'Luxembourg'),
('U.S.A.', 'U.S.S.R.'),
('China', 'North Korea'),
('India', 'Sri Lanka'),
('U.S.A.', 'France'),
('U.S.S.R.', 'Cuba'),
('England', 'Jordan'),
('France', 'Israel'),
('U.S.A.', 'Germany'),
('U.S.S.R.', 'Syria'),
('France', 'Algeria')]

SSA_order_1 = {}
SSA_order_2 = {}

SST_order_1 = {}
SST_order_2 = {}

SSD_order_1 = {}
SSD_order_2 = {}


for country1, country2 in similarity_effect_country_pairs:
    order_1 = f'{country1}-{country2}'
    order_2 = f'{country2}-{country1}'
    SSA_order_1[order_1] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country1} and {country2}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""


    SSA_order_2[order_2] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country2} and {country1}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""

    SST_order_1[order_1] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country1} similar to {country2}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""


    SST_order_2[order_2] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country2} similar to {country1}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""

    SSD_order_1[order_1] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country1} is similar to {country2}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""
    SSD_order_2[order_2] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country2} is similar to {country1}? 
    Please reason about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""

# print(questions_order_similar_degree_2)


In [10]:
results_dict_columns = {
    'country_pair': '',
    'prompt_style': 'cot',
    'model_name': '',
    'temperature': '',
    'sim_score_1': [],
    'sim_score_2': [],
    'sim_diff': [],
    'cot_reasoning_1': '',
    'cot_reasoning_2': '',
    'p-values': []
}
# Define the file path
file_path = './results/results_cot_prompt_aligned_models.csv'

# Check if the file exists
if os.path.isfile(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
else:
    # Create an empty DataFrame from the dictionary
    df = pd.DataFrame(columns=results_dict_columns)

# Print the DataFrame
print(df)

    Unnamed: 0       country_pair prompt_style model_name  temperature  \
0            0      U.S.A.-Mexico          cot  llama3_8B        0.001   
1            1    U.S.S.R.-Poland          cot  llama3_8B        0.001   
2            2      China-Albania          cot  llama3_8B        0.001   
3            3      U.S.A.-Israel          cot  llama3_8B        0.001   
4            4  Japan-Philippines          cot  llama3_8B        0.001   
..         ...                ...          ...        ...          ...   
58          58     England-Jordan          cot      gpt-4        0.500   
59          59      France-Israel          cot      gpt-4        0.500   
60          60     U.S.A.-Germany          cot      gpt-4        0.500   
61          61     U.S.S.R.-Syria          cot      gpt-4        0.500   
62          62     France-Algeria          cot      gpt-4        0.500   

    sim_score_1  sim_score_2  sim_diff  \
0            12           15        -3   
1            10            

In [2]:
def create_chain(prompt, model, order, model_name, temperature):
    return (prompt | model | StrOutputParser()).with_config({
"metadata": {
    'country-pair-order': order,
    'model_name':model_name,
    'temperature': temperature,
}}
)

def get_output(prompt, model, order, model_name, temperature, ques):
    chain = create_chain(prompt, model, order, model_name, temperature)
    return chain.invoke({"text": ques})

def parse_numeric_output(raw_output):
    match = re.search(r'\d+', raw_output)
    # print(match, match.group())
    if match:
        return match.group()
    return None

def parse_cot_json_output(raw_output):
    raw_output = raw_output.replace("\n", " ")
    # json_dict = json.loads(raw_output)
    score = reasoning = None
    try:
        json_dict = json.loads(raw_output)
    except:
        print(f'error in parsing raw output to json. output is {raw_output}')
        return reasoning, score, True

    if 'score' in json_dict.keys():
        score = json_dict['score']

    if 'Reasoning' in json_dict.keys():
        reasoning = json_dict['Reasoning']

    return reasoning, score, False

    

In [3]:
prompt_style_template_mapping = {
    'SD': {'order_1': SSD_order_1, 'order_2': SSD_order_2},
    'ST':{'order_1': SST_order_1, 'order_2': SST_order_2},
    'SA':{'order_1': SSA_order_1, 'order_2': SSA_order_2}
}

NameError: name 'SSD_order_1' is not defined

In [24]:
# models = [ 
#     'mistral_7B',
#  'llama2_7B',
#   'llama3_8B',
#  'llama2_13B',
#   'gpt-3.5-turbo',
#  'gpt-4'
# ]
# models = ['llama3_70B']
# for model_name in models:
#     model_type = model_name_type_mapping[model_name]
#     results_dict_columns['model_name'] = model_name
#     for temperature in temperatures[:1]:
#         model = initialise_models(model_name, model_type, temperature)
#         results_dict_columns['temperature'] = temperature
#         for order_1, order_2 in zip(questions_order_similar_to_1, questions_order_similar_to_2):
#             results_dict_columns['country_pair'] = order_1
#             ques_1 = questions_order_similar_degree_1[order_1]
#             ques_2 = questions_order_similar_degree_2[order_2]
#             output_1 = get_output(prompt, model, order_1, model_name, temperature, ques_1)
#             output_2 = get_output(prompt, model, order_2, model_name, temperature, ques_2)
#             # print(output_1)
#             reasoning_1, sim_score_1, error_1 = parse_cot_json_output(output_1)
#             reasoning_2, sim_score_2, error_2 = parse_cot_json_output(output_2)
#             # if error_1 or error_2:
#             #     df.to_csv(file_path, index=False, mode='w')
#             if sim_score_1!=None and sim_score_2!=None:
#                 sim_diff = sim_score_1 - sim_score_2
#             else:
#                 sim_diff = None
#             print(f'for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}, output1: {output_1}, output2: {output_2}')
#             results_dict_columns['sim_score_1'] = sim_score_1
#             results_dict_columns['sim_score_2'] = sim_score_2
#             results_dict_columns['sim_diff'] = sim_diff
#             results_dict_columns['cot_reasoning_1'] = reasoning_1
#             results_dict_columns['cot_reasoning_2'] = reasoning_2

#             df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])
#             # del model
#         # print('model deleted..')
#         # gc.collect()
#         # torch.cuda.empty_cache()
                
                

In [25]:
models = [ 
    'mistral_7B',
 'llama2_7B',
  'llama3_8B',
 'llama2_13B',
  'gpt-3.5-turbo',
 'gpt-4'
]
models = ['llama3_70B']

settings = [('llama3_8B', '0.001', 'SST'), ('llama3_8B', '0.5', 'SST'), ('gpt-4', '0.5', 'SSD') ]
for setting in settings:
    model_name = setting[0]
    temperature = float(setting[1])
    model_type = model_name_type_mapping[model_name]
    results_dict_columns['model_name'] = model_name
    model = initialise_models(model_name, model_type, temperature)
    print(f'model is {model_name}, temperature is {temperature}')
    results_dict_columns['temperature'] = temperature
    prompt_context = setting[2][0]
    prompt_style = ''.join(setting[2][1:])
    print(prompt_context)
    print(prompt_style)
    questions_1 = prompt_style_template_mapping[prompt_style]['order_1']
    questions_2 = prompt_style_template_mapping[prompt_style]['order_2']
    for order_1, order_2 in zip(questions_1, questions_2):
        print(f'order_1: {order_1}')
        results_dict_columns['country_pair'] = order_1
        ques_1 = questions_1[order_1]
        ques_2 = questions_2[order_2]
        print(f'ques_1: {ques_1}')
        output_1 = get_output(prompt, model, order_1, model_name, temperature, ques_1)
        output_2 = get_output(prompt, model, order_2, model_name, temperature, ques_2)
        # print(output_1)
        reasoning_1, sim_score_1, error_1 = parse_cot_json_output(output_1)
        reasoning_2, sim_score_2, error_2 = parse_cot_json_output(output_2)
        # if error_1 or error_2:
        #     df.to_csv(file_path, index=False, mode='w')
        if sim_score_1!=None and sim_score_2!=None:
            sim_diff = sim_score_1 - sim_score_2
        else:
            sim_diff = None
        print(f'for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}, output1: {output_1}, output2: {output_2}')
        results_dict_columns['sim_score_1'] = sim_score_1
        results_dict_columns['sim_score_2'] = sim_score_2
        results_dict_columns['sim_diff'] = sim_diff
        results_dict_columns['cot_reasoning_1'] = reasoning_1
        results_dict_columns['cot_reasoning_2'] = reasoning_2

        df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])
            # del model
        # print('model deleted..')
        # gc.collect()
        # torch.cuda.empty_cache()
                
                

[1mOllama[0m
Params: {'model': 'llama3:8b-instruct-fp16', 'format': 'json', 'options': {'mirostat': None, 'mirostat_eta': None, 'mirostat_tau': None, 'num_ctx': None, 'num_gpu': -1, 'num_thread': None, 'num_predict': 250, 'repeat_last_n': None, 'repeat_penalty': None, 'temperature': 0.001, 'stop': None, 'tfs_z': None, 'top_k': None, 'top_p': None}, 'system': None, 'template': None, 'keep_alive': None}
model is llama3_8B, temperature is 0.001
S
ST
order_1: U.S.A.-Mexico
ques_1: 
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity,is U.S.A. similar to Mexico?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int
for Model_name: llama3_8B, Pair: Mexico-U.S.A., Temperature: 0.001, output1: {
"Reasoning": "U.S.A. and Mexico share a border, have similar cultural influences such as Spanish language and Catholicism, and have economic ties through tr

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_8B, Pair: Poland-U.S.S.R., Temperature: 0.001, output1: {
"Reasoning": "U.S.S.R. and Poland were both Eastern European countries with a shared history of being part of the Soviet sphere of influence during the Cold War era. They also share similar cultural and linguistic roots, with Polish being a Slavic language closely related to Russian. However, they had distinct political systems, economies, and histories, with Poland being a sovereign state since 1918 and U.S.S.R. being a socialist republic from 1922 to 1991.",
"score": 10
}, output2: {
"Reasoning": "Poland and U.S.S.R. (now Russia) have a complex history, with Poland being part of the Soviet sphere of influence during the Cold War era. Although they share some cultural similarities due to their shared Slavic heritage, they have distinct differences in terms of language, politics, economy, and geography. Poland has been an independent country since 1918, while U.S.S.R. was a socialist state that dissolved i

  warn_deprecated(


model is gpt-4, temperature is 0.5
S
SD
order_1: U.S.A.-Mexico
ques_1: On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which U.S.A. is similar to Mexico?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int
for Model_name: gpt-4, Pair: Mexico-U.S.A., Temperature: 0.5, output1: {
    "Reasoning": ["The U.S.A. and Mexico have some similarities but also have many differences. They share a border and have significant cultural exchanges. Both have diverse populations and landscapes, from mountains to coasts. However, they differ greatly in terms of economic development, political systems, and social issues. The U.S.A. is a developed country with a high standard of living, while Mexico, despite being an emerging economy, still struggles with poverty and social inequality. Furthermore, the U.S.A. is a federal republic, whereas 

In [26]:
df.to_csv(file_path, index=False, mode='w')

In [27]:
df

Unnamed: 0,country_pair,prompt_style,model_name,temperature,sim_score_1,sim_score_2,sim_diff,cot_reasoning_1,cot_reasoning_2,p-values
0,U.S.A.-Mexico,cot,llama3_8B,0.001,12,15,-3,"U.S.A. and Mexico share a border, have similar...","Mexico and the United States share a border, w...",[]
0,U.S.S.R.-Poland,cot,llama3_8B,0.001,10,4,6,U.S.S.R. and Poland were both Eastern European...,Poland and U.S.S.R. (now Russia) have a comple...,[]
0,China-Albania,cot,llama3_8B,0.001,2,0,2,China and Albania are two countries with vastl...,Albania and China are two vastly different cou...,[]
0,U.S.A.-Israel,cot,llama3_8B,0.001,12,15,-3,"U.S.A. and Israel share some similarities, suc...","Israel and USA share many similarities, includ...",[]
0,Japan-Philippines,cot,llama3_8B,0.001,8,8,0,Japan and the Philippines share some cultural ...,Philippines and Japan share some cultural simi...,[]
...,...,...,...,...,...,...,...,...,...,...
0,England-Jordan,cot,gpt-4,0.500,2,2,0,[England and Jordan are two very different cou...,[Jordan and England are very different in term...,[]
0,France-Israel,cot,gpt-4,0.500,7,5,2,"[France and Israel, although both developed co...","[Israel and France have different languages, c...",[]
0,U.S.A.-Germany,cot,gpt-4,0.500,10,12,-2,[USA and Germany are both developed western co...,[Germany and the U.S.A. are both developed cou...,[]
0,U.S.S.R.-Syria,cot,gpt-4,0.500,8,5,3,[The USSR and Syria are both countries that ha...,[Syria and the U.S.S.R. have some similarities...,[]
