In [1]:
import os
import re
import gc
import json
import torch
import pandas as pd
from getpass import getpass
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama, HuggingFaceHub, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
torch.cuda.empty_cache()
gc.collect()
# print(GPUtil.showUtilization())
# OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = 'sk-proj-Y2nqeGoOCxTZnITAzuFdT3BlbkFJf6Rm2tmkcssXIov3PMFQ'
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_ENDPOINT']="https://api.smith.langchain.com"
os.environ['LANGCHAIN_API_KEY']="ls__93636df794f14ccba7162354d46779d8"
os.environ['LANGCHAIN_PROJECT']="LLM_Context_Effects"

In [2]:
model_hf_repo_id_mapping = {
    'mistral_7B': "mistralai/Mistral-7B-Instruct-v0.2",
    'llama2_7B':'meta-llama/Llama-2-7b-chat-hf',
    'llama2_13B': 'meta-llama/Llama-2-13b-chat-hf',
    'llama2_70B': 'meta-llama/Llama-2-70b-chat-hf',
    'llama3_8B':'meta-llama/Meta-Llama-3-8B-Instruct',
    'llama3_70B': 'meta-llama/Meta-Llama-3-70B-Instruct'
}

model_ollama_id_mapping = {
    'mistral_7B': "mistral:7b-instruct-fp16",
    'llama2_7B': 'llama2:7b-chat-fp16',
    'llama2_13B': 'llama2:13b-chat-fp16',
    'llama2_70B': 'llama2:70b-chat-fp16',
    'llama3_8B':'llama3:8b-instruct-fp16',
    'llama3_70B': 'llama3:70b-instruct-fp16'
}

model_name_type_mapping={
    'gpt-3.5-turbo': 'openai',
    'gpt-4': 'openai',
    'mistral_7B': 'open-source',
    'llama2_7B': 'open-source',
    'llama2_13B': 'open-source',
    'llama2_70B': 'open-source',
    'llama3_8B': 'open-source',
    'llama3_70B': 'open-source',
}

def initialise_openai_models(model_name, temperature):
    model = ChatOpenAI(model=model_name, api_key=os.environ["OPENAI_API_KEY"], temperature=temperature, max_tokens=200)
    return model

def initialise_open_source_models_transformers(model_name, temperature):
    # Use a pipeline as a high-level helper
    repo_id = model_hf_repo_id_mapping[model_name]
    pipe = pipeline("text-generation",
                    model=repo_id,
                    token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
                    device_map = "sequential", max_new_tokens = 10,
                    do_sample = True,
                    return_full_text = False,
                    temperature = temperature,
                    top_k = 50,
                    top_p = 0.9)
    return  HuggingFacePipeline(pipeline=pipe)

def initialise_open_source_models_ollama(model_name, temperature):
    ollama_id = model_ollama_id_mapping[model_name]
    model = Ollama(base_url='http://localhost:11434',
    model=ollama_id, temperature = temperature, num_predict = 250, format = 'json', num_gpu=-1)
    print(model)
    return model


def initialise_models(model_name = 'mistral_7B', model_type = 'openai', temperature= 0.0):
    if model_type == 'openai':
        return initialise_openai_models(model_name, temperature)
    else:
        return initialise_open_source_models_ollama(model_name, temperature)


In [3]:
template = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided
Text: {text}
"""
modified_template = """Answer the following question to the best of your knowledge.  Your answer should be a json of shape provided.
                                Text: {text}
                                Please provide an integer score.
                     """
template_cot = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided. Also, mention how you arrived at the score.
Text: {text}
Lets think step by step.
"""
prompt = ChatPromptTemplate.from_template(template)
modified_prompt = ChatPromptTemplate.from_template(modified_template)
prompt_cot = ChatPromptTemplate.from_template(template_cot)

In [4]:
temperatures = [0.001, 0.5, 1.0, 1.5]
similarity_effect_country_pairs = [
('U.S.A.', 'Mexico'),
('U.S.S.R.', 'Poland'),
('China', 'Albania'),
('U.S.A.', 'Israel'),
('Japan', 'Philippines'),
('U.S.A.', 'Canada'),
('U.S.S.R.', 'Israel'),
('England', 'Ireland'),
('Germany', 'Austria'),
('U.S.S.R.', 'France'),
('Belgium', 'Luxembourg'),
('U.S.A.', 'U.S.S.R.'),
('China', 'North Korea'),
('India', 'Sri Lanka'),
('U.S.A.', 'France'),
('U.S.S.R.', 'Cuba'),
('England', 'Jordan'),
('France', 'Israel'),
('U.S.A.', 'Germany'),
('U.S.S.R.', 'Syria'),
('France', 'Algeria')]

questions_order_1 = {}
questions_order_2 = {}

questions_order_similar_to_1 = {}
questions_order_similar_to_2 = {}

questions_order_similar_degree_1 = {}
questions_order_similar_degree_2 = {}


for country1, country2 in similarity_effect_country_pairs:
    order_1 = f'{country1}-{country2}'
    order_2 = f'{country2}-{country1}'
    questions_order_1[order_1] = f"On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country1} and {country2}? Shape: score: int"
    questions_order_2[order_2] = f"On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country2} and {country1}? Shape: score: int"
    questions_order_similar_to_1[order_1] = f"On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country1} similar to {country2}? Shape: score: int"
    questions_order_similar_to_2[order_2] = f"On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country2} similar to {country1}? Shape: score: int"

    questions_order_similar_degree_1[order_1] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country1} similar to {country2}?
    Please reason in 100 words about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""
    questions_order_similar_degree_2[order_2] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country2} similar to {country1}? 
    Please reason about your score before providing it.
    Structure your response in the following json format:
    Reasoning: [...]
    score: int"""

# print(questions_order_similar_degree_2)


In [5]:
results_dict_columns = {
    'country_pair': '',
    'prompt_style': 'cot',
    'model_name': '',
    'temperature': '',
    'sim_score_1': [],
    'sim_score_2': [],
    'sim_diff': [],
    'cot_reasoning_1': '',
    'cot_reasoning_2': '',
    'p-values': []
}
# Define the file path
file_path = './results/results_cot_prompt_similar_degree.csv'

# Check if the file exists
if os.path.isfile(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
else:
    # Create an empty DataFrame from the dictionary
    df = pd.DataFrame(columns=results_dict_columns)

# Print the DataFrame
print(df)

          country_pair prompt_style  model_name  temperature  sim_score_1  \
0        U.S.A.-Mexico          cot   llama3_8B        0.001         15.0   
1      U.S.S.R.-Poland          cot   llama3_8B        0.001         12.0   
2        China-Albania          cot   llama3_8B        0.001          4.0   
3        U.S.A.-Israel          cot   llama3_8B        0.001         12.0   
4    Japan-Philippines          cot   llama3_8B        0.001          8.0   
..                 ...          ...         ...          ...          ...   
142     England-Jordan          cot  llama2_70B        0.001          6.0   
143      France-Israel          cot  llama2_70B        0.001         12.0   
144     U.S.A.-Germany          cot  llama2_70B        0.001         18.0   
145     U.S.S.R.-Syria          cot  llama2_70B        0.001          NaN   
146     France-Algeria          cot  llama2_70B        0.001          NaN   

     sim_score_2  sim_diff                                    cot_reasoning

In [6]:
def create_chain(prompt, model, order, model_name, temperature):
    return (prompt | model | StrOutputParser()).with_config({
"metadata": {
    'country-pair-order': order,
    'model_name':model_name,
    'temperature': temperature,
}}
)

def get_output(prompt, model, order, model_name, temperature, ques):
    chain = create_chain(prompt, model, order, model_name, temperature)
    return chain.invoke({"text": ques})

def parse_numeric_output(raw_output):
    match = re.search(r'\d+', raw_output)
    # print(match, match.group())
    if match:
        return match.group()
    return None

def parse_cot_json_output(raw_output):
    raw_output = raw_output.replace("\n", " ")
    # json_dict = json.loads(raw_output)
    score = reasoning = None
    try:
        json_dict = json.loads(raw_output)
    except:
        print(f'error in parsing raw output to json. output is {raw_output}')
        return reasoning, score, True

    if 'score' in json_dict.keys():
        score = json_dict['score']

    if 'Reasoning' in json_dict.keys():
        reasoning = json_dict['Reasoning']

    return reasoning, score, False

    

In [7]:
models = [ 
    'mistral_7B',
 'llama2_7B',
  'llama3_8B',
 'llama2_13B',
  'gpt-3.5-turbo',
 'gpt-4'
]
models = ['llama3_70B']
for model_name in models:
    model_type = model_name_type_mapping[model_name]
    results_dict_columns['model_name'] = model_name
    for temperature in temperatures[:1]:
        model = initialise_models(model_name, model_type, temperature)
        results_dict_columns['temperature'] = temperature
        for order_1, order_2 in zip(questions_order_similar_to_1, questions_order_similar_to_2):
            results_dict_columns['country_pair'] = order_1
            ques_1 = questions_order_similar_degree_1[order_1]
            ques_2 = questions_order_similar_degree_2[order_2]
            output_1 = get_output(prompt, model, order_1, model_name, temperature, ques_1)
            output_2 = get_output(prompt, model, order_2, model_name, temperature, ques_2)
            # print(output_1)
            reasoning_1, sim_score_1, error_1 = parse_cot_json_output(output_1)
            reasoning_2, sim_score_2, error_2 = parse_cot_json_output(output_2)
            # if error_1 or error_2:
            #     df.to_csv(file_path, index=False, mode='w')
            if sim_score_1!=None and sim_score_2!=None:
                sim_diff = sim_score_1 - sim_score_2
            else:
                sim_diff = None
            print(f'for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}, output1: {output_1}, output2: {output_2}')
            results_dict_columns['sim_score_1'] = sim_score_1
            results_dict_columns['sim_score_2'] = sim_score_2
            results_dict_columns['sim_diff'] = sim_diff
            results_dict_columns['cot_reasoning_1'] = reasoning_1
            results_dict_columns['cot_reasoning_2'] = reasoning_2

            df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])
            # del model
        # print('model deleted..')
        # gc.collect()
        # torch.cuda.empty_cache()
                
                

[1mOllama[0m
Params: {'model': 'llama3:70b-instruct-fp16', 'format': 'json', 'options': {'mirostat': None, 'mirostat_eta': None, 'mirostat_tau': None, 'num_ctx': None, 'num_gpu': -1, 'num_thread': None, 'num_predict': 250, 'repeat_last_n': None, 'repeat_penalty': None, 'temperature': 0.001, 'stop': None, 'tfs_z': None, 'top_k': None, 'top_p': None}, 'system': None, 'template': None, 'keep_alive': None}
for Model_name: llama3_70B, Pair: Mexico-U.S.A., Temperature: 0.001, output1: {
"Reasoning": "Assessing the similarity between the U.S.A. and Mexico is a complex task, as both countries share some cultural and historical ties but also have distinct differences. Both nations are located in North America, share a border, and have a significant amount of trade and tourism exchange. They also share some cultural similarities, such as a strong emphasis on family and community. However, they have different languages, with English being predominant in the U.S.A. and Spanish in Mexico. Additio

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


error in parsing raw output to json. output is { "Reasoning": "Albania and China are two countries with distinct cultural, historical, and economic backgrounds. Albania is a small country located in Southeastern Europe, while China is a large country in East Asia. They have different languages, religions, and customs. Albania has a Mediterranean climate, whereas China has a diverse range of climates across its vast territory.  However, both countries share some similarities. Both Albania and China have a rich cultural heritage, with ancient civilizations and historical landmarks such as the Butrint amphitheater in Albania and the Great Wall of China. They also share a similar experience of having been isolated from the rest of the world for a significant period, with Albania being under communist rule until 1990 and China being closed off to the world until its economic reforms in the late 1970s.  In terms of economy, both countries have experienced rapid growth in recent years, althou

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


error in parsing raw output to json. output is { "Reasoning": "I assessed the similarity between Israel and the USA by considering various factors such as political systems, economies, cultures, histories, and geographic locations. Both countries are democracies with a strong emphasis on individual freedoms and human rights. They share similar economic systems, with a mix of private enterprise and government regulation. Additionally, both countries have a significant service sector and are leaders in technological innovation.  Culturally, Israel and the USA share many similarities, with a strong emphasis on education, family values, and community involvement. Both countries also have a diverse population with a significant immigrant presence. Historically, both countries were founded by pioneers who sought to create a new nation based on democratic principles.  However, there are also some significant differences between the two countries. Israel is a much smaller country with a more h

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_70B, Pair: Philippines-Japan, Temperature: 0.001, output1: {
"Reasoning": "Japan and the Philippines share some cultural and historical similarities, but they also have many differences. Both countries are archipelagos with a strong emphasis on family and social hierarchy. They also share a history of colonialism, with Japan having been occupied by the US after WWII and the Philippines having been a US territory from 1898 to 1946. However, their languages, cuisines, and religious beliefs differ significantly. Japan is a highly industrialized country with a distinct culture shaped by its isolationist past, while the Philippines is a developing country with a more Western-influenced culture. Considering these factors, I would rate their similarity as moderate.",
"score": 12
}, output2: {
"Reasoning": "The Philippines and Japan share some cultural and historical similarities, but they also have many differences. Both countries are archipelagos with a strong emphasis

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_70B, Pair: Austria-Germany, Temperature: 0.001, output1: {
"Reasoning": "Germany and Austria share a rich cultural heritage, having been part of the same empire (Austro-Hungarian Empire) until World War I. They both speak German as an official language, with many dialects being mutually intelligible. The two countries also have similar customs, traditions, and architectural styles, particularly in the Alpine regions. Additionally, they share a border and have strong economic ties. However, Austria has a distinct identity shaped by its history of imperial grandeur, whereas Germany's identity is more complex due to its tumultuous 20th-century past.",
"score": 16
}, output2: {
"Reasoning": "Austria and Germany share many cultural, historical, and linguistic similarities due to their common Germanic heritage and geographic proximity. Both countries were part of the Holy Roman Empire and later the Austro-Hungarian Empire, which has resulted in similar architectural st

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_70B, Pair: Luxembourg-Belgium, Temperature: 0.001, output1: {
"Reasoning": "Belgium and Luxembourg share many cultural, historical, and geographical similarities. Both countries are located in Western Europe, share a border, and have similar languages (Dutch, French, and German). They also have similar economic systems, with strong service sectors and high standards of living. Additionally, both countries have a rich history, with many medieval castles and fortifications. However, they also have some differences, such as Luxembourg's unique multilingualism and its independent grand duchy status. Considering these factors, I would rate their similarity as 16 out of 20.",
"score": 16
}, output2: {
"Reasoning": "Luxembourg and Belgium share many cultural, historical, and geographical similarities. Both countries are located in Western Europe, share a border, and have a similar climate. They also have a shared history, with both being part of the Benelux economic uni

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


error in parsing raw output to json. output is { "Reasoning": "Jordan and England are two countries with distinct cultural, historical, and geographical backgrounds. However, they do share some similarities. Both countries have a rich history, with Jordan being home to ancient cities like Petra and England having a plethora of medieval castles and historic landmarks. They also both have a strong sense of national identity and pride in their heritage.  In terms of politics, both countries are monarchies, although England is part of the UK whereas Jordan is an independent kingdom. Additionally, both countries have a relatively high standard of living and are considered to be upper-middle-income economies.  However, there are also significant differences between the two countries. Jordan is a predominantly Arab and Muslim country, whereas England is a predominantly Christian country with a more diverse population. The climate, geography, and natural resources of the two countries are also

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


error in parsing raw output to json. output is { "Reasoning": "Assessing the similarity between Israel and France requires considering various aspects such as culture, history, politics, economy, and geography. While both countries share some commonalities, they also have distinct differences.  Similarities:  * Both Israel and France are democracies with a strong emphasis on human rights and freedoms. * They share a rich cultural heritage, with Israel being the birthplace of Judaism and France having a significant Jewish population. * Both countries have a high standard of living and are considered developed economies. * They both have a strong focus on education, innovation, and technology.  Differences:  * Geography: Israel is a small country in the Middle East, while France is a large country in Western Europe. This difference significantly impacts their climate, natural resources, and strategic interests. * History: Israel has a complex and tumultuous history, with many conflicts a

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_70B, Pair: Germany-U.S.A., Temperature: 0.001, output1: {
"Reasoning": "Assessing the similarity between the U.S.A. and Germany is a complex task, as both countries have unique cultural, historical, and economic characteristics. However, they also share some commonalities. Both are developed economies with strong democratic institutions, a high standard of living, and a significant presence in global affairs. They also share a commitment to human rights, freedom of speech, and the rule of law. Additionally, both countries have a rich cultural heritage, with a strong emphasis on education, science, and innovation. Nevertheless, there are also significant differences, such as language, history, and cuisine, that set them apart.",
"score": 14
}, output2: {
"Reasoning": "Germany and the USA share many cultural, economic, and political similarities due to their shared Western values and historical ties. Both countries are developed democracies with a strong emphasis o

  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_70B, Pair: Algeria-France, Temperature: 0.001, output1: {
"Reasoning": "France and Algeria share a complex history, with Algeria being a French colony from 1830 to 1962. This colonial past has left a lasting impact on Algerian culture, language, and politics. Many Algerians speak French as a second language, and French is still an official language in Algeria. Additionally, both countries have similar Mediterranean climates and share some cultural practices, such as a strong emphasis on family and hospitality. However, Algeria has also been influenced by its Arab and Berber heritage, which sets it apart from France. Furthermore, the two countries have distinct political systems and economies.",
"score": 12
}, output2: {
"Reasoning": "Algeria and France share a complex history, with Algeria being a French colony from 1830 to 1962. As a result, there are significant cultural, linguistic, and architectural similarities between the two countries. Many Algerians speak

In [8]:
df.to_csv(file_path, index=False, mode='w')

In [9]:
del model
print('model deleted..')
gc.collect()
torch.cuda.empty_cache()

model deleted..
