In [2]:
import os
import re
import gc
import json
import torch
import pandas as pd
from getpass import getpass
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama, HuggingFaceHub, HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
torch.cuda.empty_cache()
gc.collect()
# print(GPUtil.showUtilization())
# OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = 'sk-proj-Y2nqeGoOCxTZnITAzuFdT3BlbkFJf6Rm2tmkcssXIov3PMFQ'
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'
os.environ['LANGCHAIN_TRACING_V2']="true"
os.environ['LANGCHAIN_ENDPOINT']="https://api.smith.langchain.com"
os.environ['LANGCHAIN_API_KEY']="ls__93636df794f14ccba7162354d46779d8"
os.environ['LANGCHAIN_PROJECT']="LLM_Context_Effects"

In [3]:
model_hf_repo_id_mapping = {
    'mistral_7B': "mistralai/Mistral-7B-Instruct-v0.2",
    'llama2_7B':'meta-llama/Llama-2-7b-chat-hf',
    'llama2_13B': 'meta-llama/Llama-2-13b-chat-hf',
    'llama2_70B': 'meta-llama/Llama-2-70b-chat-hf',
    'llama3_8B':'meta-llama/Meta-Llama-3-8B-Instruct',
    'llama3_70B': 'meta-llama/Meta-Llama-3-70B-Instruct'
}

model_ollama_id_mapping = {
    'mistral_7B': "mistral:7b-instruct-fp16",
    'llama2_7B': 'llama2:7b-chat-fp16',
    'llama2_13B': 'llama2:13b-chat-fp16',
    'llama2_70B': 'llama2:70b-chat-fp16',
    'llama3_8B':'llama3:8b-instruct-fp16',
    'llama3_70B': 'llama3:70b-instruct-fp16'
}

model_name_type_mapping={
    'gpt-3.5-turbo': 'openai',
    'gpt-4': 'openai',
    'mistral_7B': 'open-source',
    'llama2_7B': 'open-source',
    'llama2_13B': 'open-source',
    'llama2_70B': 'open-source',
    'llama3_8B': 'open-source',
    'llama3_70B': 'open-source',
}

def initialise_openai_models(model_name, temperature):
    model = ChatOpenAI(model=model_name, api_key=os.environ["OPENAI_API_KEY"], temperature=temperature, max_tokens=200)
    return model

def initialise_open_source_models_transformers(model_name, temperature):
    # Use a pipeline as a high-level helper
    repo_id = model_hf_repo_id_mapping[model_name]
    pipe = pipeline("text-generation",
                    model=repo_id,
                    token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
                    device_map = "sequential", max_new_tokens = 10,
                    do_sample = True,
                    return_full_text = False,
                    temperature = temperature,
                    top_k = 50,
                    top_p = 0.9)
    return  HuggingFacePipeline(pipeline=pipe)

def initialise_open_source_models_ollama(model_name, temperature):
    ollama_id = model_ollama_id_mapping[model_name]
    model = Ollama(base_url='http://localhost:11434',
    model=ollama_id, temperature = temperature, num_predict = 10, format = 'json', num_gpu=-1)
    print(model)
    return model


def initialise_models(model_name = 'mistral_7B', model_type = 'openai', temperature= 0.0):
    if model_type == 'openai':
        return initialise_openai_models(model_name, temperature)
    else:
        return initialise_open_source_models_ollama(model_name, temperature)


In [4]:
template = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided.
Text: {text}
"""
modified_template = """Answer the following question to the best of your knowledge.  Your answer should be a json of shape provided.
                                Text: {text}
                                Please provide an integer score.
                     """
template_cot = """Answer the following question to the best of your knowledge. Your answer should be a json of shape provided. Also, mention how you arrived at the score.
Text: {text}
Lets think step by step.
"""
prompt = ChatPromptTemplate.from_template(template)
modified_prompt = ChatPromptTemplate.from_template(modified_template)
prompt_cot = ChatPromptTemplate.from_template(template_cot)

In [29]:
temperatures = [0.001, 0.5, 1.0, 1.5]
similarity_effect_country_pairs = [
('U.S.A.', 'Mexico'),
('U.S.S.R.', 'Poland'),
('China', 'Albania'),
('U.S.A.', 'Israel'),
('Japan', 'Philippines'),
('U.S.A.', 'Canada'),
('U.S.S.R.', 'Israel'),
('England', 'Ireland'),
('Germany', 'Austria'),
('U.S.S.R.', 'France'),
('Belgium', 'Luxembourg'),
('U.S.A.', 'U.S.S.R.'),
('China', 'North Korea'),
('India', 'Sri Lanka'),
('U.S.A.', 'France'),
('U.S.S.R.', 'Cuba'),
('England', 'Jordan'),
('France', 'Israel'),
('U.S.A.', 'Germany'),
('U.S.S.R.', 'Syria'),
('France', 'Algeria')]

SSA_order_1 = {}
SSA_order_2 = {}

SST_order_1 = {}
SST_order_2 = {}

SSD_order_1 = {}
SSD_order_2 = {}


for country1, country2 in similarity_effect_country_pairs:
    order_1 = f'{country1}-{country2}'
    order_2 = f'{country2}-{country1}'
    SSA_order_1[order_1] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country1} and {country2}?
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""


    SSA_order_2[order_2] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, how similar are {country2} and {country1}?
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""

    SST_order_1[order_1] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country1} similar to {country2}?
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""


    SST_order_2[order_2] = f"""
    On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, is {country2} similar to {country1}?
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""

    SSD_order_1[order_1] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country1} is similar to {country2}?
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""
    SSD_order_2[order_2] = f"""On a scale of 0 to 20, where 0 means no similarity and 20 means complete similarity, assess the degree to which {country2} is similar to {country1}? 
    Please rate overall similarity and base your judgement on the following factors:
    1. Economy
    2. Politics and Governance
    3. Culture, Religion and Ethnicity
    4. History
    5. Geography and Borders
    6. International Relations and Influence
    7. Defense and Military Conflict

    Structure your response in the following json format:
    score: int"""

# print(questions_order_similar_degree_2)


In [30]:
prompt_style_template_mapping = {
    'SD': {'order_1': SSD_order_1, 'order_2': SSD_order_2},
    'ST':{'order_1': SST_order_1, 'order_2': SST_order_2},
    'SA':{'order_1': SSA_order_1, 'order_2': SSA_order_2}
}

In [31]:
results_dict_columns = {
    'country_pair': '',
    'prompt_style': '',
    'model_name': '',
    'temperature': '',
    'sim_score_1': [],
    'sim_score_2': [],
    'sim_diff': [],
}
# Define the file path
file_path = './results/results_single_factor_based.csv'

# Check if the file exists
if os.path.isfile(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
else:
    # Create an empty DataFrame from the dictionary
    df = pd.DataFrame(columns=results_dict_columns)

# Print the DataFrame
print(df)

Empty DataFrame
Columns: [country_pair, prompt_style, model_name, temperature, sim_score_1, sim_score_2, sim_diff]
Index: []


In [32]:
def create_chain(prompt, model, order, model_name, temperature):
    return (prompt | model | StrOutputParser()).with_config({
"metadata": {
    'country-pair-order': order,
    'model_name':model_name,
    'temperature': temperature,
}}
)

def get_output(prompt, model, order, model_name, temperature, ques):
    chain = create_chain(prompt, model, order, model_name, temperature)
    return chain.invoke({"text": ques})

def parse_numeric_output(raw_output):
    match = re.search(r'\d+', raw_output)
    # print(match, match.group())
    if match:
        return match.group()
    return None

def parse_cot_json_output(raw_output):
    raw_output = raw_output.replace("\n", " ")
    # json_dict = json.loads(raw_output)
    score = reasoning = None
    try:
        json_dict = json.loads(raw_output)
    except:
        print(f'error in parsing raw output to json. output is {raw_output}')
        return reasoning, score, True

    if 'score' in json_dict.keys():
        score = json_dict['score']

    if 'Reasoning' in json_dict.keys():
        reasoning = json_dict['Reasoning']

    return reasoning, score, False

    

In [33]:
settings = [('llama3_8B', '0.001', 'SST'), ('llama3_8B', '0.5', 'SST'), ('gpt-4', '0.5', 'SSD') ]
for setting in settings:
    model_name = setting[0]
    temperature = float(setting[1])
    model_type = model_name_type_mapping[model_name]
    results_dict_columns['model_name'] = model_name
    model = initialise_models(model_name, model_type, temperature)
    # print(f'model is {model_name}, temperature is {temperature}')
    results_dict_columns['temperature'] = temperature
    prompt_context = setting[2][0]
    prompt_style = ''.join(setting[2][1:])
    # print(prompt_context)
    # print(prompt_style)
    results_dict_columns['prompt_style'] = prompt_context
    questions_1 = prompt_style_template_mapping[prompt_style]['order_1']
    questions_2 = prompt_style_template_mapping[prompt_style]['order_2']
    for order_1, order_2 in zip(questions_1, questions_2):
        # print(f'order_1: {order_1}')
        results_dict_columns['country_pair'] = order_1
        ques_1 = questions_1[order_1]
        ques_2 = questions_2[order_2]
        # print(f'ques_1: {ques_1}')
        output_1 = get_output(prompt, model, order_1, model_name, temperature, ques_1)
        output_2 = get_output(prompt, model, order_2, model_name, temperature, ques_2)
        parsed_output_1 = parse_numeric_output(output_1)
        if  parsed_output_1:
                sim_score_1 = int(parsed_output_1)
        else:
            output_1 = get_output(modified_prompt, model, order_1, model_name, temperature, ques_1)
            parsed_output_1 = parse_numeric_output(output_1)
            if  parsed_output_1:
                sim_score_1 = int(parsed_output_1)
            else:
                sim_score_1 = None
                print(f' cannot parse output {output_1} for Model_name: {model_name}, Pair: {order_1}, Temperature: {temperature}')
        parsed_output_2 = parse_numeric_output(output_2)
        if parsed_output_2:
            sim_score_2 = int(parsed_output_2)
            
        else:
            output_2 = get_output(modified_prompt, model, order_2, model_name, temperature, ques_2)
            parsed_output_2 = parse_numeric_output(output_2)
            if  parsed_output_2:
                sim_score_2 = int(parsed_output_2)
            else:
                sim_score_2 = None
                print(f' cannot parse output {output_2} for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}')
        if sim_score_1!=None and sim_score_2!=None:
            sim_diff = sim_score_1 - sim_score_2
        else:
            sim_diff = None
        print(f'for Model_name: {model_name}, Pair: {order_1}, Temperature: {temperature}, output1: {output_1}, output2: {output_2}')
        results_dict_columns['sim_score_1'] = sim_score_1
        results_dict_columns['sim_score_2'] = sim_score_2
        results_dict_columns['sim_diff'] = sim_diff
        df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


[1mOllama[0m
Params: {'model': 'llama3:8b-instruct-fp16', 'format': 'json', 'options': {'mirostat': None, 'mirostat_eta': None, 'mirostat_tau': None, 'num_ctx': None, 'num_gpu': -1, 'num_thread': None, 'num_predict': 10, 'repeat_last_n': None, 'repeat_penalty': None, 'temperature': 0.001, 'stop': None, 'tfs_z': None, 'top_k': None, 'top_p': None}, 'system': None, 'template': None, 'keep_alive': None}
for Model_name: llama3_8B, Pair: U.S.A.-Mexico, Temperature: 0.001, output1: {
"score": 12
}, output2: {
"score": 12
}


  df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])


for Model_name: llama3_8B, Pair: U.S.S.R.-Poland, Temperature: 0.001, output1: {
"score": 8
}, output2: {
"score": 2
}
for Model_name: llama3_8B, Pair: China-Albania, Temperature: 0.001, output1: {
"score": 2
}, output2: {
"score": 2
}
for Model_name: llama3_8B, Pair: U.S.A.-Israel, Temperature: 0.001, output1: {
"score": 8
}, output2: {
"score": 12
}
for Model_name: llama3_8B, Pair: Japan-Philippines, Temperature: 0.001, output1: {
"score": 8
}, output2: {
"score": 8
}
for Model_name: llama3_8B, Pair: U.S.A.-Canada, Temperature: 0.001, output1: {
"score": 14
}, output2: {
"score": 14
}
for Model_name: llama3_8B, Pair: U.S.S.R.-Israel, Temperature: 0.001, output1: {
"score": 2
}, output2: {
"score": 2
}
for Model_name: llama3_8B, Pair: England-Ireland, Temperature: 0.001, output1: {
"score": 12
}, output2: {
"score": 12
}
for Model_name: llama3_8B, Pair: Germany-Austria, Temperature: 0.001, output1: {
"score": 14
}, output2: {
"score": 12
}
for Model_name: llama3_8B, Pair: U.S.S.R.-Fra

  warn_deprecated(


for Model_name: gpt-4, Pair: U.S.A.-Mexico, Temperature: 0.5, output1: {
"score": {
"Economy": 8,
"Politics and Governance": 7,
"Culture, Religion and Ethnicity": 10,
"History": 12,
"Geography and Borders": 15,
"International Relations and Influence": 10,
"Defense and Military Conflict": 7
}
}, output2: {
    "score": {
        "Economy": 12,
        "Politics and Governance": 10,
        "Culture, Religion and Ethnicity": 8,
        "History": 14,
        "Geography and Borders": 18,
        "International Relations and Influence": 10,
        "Defense and Military Conflict": 9
    }
}
for Model_name: gpt-4, Pair: U.S.S.R.-Poland, Temperature: 0.5, output1: {
"score": 7
}, output2: {
"score": {
"Economy": 6,
"Politics and Governance": 5,
"Culture, Religion and Ethnicity": 7,
"History": 10,
"Geography and Borders": 8,
"International Relations and Influence": 5,
"Defense and Military Conflict": 6
}
}
for Model_name: gpt-4, Pair: China-Albania, Temperature: 0.5, output1: {
"score": {
"Ec

In [38]:
df.head(62)

Unnamed: 0,country_pair,prompt_style,model_name,temperature,sim_score_1,sim_score_2,sim_diff
0,U.S.A.-Mexico,S,llama3_8B,0.001,12,12,0
0,U.S.S.R.-Poland,S,llama3_8B,0.001,8,2,6
0,China-Albania,S,llama3_8B,0.001,2,2,0
0,U.S.A.-Israel,S,llama3_8B,0.001,8,12,-4
0,Japan-Philippines,S,llama3_8B,0.001,8,8,0
...,...,...,...,...,...,...,...
0,U.S.S.R.-Cuba,S,gpt-4,0.500,10,15,-5
0,England-Jordan,S,gpt-4,0.500,5,5,0
0,France-Israel,S,gpt-4,0.500,7,10,-3
0,U.S.A.-Germany,S,gpt-4,0.500,15,15,0


In [None]:
models = [ 
    'mistral_7B',
 'llama2_7B',
  'llama3_8B',
 'llama2_13B',
  'gpt-3.5-turbo',
 'gpt-4'
]
models = ['llama3_70B']
for model_name in models:
    model_type = model_name_type_mapping[model_name]
    results_dict_columns['model_name'] = model_name
    for temperature in temperatures[:1]:
        model = initialise_models(model_name, model_type, temperature)
        results_dict_columns['temperature'] = temperature
        for order_1, order_2 in zip(questions_order_similar_to_1, questions_order_similar_to_2):
            results_dict_columns['country_pair'] = order_1
            ques_1 = questions_order_similar_degree_1[order_1]
            ques_2 = questions_order_similar_degree_2[order_2]
            output_1 = get_output(prompt, model, order_1, model_name, temperature, ques_1)
            output_2 = get_output(prompt, model, order_2, model_name, temperature, ques_2)
            # print(output_1)
            reasoning_1, sim_score_1, error_1 = parse_cot_json_output(output_1)
            reasoning_2, sim_score_2, error_2 = parse_cot_json_output(output_2)
            # if error_1 or error_2:
            #     df.to_csv(file_path, index=False, mode='w')
            if sim_score_1!=None and sim_score_2!=None:
                sim_diff = sim_score_1 - sim_score_2
            else:
                sim_diff = None
            print(f'for Model_name: {model_name}, Pair: {order_2}, Temperature: {temperature}, output1: {output_1}, output2: {output_2}')
            results_dict_columns['sim_score_1'] = sim_score_1
            results_dict_columns['sim_score_2'] = sim_score_2
            results_dict_columns['sim_diff'] = sim_diff
            results_dict_columns['cot_reasoning_1'] = reasoning_1
            results_dict_columns['cot_reasoning_2'] = reasoning_2

            df = pd.concat([df, pd.DataFrame.from_dict([results_dict_columns])])
            # del model
        # print('model deleted..')
        # gc.collect()
        # torch.cuda.empty_cache()
                
                

In [8]:
df.to_csv(file_path, index=False, mode='w')

In [9]:
del model
print('model deleted..')
gc.collect()
torch.cuda.empty_cache()

model deleted..
