In [None]:
import requests, uuid, json
import random
from tqdm import tqdm 
from api_keys import msft_key 

In [None]:
# Translator credentials and service endpoint.
key = msft_key
endpoint = "https://api.cognitive.microsofttranslator.com"

# Region ("location") of the Translator resource. It is required when using a
# multi-service or regional (not global) resource, and is listed in the Azure
# portal on the Keys and Endpoint page.
location = "eastus"

# Full URL of the translate operation.
path = '/translate'
constructed_url = f"{endpoint}{path}"

def set_header(country=None):
    """Build the query parameters and HTTP headers for a translate request.

    The source language is always English ('en').

    Args:
        country: Optional country name. One of "United States" (or
            "the United States", the spelling used elsewhere in this
            notebook), "India", "China", "Iran", "Kenya". When given, the
            request targets only that country's language. When omitted or
            falsy, the request targets every supported language except the
            source language.

    Returns:
        A ``(params, headers)`` tuple. ``params['to']`` is a single language
        code when ``country`` is given, otherwise a list of language codes.

    Raises:
        KeyError: If ``country`` is not one of the supported names.
    """
    country_lang_mapping = {
        "United States": "en",
        # Alias so the country names used by the rest of the notebook work here too.
        "the United States": "en",
        "India": 'hi',
        "China": 'zh-Hans',
        "Iran": 'fa',
        "Kenya": 'sw',
    }

    lang_list = ['en', 'zh-Hans', "hi", "fa", "sw"]
    from_lang = 'en'

    if country:
        to_lang = country_lang_mapping[country]
    else:
        # Build the target list explicitly instead of relying on `to_lang`
        # aliasing a list that is mutated afterwards.
        to_lang = [lang for lang in lang_list if lang != from_lang]

    params = {
        'api-version': '3.0',
        'from': from_lang,
        'to': to_lang,
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        # location required if you're using a multi-service or regional (not global) resource.
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        # Fresh trace id per call to set_header(); it is reused by every
        # request that shares this headers dict.
        'X-ClientTraceId': str(uuid.uuid4()),
    }
    return params, headers


def get_assertions(num_samples=1000,
                   path="../data/genericskb/genericskb_assertion.txt",
                   seed=42):
    """Draw a reproducible random sample of assertions from a text file.

    Args:
        num_samples: Number of assertions to return.
        path: Text file with one assertion per line. Defaults to the
            GenericsKB assertion file used by this notebook.
        seed: Seed applied to the global ``random`` generator before
            sampling, so repeated calls return the same sample.

    Returns:
        A list of ``num_samples`` whitespace-stripped lines. Sampling uses
        ``random.choices`` and is therefore done WITH replacement: the same
        assertion can appear more than once, and ``num_samples`` may exceed
        the number of lines in the file.

    Raises:
        IndexError: If the file is empty and ``num_samples`` is positive.
    """
    # Read explicitly as UTF-8 (like every other file in this notebook) so the
    # result does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = [line.strip() for line in f]

    # NOTE: this reseeds the module-level RNG, as the original code did.
    random.seed(seed)
    return random.choices(data, k=num_samples)


def translate(params, headers, text, timeout=30):
    """Send one text to the translate endpoint and return the parsed JSON reply.

    Args:
        params: Query parameters for the request (as built by ``set_header``).
        headers: HTTP headers for the request (as built by ``set_header``).
        text: The string to translate.
        timeout: Seconds to wait for the service before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            notebook indefinitely.

    Returns:
        The decoded JSON body of the response, returned as-is. The HTTP status
        is not checked here, so callers should inspect the payload before
        indexing into it.
    """
    body = [{'text': text}]
    reply = requests.post(
        constructed_url,
        params=params,
        headers=headers,
        json=body,
        timeout=timeout,
    )
    return reply.json()


In [None]:
# No country filter: target every supported language other than English.
params, headers = set_header(country=None)
# Assertions that will be sent for translation.
data = get_assertions(num_samples=500)


In [None]:
# Translate every sampled assertion. Each element of `translated_assertions`
# is the list of {"text", "to"} translations returned for one assertion, with
# the original English sentence appended as the 'en' entry.
translated_assertions = []
for assertion in tqdm(data):
    # Reuse the notebook's helper instead of duplicating the request code.
    response = translate(params, headers, assertion)

    # A successful reply is a non-empty list whose first item holds the
    # translations; anything else (e.g. an error object from the service) is
    # reported explicitly instead of surfacing as an opaque KeyError/TypeError.
    if (not isinstance(response, list) or not response
            or 'translations' not in response[0]):
        raise RuntimeError(
            "Translation failed for {!r}: {!r}".format(assertion, response)
        )

    translated_assertions.append(
        response[0]['translations'] + [{"text": assertion, "to": 'en'}]
    )

In [None]:
# Build the base samples, shaped as {lang: {country: [{prompt, answer}, ...]}}.
# Task1: Verify if the assertion is correct in a certain culture.
# Task2: Let the model select the plausible country for the assertion.
country_lang_mapping = {
    "the United States": "en",
    "India": 'hi',
    "China": 'zh-Hans',
    "Iran": 'fa',
    "Kenya": 'sw',
}
# Reverse lookup: language code -> country name.
lang_country_mapping = dict(
    (code, name) for name, code in country_lang_mapping.items()
)

# One empty sample list per (language, country) pair.
inputs_lang_country = {
    code: {name: []} for code, name in lang_country_mapping.items()
}

# File every translation under its language; the answer is filled in later.
for translations in translated_assertions:
    for entry in translations:
        code = entry['to']
        bucket = inputs_lang_country[code][lang_country_mapping[code]]
        bucket.append({"prompt": entry['text'], "answer": ""})


with open("../data/genericskb/inputs_lang_country_base.json", 'w', encoding='utf-8') as out_file:
    json.dump(inputs_lang_country, out_file, ensure_ascii=False, indent=4)



## Add prompts for the assertion-verification task

In [None]:
# Sanity check: display the first English sample built above.
inputs_lang_country['en']['the United States'][0]

In [None]:
# Translate the English label 'True' into every non-English target language
# (set_header() without a country selects all of them).
# NOTE(review): presumably the source of the `prompt_answer` strings in the
# next cell -- confirm.
params, headers = set_header() 
translate(params, headers, 'True')

In [None]:
# Prompt pieces for the verification task, keyed by language code.

# Question appended after each assertion; every entry names the country paired
# with that language. The 'en' entry is not read by the verification cell
# later in the notebook, which formats its own English question per country.
prompt_task = {'zh-Hans':"这在中国是真是假？",
        'en':'Is this true or false in the United States?',
        'hi': 'भारत में यह सच है या झूठ?',
        'fa': 'ایا این در ایران درست است یا غلط؟',
        'sw': 'Je, hii ni kweli au ni uongo nchini Kenya?',
        }

# Instruction asking for the answer in a JSON format.
prompt_format = {'zh-Hans':"以 {“答案”： } 的 json 格式回答。",
            'en':"Answer in the json format of {\"Answer\": }.",
            'hi':"{\"उत्तर\": } के json स्वरूप में उत्तर दें।",
            'fa': "پاسخ در قالب json {\"پاسخ\": }.",
            'sw':"Jibu katika umbizo la json la {\"Answer\": }.",
        }

# Answer label stored with each sample in the given language.
# NOTE(review): these look like translations of "True" -- confirm the wording
# with native speakers.
prompt_answer = {'zh-Hans':'真',
            'en':"True",
            'hi':'सच्चा',
            'fa': 'واقعی',
            'sw': 'Kweli',
        }


In [None]:
# Reload the base samples written earlier in the notebook.
with open("../data/genericskb/inputs_lang_country_base.json", 'r', encoding='utf-8') as base_file:
    inputs_lang_country_base = json.load(base_file)


# English prompts: the English assertions are asked about every country, and
# the stored answer is always "True".
inputs_lang_country = {
    'en': {
        country: [
            {
                'prompt': item['prompt'] + " Is this True or False in {}? ".format(country) + prompt_format['en'],
                'answer': "True",
            }
            for item in inputs_lang_country_base['en']['the United States']
        ]
        for country in country_lang_mapping
    }
}

# Native-language prompts: each non-English language is paired only with its
# own country, using that language's question, format hint and answer label.
for country, lang in country_lang_mapping.items():
    if lang != 'en':
        inputs_lang_country[lang] = {
            country: [
                {
                    'prompt': item['prompt'] + ' ' + prompt_task[lang] + ' ' + prompt_format[lang],
                    'answer': prompt_answer[lang],
                }
                for item in inputs_lang_country_base[lang][country]
            ]
        }


In [None]:
# Persist the verification prompts as UTF-8 JSON, keeping non-ASCII text readable.
verification_path = "../data/genericskb/inputs_lang_country_verification.json"
with open(verification_path, 'w', encoding='utf-8') as out_file:
    json.dump(inputs_lang_country, out_file, ensure_ascii=False, indent=4)

## Add prompts for the country-association task

In [None]:
# Translate the country name "Kenya", reusing the `params`/`headers` built by
# an earlier cell.
# NOTE(review): presumably used to write the country lists in the prompts
# below -- confirm.
translate(params, headers, "Kenya")

In [None]:
# Prompt pieces for the association task, keyed by language code. These rebind
# the names used for the verification task earlier in the notebook.

# Question asking which of the five listed countries the preceding sentence
# describes.
prompt_task = {'zh-Hans':'上一句描述了在以下哪个国家/地区发生的事情？伊朗、中国、美国、印度、肯尼亚。',
        'en':"The previous sentence describes things that happen in which one of the following countries? Iran, China, the United States, India, Kenya.",
        'hi': 'पिछला वाक्य निम्नलिखित में से किस एक देश में होने वाली चीजों का वर्णन करता है? ईरान, चीन, संयुक्त राज्य अमेरिका, भारत, केन्या।',
        'fa': 'جمله قبلی چیزهایی را توصیف می کند که در کدام یک از کشورهای زیر اتفاق می افتد؟ ایران، چین، ایالات متحده، هند، کنیا.',
        'sw': 'Sentensi ya awali inaelezea mambo ambayo hutokea katika moja ya nchi zifuatazo? Iran, China, Marekani, India, Kenya.',
        }

# Instruction asking for the answer in a JSON format.
prompt_format = {'zh-Hans':"以 {“答案”： } 的 json 格式回答。",
            'en':"Answer in the json format of {\"Answer\": }.",
            'hi':"{\"उत्तर\": } के json स्वरूप में उत्तर दें।",
            'fa': "پاسخ در قالب json {\"پاسخ\": }.",
            'sw':"Jibu katika umbizo la json la {\"Answer\": }.",
        }

# Answer stored with each sample: the name of the country paired with the
# language.
prompt_answer = {'zh-Hans':'中国',
            'en':"the United States",
            'hi':'भारत',
            'fa': 'ایران',
            'sw': 'Kenya',
        }


In [None]:
# Reload the base samples written earlier in the notebook.
with open("../data/genericskb/inputs_lang_country_base.json", 'r', encoding='utf-8') as base_file:
    inputs_lang_country_base = json.load(base_file)


inputs_lang_country = {'en': {}}

# Association prompts: every language (English included) is paired with its
# own country, and the expected answer is that country's name.
for country, lang in country_lang_mapping.items():
    samples = []
    for item in inputs_lang_country_base[lang][country]:
        samples.append({
            'prompt': ' '.join([item['prompt'], prompt_task[lang], prompt_format[lang]]),
            'answer': prompt_answer[lang],
        })
    inputs_lang_country[lang] = {country: samples}

with open("../data/genericskb/inputs_lang_country_association.json", 'w', encoding='utf-8') as out_file:
    json.dump(inputs_lang_country, out_file, ensure_ascii=False, indent=4)
