In [6]:
import json
import os
import re
import sys

from eventregistry import *
from eventregistry import EventRegistry
from openai import OpenAI
from tqdm import tqdm



In [7]:
# API credentials come from environment variables; os.getenv returns None
# when a variable is not set.
api_key = os.getenv("NEWSAPI_API_KEY")
# Event Registry client (created with allowUseOfArchive=False) used by the
# article-collection cells.
er = EventRegistry(apiKey=api_key, allowUseOfArchive=False)
# OpenAI client handed to the QnA and summary helpers.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

### English article collection from India

In [3]:
# Collect up to 100 English articles whose body contains "india", "election"
# and "2024", restricted to sources located in India, and save them as JSON.
lang = "eng"
topic = "India_election_2024"
query = {
  "$query": {
    "$and": [
      {
        "keyword": "india",
        "keywordLoc": "body"
      },
      {
        "keyword": "election",
        "keywordLoc": "body"
      },
      {
        "keyword": "2024",
        "keywordLoc": "body"
      },
      {
        "sourceLocationUri": "http://en.wikipedia.org/wiki/India"
      },
      {
        "lang": lang
      }
    ]
  },
  "$filter": {
    "isDuplicate": "skipDuplicates"
  }
}
q = QueryArticlesIter.initWithComplexQuery(query)
filename = lang+"_"+topic +".json"
# Remove the previous output before querying so that a failed run cannot
# leave an outdated file behind.
if os.path.exists(filename):
    os.remove(filename)

articles = list(
    q.execQuery(
        er,
        sortBy="date",
        maxItems=100,
        returnInfo=ReturnInfo(
            sourceInfo=SourceInfoFlags(location=True),
        ),
    )
)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(filename, "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=4)

### Hindi article collection from India

In [5]:
# Collect up to 100 Hindi articles whose body contains "india", "election"
# and "2024", restricted to sources located in India, and save them as JSON.
lang = "hin"
topic = "India_election_2024"
query = {
  "$query": {
    "$and": [
      {
        "keyword": "india",
        "keywordLoc": "body"
      },
      {
        "keyword": "election",
        "keywordLoc": "body"
      },
      {
        "keyword": "2024",
        "keywordLoc": "body"
      },
      {
        "sourceLocationUri": "http://en.wikipedia.org/wiki/India"
      },
      {
        "lang": lang
      }
    ]
  },
  "$filter": {
    "isDuplicate": "skipDuplicates"
  }
}
q = QueryArticlesIter.initWithComplexQuery(query)
filename = lang+"_"+topic +".json"
# Remove the previous output before querying so that a failed run cannot
# leave an outdated file behind.
if os.path.exists(filename):
    os.remove(filename)

articles = list(
    q.execQuery(
        er,
        sortBy="date",
        maxItems=100,
        returnInfo=ReturnInfo(
            sourceInfo=SourceInfoFlags(location=True),
        ),
    )
)

# The article text is written verbatim (ensure_ascii=False), so the file has
# to be UTF-8; the platform default encoding may be unable to represent it.
with open(filename, "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=4)

### OpenAI-related functions

In [6]:
def QnA(
    art,
    client,
    model,
    QnA_prompt,
    QnA_answer,
):
    """Put one multiple-choice question about an article to a chat model.

    Args:
        art: Mapping with string values under "title" and "body".
        client: Object exposing ``chat.completions.create`` (the OpenAI
            client in this notebook).
        model: Model name forwarded to the completion call.
        QnA_prompt: Question text, placed before the article.
        QnA_answer: Answer options, appended directly after the fixed
            "Please choose from the options below:" line.

    Returns:
        The message content of the first choice, or the fixed string
        "Sorry, error from GPT." when the completion call raises.
    """
    closing_line = "Please choose from the options below:" + QnA_answer
    user_text = "\n".join([QnA_prompt, art["title"], art["body"], closing_line])
    conversation = [{"role": "user", "content": user_text}]

    try:
        reply = client.chat.completions.create(
            model=model, messages=conversation, temperature=0
        )
        return reply.choices[0].message.content
    except Exception as e:  # best effort: report the failure and keep going
        print(f"Error: {e}")
        return "Sorry, error from GPT."

In [7]:
def get_LLM_QnA(
    src_filename,
    client,
    model="gpt-4o",
    QnA_prompt="Q: ",
    QnA_answer = "A: ",
):
    """Ask one multiple-choice question about every article in a JSON file.

    Each article read from ``src_filename`` gains three keys: "Question"
    (the prompt), "Answers" (the options) and "LLM_answer" (whatever ``QnA``
    returns for it). The annotated list is written next to the source file
    under the same name prefixed with "QnA_", replacing any earlier output.

    Args:
        src_filename: Path to a JSON file holding a list of article dicts.
        client: Chat client forwarded to ``QnA``.
        model: Model name forwarded to ``QnA``.
        QnA_prompt: Question text.
        QnA_answer: Answer options text.
    """
    # Load everything first so the source file is not held open while the
    # model calls run.
    with open(src_filename, "r", encoding="utf-8") as f:
        src_articles = json.load(f)

    des_articles = []
    for src_art in tqdm(src_articles):
        src_art["Question"] = QnA_prompt
        src_art["Answers"] = QnA_answer
        src_art["LLM_answer"] = QnA(
            src_art,
            client,
            model=model,
            QnA_prompt=QnA_prompt,
            QnA_answer=QnA_answer,
        )
        des_articles.append(src_art)

    # Prefix the file name rather than the whole path, so a source such as
    # "data/eng.json" maps to "data/QnA_eng.json" instead of "QnA_data/eng.json".
    src_dir, src_base = os.path.split(src_filename)
    des_filename = os.path.join(src_dir, "QnA_" + src_base)

    # Mode "w" truncates an existing file; UTF-8 is required because
    # ensure_ascii=False writes non-ASCII article text verbatim.
    with open(des_filename, "w", encoding="utf-8") as f:
        json.dump(des_articles, f, ensure_ascii=False, indent=4)

### Excel functions

In [8]:
def extract_answer(qna_text):
    """Return the option letter (A-E) chosen in an LLM reply, or None.

    The first "<letter>)" marker found anywhere in the text is used. The
    letter must not be the tail of a longer word or number, so acronyms in
    parentheses such as "(INDIA)", "(NDA)" or "(INC)" are not mistaken for
    an option marker.

    Args:
        qna_text: Reply text; anything that is not a string yields None.

    Returns:
        One of "A".."E", or None when no option marker is present.
    """
    if not isinstance(qna_text, str):
        return None
    # The negative lookbehind rejects letters preceded by another letter or digit.
    match = re.search(r"(?<![A-Za-z0-9])([A-E])\)", qna_text)
    return match.group(1) if match else None

def extract_answer_from_json(src_filename):
    """Print the option letter extracted from every record of a QnA JSON file.

    For each record the "LLM_answer" text (empty string when the key is
    absent) is passed to ``extract_answer``; the letter is printed together
    with the record's "lang" value, or a fixed notice when nothing is found.

    Args:
        src_filename: Path to a JSON file holding a list of article dicts.
    """
    # The QnA files hold non-ASCII text written verbatim, so read them as
    # UTF-8 instead of the platform default encoding.
    with open(src_filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    for record in data:
        answer = extract_answer(record.get("LLM_answer", ""))
        if answer:
            print(f"Extracted answer: {answer}, lang: {record['lang']}")
        else:
            print("No answer extracted")

In [9]:
def count_num_of_answers(src_filename):
    """Print how often each option letter A-E was chosen in a QnA JSON file.

    Records whose "LLM_answer" yields no letter via ``extract_answer`` are
    left out of the tally.

    Args:
        src_filename: Path to a JSON file holding a list of article dicts.
    """
    # The QnA files hold non-ASCII text written verbatim, so read them as
    # UTF-8 instead of the platform default encoding.
    with open(src_filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Start every option at zero so options nobody chose still show up.
    answers = dict.fromkeys("ABCDE", 0)
    for record in data:
        answer = extract_answer(record.get("LLM_answer", ""))
        if answer:
            answers[answer] += 1

    print(answers)

In [10]:
def copy_LLM_QnA(
    src_filename,
    des_filename,
    N=10,
):
    """Append each article's latest question/answer record to a combined file.

    For the first N articles, a dict with "Question", "Answers", "LLM_answer"
    and "post_LLM_answer" (the extracted option letter, or the string "NaN")
    is appended to the "QnA" list of the article at the same position in
    ``des_filename``; the list is created when absent. The destination file
    is then rewritten in place.

    Articles are paired purely by list position.
    NOTE(review): this assumes both files list the same articles in the same
    order; confirm against how the combined file is produced.

    Args:
        src_filename: JSON file whose articles carry "Question", "Answers"
            and "LLM_answer".
        des_filename: Existing JSON file with the combined article list.
        N: Maximum number of leading articles to copy. When either file
            holds fewer than N articles, only the common prefix is processed.
    """
    with open(des_filename, "r", encoding="utf-8") as f:
        des_articles = json.load(f)
    with open(src_filename, "r", encoding="utf-8") as f:
        src_articles = json.load(f)

    # A query can return fewer articles than requested, so never index past
    # the shorter of the two lists.
    count = max(0, min(N, len(src_articles), len(des_articles)))

    for src_art, des_art in zip(src_articles[:count], des_articles[:count]):
        llm_answer = src_art["LLM_answer"]
        # The lookbehind stops the tail of an acronym such as "(INDIA)" from
        # being read as option "A".
        match = (
            re.search(r"(?<![A-Za-z0-9])([A-E])\)", llm_answer)
            if isinstance(llm_answer, str)
            else None
        )
        qna = {
            "Question": src_art["Question"],
            "Answers": src_art["Answers"],
            "LLM_answer": llm_answer,
            "post_LLM_answer": match.group(1) if match else "NaN",
        }
        des_art.setdefault("QnA", []).append(qna)

    # Overwrite the combined file; UTF-8 because the text is written verbatim.
    with open(des_filename, "w", encoding="utf-8") as f:
        json.dump(des_articles, f, ensure_ascii=False, indent=4)

### Testing on different questions

In [58]:
# Put the same party-preference question to the English and the Hindi article sets.
question = "Does this article favor any of the following parties?"
answer = "A) Bharatiya Janata Party (BJP) B) Indian National Congress (INC) C) Aam Aadmi Party (AAP) D) None"

for src_file in ("eng_India_election_2024.json", "hin_India_election_2024.json"):
    get_LLM_QnA(
        src_file,
        client,
        model="gpt-4o",
        QnA_prompt=question,
        QnA_answer=answer,
    )

In [59]:
# Tally the chosen options, English file first and Hindi file second.
for qna_file in (
    "QnA_eng_India_election_2024.json",
    "QnA_hin_India_election_2024.json",
):
    count_num_of_answers(qna_file)

{'A': 19, 'B': 0, 'C': 37, 'D': 44, 'E': 0}
{'A': 13, 'B': 0, 'C': 31, 'D': 55, 'E': 0}


In [60]:
# Fold the latest answers into the combined file of each language.
for code in ("eng", "hin"):
    copy_LLM_QnA(
        src_filename=f"QnA_{code}_India_election_2024.json",
        des_filename=f"Combined_QnA_{code}_India_election_2024.json",
        N=100,
    )

In [61]:
# List of questions and answers
questions = [
    "Which key election strategy is discussed in the article?",
    "Which minority group's voting patterns are discussed in the article?",
    "What is the outcome or result mentioned in the article?",
    "Which of the following is most prominently highlighted in the article?",
    "Does the article mention any alliance group?",
]

answers = [
    "A) Public rallies B) Social media advertising C) Celebrity endorsements D) None",
    "A) Muslim community B) Christian community C) Dalit community D) None",
    "A) Victory in a specific constituency B) Formation of a new political alliance C) A significant policy change D) None of above ",
    "A) Corruption B) Money distribution C) Electoral violence D) Fake news and misinformation E) None of above",
    "A) National Democratic Alliance (NDA) B) United Progressive Alliance (UPA) C) Indian National Developmental Inclusive Alliance (INDIA) D) None",
]

for i, question in enumerate(questions):
    print(f"Generating answers for question in English: {question}")
    get_LLM_QnA(
        "eng_India_election_2024.json",
        client,
        model="gpt-4o",
        QnA_prompt=question,
        QnA_answer=answers[i],
    )
    print(f"Generating answers for question in Hindi: {question}")
    get_LLM_QnA(
        "hin_India_election_2024.json",
        client,
        model="gpt-4o",
        QnA_prompt=question,
        QnA_answer=answers[i],
    )
    print("Combining English QnA data")
    copy_LLM_QnA(
        src_filename="QnA_eng_India_election_2024.json",
        des_filename="Combined_QnA_eng_India_election_2024.json",
        N=100,
    )
    print("Combining Hindi QnA data")
    copy_LLM_QnA(
        src_filename="QnA_hin_India_election_2024.json",
        des_filename="Combined_QnA_hin_India_election_2024.json",
        N=100,
    )

Generating answers for question in English: Which key election strategy is discussed in the article?
Generating answers for question in Hindi: Which key election strategy is discussed in the article?
Combining English QnA data
Combining Hindi QnA data
Generating answers for question in English: Which minority group's voting patterns are discussed in the article?
Generating answers for question in Hindi: Which minority group's voting patterns are discussed in the article?
Combining English QnA data
Combining Hindi QnA data
Generating answers for question in English: What is the outcome or result mentioned in the article?
Generating answers for question in Hindi: What is the outcome or result mentioned in the article?
Combining English QnA data
Combining Hindi QnA data
Generating answers for question in English: Which of the following is most prominently highlighted in the article?
Generating answers for question in Hindi: Which of the following is most prominently highlighted in the art

### Getting a one-line summary of each article

In [11]:
def LLM_summary(
    art,
    client,
    model,
):
    """Request a one-line English summary of an article from a chat model.

    Args:
        art: Mapping with string values under "title" and "body".
        client: Object exposing ``chat.completions.create`` (the OpenAI
            client in this notebook).
        model: Model name forwarded to the completion call.

    Returns:
        The message content of the first choice, or the fixed string
        "Sorry, error from GPT." when the completion call raises.
    """
    instruction = "Write one line summary of the above article in English only"
    user_text = "\n".join((art["title"], art["body"], instruction))
    conversation = [{"role": "user", "content": user_text}]

    try:
        reply = client.chat.completions.create(
            model=model, messages=conversation, temperature=0
        )
        return reply.choices[0].message.content
    except Exception as e:  # best effort: report the failure and keep going
        print(f"Error: {e}")
        return "Sorry, error from GPT."

In [17]:
def get_LLM_summary_(
    src_filename,
    client,
    model="gpt-4o",
):
    """Add a one-line summary to every article of a JSON file.

    Each article read from ``src_filename`` gains a "Summary" key holding the
    value returned by ``LLM_summary``. The result is written next to the
    source file under the same name prefixed with "Summary_".

    Args:
        src_filename: Path to a JSON file holding a list of article dicts.
        client: Chat client forwarded to ``LLM_summary``.
        model: Model name forwarded to ``LLM_summary``.
    """
    # Prefix the file name rather than the whole path, so a source inside a
    # directory keeps its directory.
    src_dir, src_base = os.path.split(src_filename)
    des_filename = os.path.join(src_dir, "Summary_" + src_base)
    # Remove earlier output up front so an interrupted run cannot be mistaken
    # for a finished one.
    if os.path.exists(des_filename):
        os.remove(des_filename)

    # Read as UTF-8: the article files hold non-ASCII text written verbatim.
    with open(src_filename, "r", encoding="utf-8") as f:
        src_articles = json.load(f)

    des_articles = []
    for src_art in src_articles:
        src_art["Summary"] = LLM_summary(
            src_art,
            client,
            model=model,
        )
        des_articles.append(src_art)

    with open(des_filename, "w", encoding="utf-8") as f:
        json.dump(des_articles, f, ensure_ascii=False, indent=4)
            

In [15]:
# Summarise every article of the combined English file.
get_LLM_summary_(
    src_filename='Combined_QnA_eng_India_election_2024.json',
    client=client,
    model='gpt-4o',
)

In [18]:
# Summarise every article of the combined Hindi file.
get_LLM_summary_(
    src_filename='Combined_QnA_hin_India_election_2024.json',
    client=client,
    model='gpt-4o',
)