In [1]:
import wikipediaapi
from modeling_gpt import GPTInference

In [77]:
def make_sec_content(section_text, sections):
    """Concatenate a section's own text with the string form of its subsections.

    Args:
        section_text: Text of the parent section; it is the prefix of the result.
        sections: Iterable of subsection objects. Each one is rendered with
            ``str()`` and appended in iteration order, with no separator.

    Returns:
        ``section_text`` followed by ``str(sub)`` for every ``sub`` in
        ``sections``.
    """
    # NOTE(review): the appended text is whatever str() yields for the
    # subsection object, not its ``text`` attribute — confirm that rendering
    # is what the downstream prompts should contain.
    return section_text + ''.join(str(sub) for sub in sections)

def make_held_in_out_sections(page, held_in_fraction=0.2, leave_out_sec=None):
    """Split a page's top-level sections into a held-in prefix and a held-out rest.

    Sections whose title appears in ``leave_out_sec`` are dropped first. Of the
    remaining sections, the first ``round(held_in_fraction * n)`` (in page
    order) form the held-in set and all later ones form the held-out set.
    A one-line summary of the split (counts and titles) is printed.

    Args:
        page: Object exposing ``sections``; every section must provide
            ``title``, ``text`` and ``sections`` (its subsections).
        held_in_fraction: Share of the kept sections, within [0, 1], that goes
            to the held-in set. Defaults to 0.2.
        leave_out_sec: Collection of section titles to skip. When ``None``,
            a default list of reference-style titles is used.

    Returns:
        Tuple ``(held_in_section_content, held_out_section_content)``; each is
        a list with one ``make_sec_content`` string per section.

    Raises:
        ValueError: If ``held_in_fraction`` is not within [0, 1].
    """
    if not 0 <= held_in_fraction <= 1:
        raise ValueError(
            "held_in_fraction must be between 0 and 1, got %r" % (held_in_fraction,)
        )
    if leave_out_sec is None:
        leave_out_sec = ['See also', 'External links', 'Bibliography', 'References', 'Notes', 'Further reading']

    sections = [each for each in page.sections if each.title not in leave_out_sec]
    # This is the size of the held-in prefix; everything after it is held out.
    num_held_in = round(held_in_fraction * len(sections))
    titles = [each.title for each in sections]
    print(num_held_in, len(sections),titles[:num_held_in], titles[num_held_in:])

    held_in_section_content = [
        make_sec_content(sec.text, sec.sections) for sec in sections[:num_held_in]
    ]
    held_out_section_content = [
        make_sec_content(sec.text, sec.sections) for sec in sections[num_held_in:]
    ]
    return held_in_section_content, held_out_section_content
    
        

In [78]:
# Module-level GPTInference instance (class imported from modeling_gpt);
# make_questions reads it as a global to obtain model responses.
gpt_inference = GPTInference()
def make_questions(sections):
    """Request one question per section text from the global ``gpt_inference``.

    Args:
        sections: Iterable of section texts; each is embedded in its own prompt.

    Returns:
        List holding, in input order, the value returned by
        ``gpt_inference.get_chatgpt_response`` for each prompt.
    """
    instruction = 'For the following article, write a general question that can be answered using a summary of the article below'
    return [
        gpt_inference.get_chatgpt_response(f'{instruction}\nArticle: {body}\nQuestion:')
        for body in sections
    ]

In [79]:
# Shared Wikipedia client used by the cells below.
# NOTE(review): presumably the positional arguments are (user_agent, language),
# as in recent Wikipedia-API releases — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('LLM hallucination', 'en')
# Page names iterated over when building the question/article dataset.
wiki_pages = ['Michael_Jackson', 'Donald_Trump', 'Elizabeth_II', 'India', 'Barack_Obama']



In [80]:
def get_page_held_in_out_questions(page_name):
    """Build the held-in/held-out texts and their questions for one page.

    Looks the page up through the global ``wiki_wiki`` client, prints whether
    it exists, splits its sections with ``make_held_in_out_sections`` and
    generates one question per section with ``make_questions``.

    Args:
        page_name: Page name passed to ``wiki_wiki.page``.

    Returns:
        Tuple ``(article, held_out_article, held_in_questions,
        held_out_questions)`` where the two articles are the newline-joined
        held-in and held-out section contents.
    """
    wiki_page = wiki_wiki.page(page_name)
    print("Page - Exists: %s" % wiki_page.exists())

    in_chunks, out_chunks = make_held_in_out_sections(wiki_page)
    joined_in = '\n'.join(in_chunks)
    joined_out = '\n'.join(out_chunks)

    # Held-in questions are generated first, then the held-out ones.
    in_questions = make_questions(in_chunks)
    out_questions = make_questions(out_chunks)

    return joined_in, joined_out, in_questions, out_questions

In [81]:
import random

# Draw a random 128-bit integer and print it as 32 zero-padded hex digits.
# NOTE(review): the name `hash` shadows the Python builtin from this cell on,
# and the value is not referenced by any other cell shown in this notebook.
hash = random.getrandbits(128)

print("hash value: %032x" % hash)

hash value: 7f10d00ad5a784133a11ebcf6875f441


In [82]:
import time
# Column-oriented accumulator: one list per output column, all kept at the
# same length (one entry per generated question).
df_dict = {
    column: []
    for column in ('Article', 'Held_Out_Article', 'Query', 'Faithful_Query', 'Wikipage')
}

for wiki_page in wiki_pages:
    print(wiki_page)
    (article, held_out_article,
     faithful_questions, out_of_article_questions) = get_page_held_in_out_questions(wiki_page)
    page_url = wiki_wiki.page(wiki_page).fullurl

    # Held-in questions come first, followed by the held-out ones.
    all_questions = faithful_questions + out_of_article_questions
    n_rows = len(all_questions)

    df_dict['Article'].extend([article] * n_rows)
    df_dict['Held_Out_Article'].extend([held_out_article] * n_rows)
    df_dict['Query'].extend(all_questions)
    # 'yes' marks questions built from held-in sections, 'no' the held-out ones.
    df_dict['Faithful_Query'].extend(['yes'] * len(faithful_questions))
    df_dict['Faithful_Query'].extend(['no'] * len(out_of_article_questions))
    df_dict['Wikipage'].extend([page_url] * n_rows)

    # Pause between pages.
    time.sleep(4)


Michael_Jackson
Page - Exists: True
2 10 ['Life and career', 'Death'] ['Legacy', 'Philanthropy and humanitarian work', 'Artistry', 'Honors and awards', 'Earnings', 'Discography', 'Filmography', 'Tours']
Donald_Trump
Page - Exists: True
1 7 ['Personal life'] ['Business career', 'Media career', 'Political career', 'Presidency (2017–2021)', 'Post-presidency (2021–present)', 'Public image']
Elizabeth_II
Page - Exists: True
2 8 ['Early life', 'Heir presumptive'] ['Reign', 'Death', 'Legacy', 'Titles, styles, honours, and arms', 'Issue', 'Ancestry']
India
Page - Exists: True
2 9 ['Etymology', 'History'] ['Geography', 'Biodiversity', 'Politics and government', 'Foreign, economic and strategic relations', 'Economy', 'Demographics, languages and religion', 'Culture']
Barack_Obama
Page - Exists: True
2 8 ['Early life and career', 'Legal career'] ['Legislative career', 'Presidential campaigns', 'Presidency (2009–2017)', 'Cultural and political image', 'Post-presidency (2017–present)', 'Legacy']


In [71]:
# Exploratory cell: looks up the Michael_Jackson page and defines an exclusion
# list that is shorter than the one used inside make_held_in_out_sections.
# Neither `pg` nor this `leave_out_sec` is used by the other cells shown here.
pg = wiki_wiki.page('Michael_Jackson')
leave_out_sec = ['See also', 'External links', 'Bibliography']
# [each.title for each in pg.sections if each.text and each.title not in leave_out_sec]

In [None]:
# NOTE(review): `df` is first assigned in a later cell (pd.DataFrame(df_dict)),
# so on a fresh top-to-bottom run this cell raises NameError.
df

In [13]:
# df_dict

In [83]:
import pandas as pd
# Add per-row metadata columns, then materialise the accumulator as a DataFrame.
n_rows = len(df_dict['Article'])
# One random 128-bit integer per row.
df_dict['id'] = [random.getrandbits(128) for _ in range(n_rows)]
df_dict['system_id'] = ['gpt3.5'] * n_rows
df = pd.DataFrame(df_dict)

In [85]:
# Destination file for the generated question/article pairs.
output_csv = '/home/ramprasad.sa/qf_sd_summarization/datasets/postprocessed/question_article_pair.csv'
df.to_csv(output_csv)

In [86]:
# Number of rows whose question is marked 'no' in the Faithful_Query column.
int((df['Faithful_Query'] == 'no').sum())

33