In [47]:
from models.GPT_3.gpt3 import GPTInference
from models.Flan_t5.flan_t5 import FlanT5Inference

In [48]:
import pandas as pd
import dataset_creators.config as config

In [49]:
# Absolute root of the project checkout; generate_summaries() builds its input CSV path from it.
parent_path = '/home/ramprasad.sa/factual_annotation_llm_summaries'
# Relative dataset directory.
# NOTE(review): not referenced by any other cell visible in this notebook.
dataset_path = 'datasets/news'

# dataset_path_map = {'xsum': 'datasets/news_sample/xsum_sample.csv', 'cnndm': 'datasets/news_sample/cnndm_sample.csv'}
# Maps the short model key passed to generate_summaries() to an inference wrapper instance.
# Both wrappers are constructed eagerly when this cell runs.
model_map = {'gpt3' : GPTInference(), 'flant5': FlanT5Inference()}

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [50]:

def generate_summaries(dset, model, filter_keys = [], num_samples = 5):
    """Generate one summary per instruction for the first rows of a dataset sample.

    Reads ``{parent_path}/datasets/{dset}/test_sample.csv``, optionally keeps only
    the rows where a column equals a value, truncates to ``num_samples`` rows and
    asks the selected model for a summary of each article under every instruction
    configured for this dataset/model pair.

    Args:
        dset: Dataset directory name; also the prefix of the key looked up in
            ``config.instructions`` (``f'{dset}_{model}'``).
        model: Key into ``model_map``; also written to the ``system_id`` column.
        filter_keys: Optional ``[column, value]`` pair. When non-empty, only rows
            with ``df[column] == value`` are kept. Never mutated.
        num_samples: Maximum number of rows to summarise.

    Returns:
        A new DataFrame holding the selected rows plus one column per instruction
        key (the generated summaries) and a ``system_id`` column.

    Raises:
        KeyError: If ``model`` is not in ``model_map`` or no instructions exist
            for ``f'{dset}_{model}'``.
    """
    df = pd.read_csv(f'{parent_path}/datasets/{dset}/test_sample.csv')
    if filter_keys:
        filter_column, filter_value = filter_keys[0], filter_keys[1]
        print(f'{filter_column} == {filter_value}')
        df = df[df[filter_column] == filter_value]
    # Take an explicit copy: the columns added below must land on an independent
    # frame, not on a slice of the loaded one (avoids SettingWithCopyWarning).
    df = df[:num_samples].copy()
    model_class = model_map[model]
    # The instruction set depends only on (dset, model), so look it up once.
    instruction = config.instructions[f'{dset}_{model}']

    # Pre-create one list per instruction so the output columns exist even when
    # no rows were selected.
    summaries = {instr_key: [] for instr_key in instruction}
    for idx, row in df.iterrows():
        # idx is the DataFrame index label, used here as a coarse progress marker.
        if idx % 10 == 0:
            print(idx)
        article = row['article']
        for instr_key, instr in instruction.items():
            summaries[instr_key].append(model_class.get_news_response(article, instr))
    for column_key, vals in summaries.items():
        df[column_key] = vals
    df['system_id'] = [model] * len(df)
    return df

In [51]:
# from random import shuffle
# model = 'flan_t5'
# dset = 'news'
# df_xsum = get_news_summaries('news', 'flan_t5', ['origin', 'xsum'])
# df_cnndm = get_news_summaries('news', 'flan_t5', ['origin', 'cnndm'])
# df_news = pd.concat([df_xsum, df_cnndm])
# # shuffle(df_news)
# df_news.to_csv(f'{parent_path}/datasets/{dset}/{model}_test_sample.csv')

In [30]:
# Load test_sample.csv for the dataset named by `dset`.
# NOTE(review): `dset` is first assigned in a later cell of this notebook, so this
# cell only works if that cell has already been run.
df = pd.read_csv(f'{parent_path}/datasets/{dset}/test_sample.csv', encoding='utf-8')

In [31]:
# Display the text of the first article in the loaded frame.
df['article'].values[0]

'With the advances in the field of glycosciences and an increasing number of structures elucidated and applied in all areas of the field, the need for reliable approaches to the synthesis of glycans has grown exponentially.1 Traditional glycan synthesis in solution involves iteration of glycosylation and deprotection steps with interim purification for practically every intermediate. Some advanced strategies based on either chemoselective or selective activation of building blocks help to streamline the oligosaccharide assembly significantly.2 However, no universal route to the chemical synthesis of glycans can be established, which dramatically hinders progress in glycosciences, whereas other biopolymers, peptides3,4 and oligonucleotides,5 can be produced by machines. Solid-phase synthesis eliminates the need for conventional reaction work-up and purification of intermediates,6\\xe2\\x80\\x938 and offers promising automation amenability. Since early efforts in 2001,9 Seeberger et al. 

In [32]:
# Model key (must exist in model_map) and dataset directory to summarise.
model = 'flant5'
dset = 'pubmed'
# The summarisation function defined in this notebook is generate_summaries();
# no get_news_summaries exists, so calling it fails on a fresh kernel.
df_dset = generate_summaries(dset, model)
df_dset

0


Unnamed: 0.1,Unnamed: 0,article,reference_summary,id,origin,Generic_summary,Faithful_summary,system_id
0,0,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...,f05ee546-a3c4-40fb-b604-9497cb165de8,pubmed,a retrospective study of 29 patients with cong...,a retrospective study of 29 patients with cong...,flant5
1,1,the family is the cornerstone of human social ...,background : since the family is a social syst...,7bca5d77-7541-4ff9-a46f-d446b72dd9cd,pubmed,the purpose of this study was to analyze the s...,the purpose of this study was to analyze the s...,flant5
2,2,development of human societies and industriali...,background and objective : anxiety and depre...,0fc68a72-a6d3-4ec9-8f43-61124446a064,pubmed,iran is a country with a high prevalence of ca...,", the most important complication of heart sur...",flant5
3,3,male macroprolactinomas ( mprl ) are usually r...,background : suppurative meningitis ( sm ) or ...,86ed3313-6fde-46d4-b555-0afd6821f378,pubmed,suppurative meningitis is a rare complication ...,suppurative meningitis is a rare complication ...,flant5
4,4,the femoral head often leads to healing compli...,fracture of the femoral neck continues to be a...,861ad9de-1ed4-4dc2-94f3-b4ec29b8e99e,pubmed,femoral neck fractures are a common cause of n...,We report a case of non - union of the fractur...,flant5


In [33]:
# index=False: do not write the row index as a column. Writing it adds another
# "Unnamed: 0"-style column every time the CSV is read back and saved again.
df_dset.to_csv(f'{parent_path}/datasets/{dset}/{model}_test_sample.csv', index=False)

In [28]:
import math 
import string 
import re

import json 

import pandas as pd 
import sqlite3



# DDL for the table that add_data() fills: one row per generated summary, keyed by
# an auto-incrementing integer, with the source article stored alongside it.
create_table_generated_summaries_str =  '''CREATE TABLE generated_summaries (
    uuid INTEGER PRIMARY KEY AUTOINCREMENT, 
    summary_uuid TEXT NOT NULL ,
    summ_id TEXT NOT NULL, 
    system_id TEXT NOT NULL, 
    summary TEXT NOT NULL,
    article TEXT
);'''

# DDL for the annotation table. No cell visible in this notebook inserts into it.
# NOTE(review): `ENUM` is not an SQLite type name; under SQLite's type-affinity
# rules an unrecognised declared type appears to get NUMERIC affinity, so
# numeric-looking text may be stored as numbers -- confirm this is intended.
create_table_label_str = '''CREATE TABLE label (
    uuid INTEGER PRIMARY KEY AUTOINCREMENT, 
    user_id TEXT NOT NULL,
    summary_uuid TEXT NOT NULL,
    summ_id TEXT NOT NULL, 
    system_id TEXT NOT NULL,
    label_type TEXT NOT NULL,
    summary TEXT NOT NULL,
    nonfactual_sentences ENUM NOT NULL,
    article TEXT
);'''


In [38]:

def connect_to_db(db_path):
    """Open the SQLite database at ``db_path``.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor belongs to the returned
        connection. The caller is responsible for committing and closing.
    """
    connection = sqlite3.connect(db_path)
    return connection, connection.cursor()

def create_table(create_str, db_path):
    """Execute a single ``CREATE TABLE`` statement against the database.

    Args:
        create_str: The SQL statement to execute.
        db_path: Path of the SQLite database file.

    Raises:
        sqlite3.Error: Propagated from the execute call (for example when the
            table already exists). The connection is closed either way.
    """
    conn, c = connect_to_db(db_path)
    try:
        # The statement is already a complete string; no formatting is needed.
        c.execute(create_str)
        conn.commit()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()

def add_data(filename, db_path):
    """Insert the generated summaries from a CSV into ``generated_summaries``.

    For every CSV row two database rows are written, one for the
    ``Generic_summary`` column and one for the ``Faithful_summary`` column.
    All inserts are committed together; if any insert fails nothing is
    committed and the connection is still closed.

    Args:
        filename: CSV with the columns ``id``, ``article``, ``Generic_summary``,
            ``Faithful_summary``, ``system_id`` and ``origin``.
        db_path: Path of the SQLite database file.

    Raises:
        KeyError: If a required column is missing from the CSV.
        sqlite3.Error: Propagated from a failed insert.
    """
    df = pd.read_csv(filename)
    # (CSV summary column, summary_uuid suffix, summ_id suffix)
    variants = (
        ('Generic_summary', 'gen', 'generic'),
        ('Faithful_summary', 'faith', 'faithful'),
    )
    insert_sql = """INSERT INTO generated_summaries (summary_uuid, summ_id, system_id, summary, article) VALUES (?, ?, ?, ?, ?)"""

    conn, c = connect_to_db(db_path)
    try:
        n_added = 0
        for _, row in df.iterrows():
            summ_uuid = row['id']
            system_id = row['system_id']
            origin = row['origin']
            article = row['article']
            for summary_column, uuid_suffix, summ_kind in variants:
                c.execute(insert_sql, (
                    f'{summ_uuid}_{system_id}_{uuid_suffix}',
                    f'{origin}_{summ_kind}',
                    system_id,
                    row[summary_column],
                    article,
                ))
                n_added += 1
        conn.commit()
        # Report the rows inserted by this call, not the table's total row
        # count, which also includes rows from earlier loads.
        print('Added %d generated summaries'%(n_added))
    finally:
        conn.close()
    
def create_tables(db_path, force_new = True):
    """Ensure the ``generated_summaries`` and ``label`` tables exist.

    Args:
        db_path: Path of the SQLite database file.
        force_new: When True, an existing table is dropped and recreated
            (its rows are lost). When False, an existing table is left as is.
    """
    create_cmds = {
        'generated_summaries': create_table_generated_summaries_str,
        'label': create_table_label_str
    }

    for table_name, create_str in create_cmds.items():
        # Open a connection to db_path itself rather than relying on a
        # notebook-level cursor, which may not exist yet or may point at a
        # different database.
        conn, c = connect_to_db(db_path)
        try:
            table_results = c.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
                (table_name,),
            ).fetchall()
            print(table_results)
            if table_results:
                if not force_new:
                    continue
                # Identifiers cannot be bound as parameters; table_name comes
                # only from the fixed dict above.
                c.execute(f'DROP table {table_name};')
                # Commit so the drop is visible to the connection opened by
                # create_table() below.
                conn.commit()
        finally:
            conn.close()
        create_table(create_str, db_path)
            
    

In [39]:
# SQLite database file that the cells below create tables in and load summaries into.
db_path = '/home/ramprasad.sa/factual_annotation_llm_summaries/datasets/news/news_sample_summaries.db'

In [40]:
# Notebook-level connection and cursor, used for ad-hoc queries in later cells.
# NOTE(review): this connection is never closed in the cells visible here.
conn, c = connect_to_db(db_path)
# table_name = 'generated_summaries'
# c.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';").fetchall()

In [44]:
# force_new=False: tables that already exist are kept; only missing ones are created.
create_tables(db_path, force_new = False)

[('generated_summaries',)]
[('label',)]


In [45]:
# Load the summaries stored in gpt3_test_sample.csv into the generated_summaries table.
filename = '/home/ramprasad.sa/factual_annotation_llm_summaries/datasets/news/gpt3_test_sample.csv'
add_data(filename, db_path)

Added 40 generated summaries


In [46]:
 # Row count of generated_summaries, read through the notebook-level cursor `c`.
 len(c.execute("""SELECT * from generated_summaries""").fetchall())

40