In [None]:
import ast
import json
import os
import pandas as pd
import uuid

from docx import Document
from docx.shared import Inches, RGBColor

from io import StringIO

from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas()

from pylatexenc.latex2text import LatexNodes2Text

In [None]:
# --- Paper-specific settings -------------------------------------------------
# Folder name of the paper being processed, the id of its extraction run, and
# the human-readable title used in the report heading.
original_file = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease'
external_id = '2023_05_02_27142069922ab9506d3dg'
title = '2021 Wightman Posthuma'

# Root of the data tree, taken from the environment so that no absolute local
# path is hard-coded in the notebook. It is None when DATA_PATH is not set.
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path below this paper's ``mathpix`` directory.

    Args:
        *args: Path components appended after
            ``<data_path>/diygenomics-projects/experiment-a/<original_file>/mathpix``.

    Returns:
        str: The joined path.

    Raises:
        RuntimeError: If ``data_path`` is ``None`` (``DATA_PATH`` was not set),
            instead of the opaque ``TypeError`` ``os.path.join`` would raise.
    """
    # Checked at call time (not at definition time) so `data_path` may still be
    # assigned manually after this cell has run.
    if data_path is None:
        raise RuntimeError(
            "DATA_PATH environment variable is not set; cannot resolve data files"
        )
    return os.path.join(data_path, 'diygenomics-projects', 'experiment-a',
                        original_file, 'mathpix', *args)


# Column used as the index when the CSV inputs are loaded.
index_col = 'uuid'

# File names (or file-name stems) of the inputs read by later cells.
summary_file = 'summary_of_summaries'
topic_file = 'topic_counts'
image_results_file = 'image_results.csv'
snps_file = f'gpt_snps_{external_id}.csv'

In [None]:
# Load the per-image results CSV from the `external_id` folder, indexed by `index_col`.
df_image_results = pd.read_csv(file_path(external_id, image_results_file), index_col=index_col)

In [None]:
# Load the SNP extraction CSV for this `external_id`, indexed by `index_col`.
df_snps = pd.read_csv(file_path(snps_file), index_col=index_col)

In [None]:
# Outputs of this cell:
#   datatables      - one DataFrame per response that carried a 'tsv' entry
#   asciimath       - raw 'asciimath' values for responses without a 'tsv' entry
#   additional_text - the 'text' field of responses whose 'data' list is empty
datatables = []
asciimath = []
additional_text = []

for raw_response in df_image_results['mathpix_response']:
    # A missing cell is read back as NaN, which literal_eval cannot parse;
    # skip it the same way the 'snps' column is guarded with pd.isna.
    if pd.isna(raw_response):
        continue

    # The column stores the repr of a Python dict; literal_eval only accepts
    # literals, so no code from the CSV is executed.
    mathpix_response = ast.literal_eval(raw_response)
    if 'data' not in mathpix_response:
        continue

    data = mathpix_response['data']
    if not data:
        # No structured entries at all: keep the plain text instead.
        additional_text.append(mathpix_response['text'])
        continue

    tsv_data = None
    asciimath_data = None
    for image_details in data:
        # If a type occurs more than once, the last occurrence wins.
        if image_details['type'] == 'tsv':
            tsv_data = image_details['value']
        elif image_details['type'] == 'asciimath':
            asciimath_data = image_details['value']

    # A table takes precedence over asciimath for the same response.
    if tsv_data:
        df_image = pd.read_csv(StringIO(tsv_data), sep='\t')
        datatables.append(df_image)
    elif asciimath_data:
        asciimath.append(asciimath_data)

In [None]:
# TODO discuss how this could be accomplished programmatically
# def fix_snps_table(df):
#     for index, row in df.iterrows():
#         df.at[index, 'P value'] = row['P value'].replace('xx', 'x')
    
#     return df

# df_snps = fix_snps_table(datatables[1])

# df_snps['uuid'] = [uuid.uuid4() for _ in range(len(df_snps))]
# df_snps.set_index('uuid', inplace=True)

# df_snps.to_csv(file_path('extracted_snps.csv'))

In [None]:
# Directory listing of the charts folder, kept as a one-column DataFrame.
file_list = os.listdir(file_path('charts'))
df_charts = pd.DataFrame({'filename': file_list})

# Keep the files whose name starts with 'topic_' ...
starts_with_topic = df_charts['filename'].str.startswith('topic_')
topics_files = df_charts[starts_with_topic]

# ... drop those whose name matches `topic_file` ...
matches_topic_file = topics_files['filename'].str.contains(topic_file)
topics_files = topics_files[~matches_topic_file]

# ... and end up with a plain, alphabetically sorted list of file names.
topics_files = topics_files['filename'].tolist()
topics_files.sort()

In [None]:
# Read the whole summary text file; it is written into the report's 'Summary' section.
with open(file_path(f'{summary_file}.txt'), 'r') as file:
    main_summary = file.read()

In [None]:
# Start a new Word document; the following cells append content to it.
document = Document()
# Path the finished report is saved to by the last cell.
word_file = file_path('insights.docx')

In [None]:
# Report title, followed by the 'Summary' section holding the text loaded into `main_summary`.
document.add_heading(f'Insights for {title}')
document.add_heading('Summary', level=2)
document.add_paragraph(main_summary)

In [None]:
# snps_containers     - the non-empty 'SNP_identifiers' values, as parsed
# relevant_containers - the non-empty 'relevant_identifiers' values, each
#                       serialised to an indented JSON string
snps_containers = []
relevant_containers = []

for raw_snps in df_snps['snps']:
    # Rows without a value in the 'snps' column have nothing to parse.
    if pd.isna(raw_snps):
        continue

    snps = ast.literal_eval(raw_snps)

    if 'SNP_identifiers' in snps:
        snp_identifiers = snps['SNP_identifiers']
        if len(snp_identifiers) > 0:
            snps_containers.append(snp_identifiers)

    if 'relevant_identifiers' in snps:
        relevant_identifiers = snps['relevant_identifiers']
        if len(relevant_identifiers) > 0:
            relevant_containers.append(json.dumps(relevant_identifiers, indent=4))

In [None]:
document.add_heading('Snps Identifiers', level=2)

# One paragraph per identifier. A dict identifier is rendered as
# "Key: value" lines with the key in bold; anything else is written as text.
for snps_container in snps_containers:
    for snps_identifier in snps_container:
        paragraph = document.add_paragraph()
        if isinstance(snps_identifier, dict):
            for key, value in snps_identifier.items():
                paragraph.add_run(str(key).capitalize() + ': ').bold = True
                # str() keeps non-string values (numbers, lists, None) from
                # raising TypeError on concatenation; strings are unchanged.
                paragraph.add_run(str(value) + '\n')
        else:
            # Plain strings pass through unchanged; other scalar types are
            # rendered via str() instead of failing on `.items()`.
            paragraph.add_run(str(snps_identifier) + '\n')

In [None]:
def add_df_to_doc(current_df, document):
    """Append ``current_df`` to ``document`` as a 'Table Grid' styled table.

    The table has one header row holding the column labels, followed by one
    row per DataFrame row. Every label and value is written as ``str(...)``.

    Args:
        current_df: pandas DataFrame to render.
        document: Object exposing ``add_table(rows, cols)``; in this notebook
            a python-docx ``Document``.

    Returns:
        None. The table is added to ``document`` in place.
    """
    n_rows, n_cols = current_df.shape
    table = document.add_table(n_rows + 1, n_cols)
    table.style = 'Table Grid'

    # Fetch the row objects and the value array once, rather than indexing
    # `table.rows` and re-reading `current_df.values` inside the loops.
    table_rows = list(table.rows)
    values = current_df.values

    header_cells = table_rows[0].cells
    for j, label in enumerate(current_df.columns):
        # Labels are not guaranteed to be strings (e.g. integer labels).
        header_cells[j].text = str(label)

    for i in range(n_rows):
        row_cells = table_rows[i + 1].cells
        for j in range(n_cols):
            row_cells[j].text = str(values[i, j])

In [None]:
# Largest tables (by row count) first.
datatables_sorted = sorted(datatables, key=len, reverse=True)

document.add_heading('Extracted Tables', level=2)

for current_df in datatables_sorted:
    # Blank out the 'Unnamed...' placeholder headers for display only.
    # rename() returns a new frame, so the DataFrames held in `datatables`
    # keep their original column labels and this cell can be re-run safely.
    display_df = current_df.rename(
        columns=lambda col: '' if str(col).startswith('Unnamed') else col
    )
    add_df_to_doc(display_df, document)
    # Empty paragraph as a spacer between consecutive tables.
    document.add_paragraph('', style='Normal')

In [None]:
document.add_heading('Relevant Identifiers', level=2)

for container_json in relevant_containers:
    # The entries were serialised with json.dumps, so json.loads is the
    # matching parser. ast.literal_eval rejects JSON-only tokens such as
    # null / true / false.
    identifiers = json.loads(container_json)

    # Treat a single dict or scalar payload as a one-element list so that
    # indexing and joining below behave sensibly.
    if not isinstance(identifiers, list):
        identifiers = [identifiers]

    if identifiers and isinstance(identifiers[0], dict):
        # List of dicts: one "Key: value" paragraph per entry.
        for current_dict in identifiers:
            for key, value in current_dict.items():
                document.add_paragraph(f"{key.capitalize()}: {value}")
    else:
        # Flat list: a single space-separated paragraph. str() keeps
        # non-string items (numbers, None) from breaking the join.
        document.add_paragraph(' '.join(str(item) for item in identifiers))

In [None]:
# table = document.add_table(rows=4, cols=1)
# p = table.rows[2].cells[0].paragraphs[0]
# run = p.add_run('Topics')
# run.bold = True
# font = run.font
# font.color.rgb = RGBColor(0x42, 0x24, 0xE9)
# container = table.rows[3].cells[0].add_paragraph().add_run()
# container.add_picture(file_path('charts', f'{topic_file}.png'), width=Inches(6.0))

In [None]:
# Write the assembled document to `word_file`.
document.save(word_file) 