In [None]:
import ast
import json
import os
from io import StringIO

import pandas as pd
from docx import Document
from docx.shared import Inches, RGBColor
from pylatexenc.latex2text import LatexNodes2Text
from tqdm.notebook import tqdm_notebook

tqdm_notebook().pandas()

In [None]:
# Client / project the report is generated for.
product_name = 'mel_swan'
project_name = 'diygenomics'

# Name of the source paper and the external id under which its results are stored.
original_file = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease'
external_id = '2023_05_02_27142069922ab9506d3dg'

# Root data directory, read from the environment so no absolute path is hard-coded.
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's ``mathpix`` directory.

    Args:
        *args: Extra path components appended after ``.../mathpix``.

    Returns:
        str: ``data_path/eric-client-projects/<product>/<project>/experiment-a/<paper>/mathpix/<args...>``.

    Raises:
        RuntimeError: If the ``DATA_PATH`` environment variable was not set
            when this cell ran (``data_path`` is ``None``).
    """
    if data_path is None:
        # Fail with an actionable message instead of a TypeError from os.path.join.
        raise RuntimeError('DATA_PATH environment variable is not set')
    return os.path.join(data_path, 'eric-client-projects', product_name, project_name,
                        'experiment-a', original_file, 'mathpix', *args)


# Column used as the index when loading the CSV inputs.
index_col = 'uuid'

# Base names of the input files read by the cells below.
summary_file = 'summary_of_summaries'
topic_file = 'topics'
image_results_file = 'image_results.csv'
snps_file = f'gpt_snps_{external_id}.csv'

In [None]:
# Image results CSV stored under the external id; rows are indexed by `index_col`.
image_results_path = file_path(external_id, image_results_file)
df_image_results = pd.read_csv(image_results_path, index_col=index_col)

In [None]:
# SNP results CSV (`gpt_snps_<external_id>.csv`); rows are indexed by `index_col`.
snps_path = file_path(snps_file)
df_snps = pd.read_csv(snps_path, index_col=index_col)

In [None]:
# Outputs of this cell, filled from the `mathpix_response` column:
datatables = []        # one DataFrame per response that carries a TSV payload
asciimath = []         # AsciiMath strings for responses without a TSV payload
additional_text = []   # plain `text` of responses that have no structured data

for raw_response in df_image_results['mathpix_response']:
    # The column holds the repr of a Python dict, so it is parsed with
    # literal_eval (which only accepts literals and never executes code).
    mathpix_response = ast.literal_eval(raw_response)

    # A missing `data` key is treated like an empty list so that the text of
    # such responses is kept instead of being silently dropped.
    data = mathpix_response.get('data') or []
    if not data:
        text = mathpix_response.get('text')
        if text is not None:
            additional_text.append(text)
        continue

    # When several entries share a type, the last one wins.
    tsv_data = None
    asciimath_data = None
    for image_details in data:
        if image_details['type'] == 'tsv':
            tsv_data = image_details['value']
        elif image_details['type'] == 'asciimath':
            asciimath_data = image_details['value']

    # A TSV payload takes precedence over AsciiMath for the same image.
    if tsv_data:
        datatables.append(pd.read_csv(StringIO(tsv_data), sep='\t'))
    elif asciimath_data:
        asciimath.append(asciimath_data)

In [None]:
# Every file currently present in the `charts` directory.
file_list = os.listdir(file_path('charts'))
df_charts = pd.DataFrame({'filename': file_list})

# Keep names that start with 'topic_' but do not contain `topic_file`,
# then return them as a sorted plain list.
chart_names = df_charts['filename']
with_topic_prefix = chart_names[chart_names.str.startswith('topic_')]
without_topic_file = with_topic_prefix[~with_topic_prefix.str.contains(topic_file)]
topics_files = without_topic_file.tolist()
topics_files.sort()

In [None]:
# Load the whole summary text file into memory.
summary_path = file_path(f'{summary_file}.txt')
with open(summary_path, 'r') as summary_handle:
    main_summary = summary_handle.read()

In [None]:
# Destination of the generated Word report.
word_file = file_path('insights.docx')

# Empty document that the following cells append to.
document = Document()

In [None]:
# Report title (added with add_heading's default level), then the summary section.
report_title = f'Insights for {project_name.capitalize()}'
document.add_heading(report_title)

document.add_heading('Summary', level=2)
document.add_paragraph(main_summary)

In [None]:
snps_identifiers = []
relevant_identifiers = []

# Key looked up in each parsed `snps` value -> list that collects its JSON dump.
collectors = {
    'SNP_identifiers': snps_identifiers,
    'relevant_identifiers': relevant_identifiers,
}

for raw_snps in df_snps['snps']:
    # Rows with no `snps` value contribute nothing.
    if pd.isna(raw_snps):
        continue
    snps = ast.literal_eval(raw_snps)
    for key, collected in collectors.items():
        # Only non-empty entries are recorded, serialised as JSON text.
        if key in snps and len(snps[key]) > 0:
            collected.append(json.dumps(snps[key]))

In [None]:
# One section per identifier kind; every collected JSON string becomes its own
# paragraph. Paragraphs are added to `document`, the object created earlier
# in the notebook.
identifier_sections = (
    ('Snps Identifiers', snps_identifiers),
    ('Relevant Identifiers', relevant_identifiers),
)

for section_title, section_items in identifier_sections:
    document.add_heading(section_title, level=2)
    for section_item in section_items:
        document.add_paragraph(section_item)

In [None]:
def add_df_to_doc(df, doc=None):
    """Append `df` to a Word document as a 'Table Grid' table.

    The table has one header row holding the column labels followed by one
    row per DataFrame row; every value is written as its `str()` form.

    Args:
        df: DataFrame to render.
        doc: Document-like object exposing `add_table(rows, cols)`. Defaults
            to the notebook-level `document`.

    Returns:
        None. The table is added to `doc` as a side effect.
    """
    if doc is None:
        # Resolved at call time so the function can be defined before `document`.
        doc = document

    n_rows, n_cols = df.shape
    table = doc.add_table(n_rows + 1, n_cols)
    table.style = 'Table Grid'

    # Header row: labels are coerced to str because cell text must be a string.
    header_cells = table.rows[0].cells
    for col_idx, column_label in enumerate(df.columns):
        header_cells[col_idx].text = str(column_label)

    # Materialise the values once instead of once per cell.
    values = df.values
    for row_idx in range(n_rows):
        row_cells = table.rows[row_idx + 1].cells
        for col_idx in range(n_cols):
            row_cells[col_idx].text = str(values[row_idx, col_idx])

In [None]:
# Render the tables from fewest rows to most rows.
datatables_sorted = sorted(datatables, key=len)

document.add_heading('Extracted Tables', level=2)

for df in datatables_sorted:
    # Blank out pandas' 'Unnamed: N' placeholder headers for display only.
    # `rename` returns a copy, so the frames kept in `datatables` stay
    # untouched and the cell can be re-run safely; `str(col)` guards against
    # non-string column labels.
    df_display = df.rename(columns=lambda col: '' if str(col).startswith('Unnamed') else col)
    add_df_to_doc(df_display)

In [None]:
# Topics section: a one-column table whose third row holds the bold, coloured
# 'Topics' caption and whose fourth row holds the chart image(s).
# NOTE(review): rows 0 and 1 are left empty, as in the original layout — confirm this is intended.
table = document.add_table(rows=4, cols=1)
p = table.rows[2].cells[0].paragraphs[0]
run = p.add_run('Topics')
run.bold = True
font = run.font
font.color.rgb = RGBColor(0x42, 0x24, 0xE9)

# The chart file name is '<topic_file>_<i>_star.png', but no index `i` is
# defined anywhere in the notebook. Instead of a single undefined index,
# embed every chart in the charts directory that matches the pattern.
# NOTE(review): assumes all matching star charts should appear in the report — confirm.
star_prefix = f'{topic_file}_'
star_suffix = '_star.png'
star_charts = sorted(
    (
        name for name in os.listdir(file_path('charts'))
        if name.startswith(star_prefix)
        and name.endswith(star_suffix)
        and len(name) > len(star_prefix) + len(star_suffix)
    ),
    # Shorter names first so that '..._2_...' sorts before '..._10_...'.
    key=lambda name: (len(name), name),
)

chart_cell = table.rows[3].cells[0]
for chart_name in star_charts:
    # One paragraph and run per picture, mirroring the original single-picture layout.
    sentiment_container = chart_cell.add_paragraph().add_run()
    sentiment_container.add_picture(file_path('charts', chart_name), width=Inches(6.0))

In [None]:
# Write the assembled report to `word_file` (the insights.docx path defined earlier).
document.save(word_file) 