In [1]:
import ast
import docx
import json
import os
import pandas as pd
import papermill as pm
import scrapbook as sb
import uuid

from docx import Document
from docx.shared import Inches, RGBColor

from io import StringIO

from tqdm.notebook import tqdm_notebook
tqdm_notebook().pandas()

from pylatexenc.latexwalker import LatexWalker, LatexMacroNode, LatexCharsNode, LatexEnvironmentNode

0it [00:00, ?it/s]

In [2]:
# base_name = '2020_Andrews_GWAS_review_nihms-1645148'
# work_bucket = 'priority_1'
# external_id ='2023_05_10_82e0e0b9013821c98816g'
# human_title = '2020 Andrews GWAS'

In [3]:
# Root directory of all project data, supplied through the environment.
data_path = os.getenv('DATA_PATH')
if data_path is None:
    # Fail here with a clear message rather than with a TypeError raised by
    # os.path.join the first time a path is built.
    raise RuntimeError('The DATA_PATH environment variable is not set.')


def file_path(*args):
    """Return a path inside this document's ``mathpix`` working directory.

    Args:
        *args: Extra path components appended after the ``mathpix`` folder.

    Returns:
        str: ``<DATA_PATH>/diygenomics-projects/experiment-a/<work_bucket>/<base_name>/mathpix/<args...>``

    ``work_bucket`` and ``base_name`` are read from the notebook's global
    namespace when the function is called.
    """
    return os.path.join(data_path, 'diygenomics-projects', 'experiment-a', work_bucket,
                        base_name, 'mathpix', *args)


# Name of the index column used when reading the image results CSV.
index_col = 'uuid'

# Input file names; the LaTeX and JSON exports are named after the external id.
image_results_file = 'image_results.csv'
laytex_file = f'{external_id}.tex'
json_mmd_file = f'{external_id}.lines.mmd.json'


# Font size applied to every cell of the tables written to the Word report.
table_font_size = docx.shared.Pt(8)

In [4]:
def fix_snps_table(df):
    """Collapse the doubled multiplication sign ('xx' -> 'x') in the 'P value' column.

    The frame is modified in place and also returned. Frames without a
    'P value' column are returned unchanged.

    Args:
        df (pandas.DataFrame): Table to clean.

    Returns:
        pandas.DataFrame: The same ``df`` object.
    """
    if 'P value' in df.columns:
        # Only string cells are rewritten: missing values (NaN/None) and cells
        # already parsed as numbers have no .replace() and are passed through.
        df['P value'] = df['P value'].map(
            lambda value: value.replace('xx', 'x') if isinstance(value, str) else value
        )

    return df

In [5]:
def get_chart_title(target_text, data):
    """Return the text of the line that precedes the first line containing ``target_text``.

    Lines are scanned page by page in document order, so the preceding line may
    sit at the end of the previous page.

    Args:
        target_text (str): Substring identifying the line of interest.
        data (dict): Parsed lines file with the layout
            ``{'pages': [{'lines': [{'text': ...}, ...]}, ...]}``.

    Returns:
        str | None: The preceding line's text. ``None`` when the target is on
        the very first line, when the preceding line is a Mathpix image link,
        or when ``target_text`` does not occur at all.
    """
    previous_line = None
    for page in data['pages']:
        for line in page['lines']:
            text = line['text']
            if target_text in text:
                return previous_line

            # An image link directly before the target is not a usable title.
            previous_line = None if '![](https://cdn.mathpix.com' in text else text

    # Target never found: do not pass off the document's last line as a title.
    return None

In [6]:
def add_df_to_doc(current_df, document):
    """Append ``current_df`` to ``document`` as a 'Table Grid' styled table.

    The first table row holds the column labels; each following row holds one
    DataFrame row. Every value is written as ``str(value)`` and all runs are
    set to the notebook-level ``table_font_size``. The DataFrame index is not
    written.

    Args:
        current_df (pandas.DataFrame): Table to render.
        document: python-docx document the table is appended to.
    """
    n_rows, n_cols = current_df.shape
    table = document.add_table(n_rows + 1, n_cols)
    table.style = 'Table Grid'

    hdr_cells = table.rows[0].cells
    for j, column_label in enumerate(current_df.columns):
        # Labels may be non-string (e.g. integers); coerce them like the data cells.
        hdr_cells[j].text = str(column_label)

    # Materialise the value array once instead of rebuilding it for every cell.
    values = current_df.values
    for i in range(n_rows):
        row_cells = table.rows[i + 1].cells
        for j in range(n_cols):
            row_cells[j].text = str(values[i, j])

    # The font size is set per run, so walk every run of every cell.
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = table_font_size

In [7]:
# Load the line-level Mathpix export (<external_id>.lines.mmd.json).
# JSON is UTF-8 encoded, so do not rely on the platform's default encoding.
with open(file_path(json_mmd_file), 'r', encoding='utf-8') as f:
    mmd_file = json.load(f)

In [8]:
# Read the LaTeX export from the sub-directory named after the external id.
# The encoding is given explicitly so decoding does not depend on the locale
# (assumes the export is UTF-8 -- TODO confirm).
with open(file_path(external_id, laytex_file), 'r', encoding='utf-8') as f:
    latex_code = f.read()

# Parser used further down to locate the text that follows \maketitle.
walker = LatexWalker(latex_code)

In [9]:
# Locate the first plain-text node that follows \maketitle inside a top-level
# environment; its characters are stored in `title` (None if nothing matches).
title = None
add_title_nodes = False  # becomes True once \maketitle has been seen

# Only the list element(s) returned by get_latex_nodes() hold nodes; anything
# else is skipped by the isinstance check.
for node in walker.get_latex_nodes():
    if not isinstance(node, list):
        continue

    for sub_node in node:
        if not isinstance(sub_node, LatexEnvironmentNode):
            continue

        for child_node in sub_node.nodelist:
            if isinstance(child_node, LatexMacroNode) and child_node.macroname == 'maketitle':
                add_title_nodes = True

            if isinstance(child_node, LatexCharsNode) and add_title_nodes:
                title = child_node.chars
                break

        # Stop at the first match; otherwise a later environment could
        # overwrite the title that was just found.
        if title is not None:
            break

    if title is not None:
        break

In [10]:
# Image results CSV from the external-id sub-directory, indexed by `index_col`.
image_results_path = file_path(external_id, image_results_file)
df_image_results = pd.read_csv(image_results_path, index_col=index_col)

In [11]:
# Sort every stored Mathpix response into one of three buckets:
#   datatables      - responses carrying TSV data, parsed into DataFrames
#   asciimath       - responses carrying only an asciimath value
#   additional_text - responses whose 'data' list is present but empty
datatables = []
asciimath = []
additional_text = []

for _, result_row in df_image_results.iterrows():
    # The response is stored in the CSV as a Python literal.
    mathpix_response = ast.literal_eval(result_row['mathpix_response'])
    if 'data' not in mathpix_response:
        continue

    data = mathpix_response['data']
    if len(data) == 0:
        additional_text.append(mathpix_response['text'])
        continue

    # Keep the last value seen for each of the two types of interest.
    value_by_type = {
        image_details['type']: image_details['value']
        for image_details in data
        if image_details['type'] in ('tsv', 'asciimath')
    }
    tsv_data = value_by_type.get('tsv')
    asciimath_data = value_by_type.get('asciimath')

    # TSV takes precedence over asciimath when both are present.
    if tsv_data:
        df_image = pd.read_csv(StringIO(tsv_data), sep='\t')
        datatables.append({
            'data': df_image,
            'file_name': result_row['file_name'],
            'include_report': True,
            'export': True,
        })
    elif asciimath_data:
        asciimath.append(asciimath_data)

In [12]:
# Destination of the report inside the mathpix working directory.
word_file = file_path('insights.docx')
# Empty Word document that the following cells fill in.
document = Document()

In [13]:
# Report skeleton: document heading, the text found after \maketitle, then
# the heading under which the extracted tables are appended.
document.add_heading(human_title)
document.add_heading('Relevant Identifiers')
# `title` stays None when no text was found after \maketitle; write an empty
# paragraph in that case instead of failing on None.rstrip().
document.add_paragraph(title.rstrip() if title is not None else '')
document.add_heading('Extracted Tables')

<docx.text.paragraph.Paragraph at 0x12edf7ad0>

In [14]:
# Sanity check: number of image results that yielded a TSV table.
len(datatables)

4

In [15]:
# datatables[3]

In [16]:
# index = 3
# datatables[index]['include_report'] = True
# datatables[index]['export'] = True

In [17]:
# Build the tables for the report: clean each selected table, give its rows
# unique ids, attach a caption, optionally export it to CSV, then append all
# tables to the Word document, longest first.
report_datatables = []

for data_info in datatables:
    if not data_info['include_report']:
        continue

    # Work on a copy so the frames held in `datatables` stay untouched and
    # this cell gives the same result when it is run again.
    df_snps = fix_snps_table(data_info['data'].copy())

    # One fresh UUID per row, used as the table's index.
    df_snps['uuid'] = [uuid.uuid4() for _ in range(len(df_snps))]
    df_snps = df_snps.set_index('uuid')

    # Caption: the line preceding the image's file name in the lines file.
    df_snps.attrs['title'] = get_chart_title(data_info['file_name'], mmd_file)
    report_datatables.append(df_snps)

    # Honour the per-table 'export' flag (entries without it are exported).
    if data_info.get('export', True):
        extracted_table_file_name = data_info['file_name'].replace('.', '_').replace('(', '_').replace(')', '_')
        df_snps.to_csv(file_path(f'extracted_table_for_{extracted_table_file_name}.csv'))

datatables_sorted = sorted(report_datatables, key=len, reverse=True)

for current_df in datatables_sorted:
    table_title = current_df.attrs['title']
    # Columns whose label starts with 'Unnamed' are shown with a blank header;
    # str() keeps non-string labels from breaking startswith.
    report_df = current_df.rename(
        columns=lambda col: '' if str(col).startswith('Unnamed') else col
    )
    document.add_paragraph(table_title)
    add_df_to_doc(report_df, document)
    # Empty paragraph as a spacer between consecutive tables.
    document.add_paragraph('', style='Normal')

In [18]:
# Write the assembled report to the insights.docx path built earlier.
document.save(word_file) 

In [19]:
# Record a 'status' scrap with the value 'completed' in this notebook
# (presumably read back by whatever executes the notebook -- verify with the caller).
sb.glue('status', 'completed')