In [1]:
import logging
import os
import re
import sys

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
sys.path.append(os.path.abspath(os.path.join('..')))
from src import *
import fitz

2024-07-28 01:22:50,898 - INFO - NumExpr defaulting to 8 threads.


In [2]:
# Project layout, resolved from the kernel's working directory:
#   <parent of cwd>/data/pdf  - PDFs handed to process_pdfs_in_folder
#   <parent of cwd>/data/csv  - where the parsed CSVs are written
SCRIPT_DIR = os.getcwd()
PARENT_DIR = os.path.split(SCRIPT_DIR)[0]
DATA_FOLDER_PATH = os.path.join(PARENT_DIR, 'data')
PDF_FOLDER_PATH, CSV_FOLDER_PATH = (
    os.path.join(DATA_FOLDER_PATH, leaf) for leaf in ('pdf', 'csv')
)

# The output folder may not exist yet; creating it is a no-op when it does.
os.makedirs(CSV_FOLDER_PATH, exist_ok=True)

# Topic name -> inclusive (first_id, last_id) abstract-number range.
# Used to fill the 'header' column for the 2019 file.
topic_mapping_2019 = {
    'Precision Medicine, Pharmacogenomics, and Genetic Therapies': (372, 505),
    'Prenatal, Perinatal, Reproductive, and Developmental Genetics': (506, 620),
    'Genetic Counseling, ELSI, Education, and Health Services Research': (621, 750),
    'Cancer Genetics': (751, 1082),
    'Mendelian Phenotypes': (1083, 1398),
    'Bioinformatics and Computational Approaches': (1399, 1746),
    'Molecular Phenotyping and Omics Technologies': (1747, 1886),
    'Complex Traits and Polygenic Disorders': (1887, 2297),
    'Evolution and Population Genetics': (2298, 2410),
    'Molecular and Cytogenetic Diagnostics': (2411, 2617),
    'Cardiovascular Phenotypes': (2618, 2793),
    'Statistical Genetics and Genetic Epidemiology': (2794, 3063),
    'Molecular Effects of Genetic Variation': (3064, 3212),
    'Epigenetics and Gene Regulation': (3213, 3364),
}

def assign_topic_2019(id):
    """Return the 2019 topic whose inclusive id range contains ``id``.

    Parameters
    ----------
    id : int or str
        Abstract number, in any form ``int()`` accepts. The parameter name
        shadows the builtin ``id`` but is kept so keyword callers still work.

    Returns
    -------
    str or None
        The matching topic name. ``None`` when the id falls outside every
        range, or when it cannot be interpreted as an integer (``None``, NaN,
        empty or non-numeric text).
    """
    # Convert once, up front. A single malformed id must not abort a whole
    # DataFrame.apply() pass, so unparseable values map to "no topic".
    try:
        number = int(id)
    except (TypeError, ValueError, OverflowError):
        return None

    for topic, (start, end) in topic_mapping_2019.items():
        if start <= number <= end:
            return topic
    return None

# Lookup table: key is a single year or a ``range`` of years; value is a
# (parser, topic_assigner) pair, with topic_assigner set to None when unused.
parsers = {
    range(2013, 2019): (parser_13_to_18, None),
    2019: (parser_19, assign_topic_2019),
    2021: (parser_21, None),
    2022: (parser_22, None),
}


def get_parser(year, filename):
    """Pick the (parser, topic_assigner) pair for a PDF.

    The ``parsers`` table is consulted first, in insertion order. 2023 is not
    in the table because the choice also depends on whether ``filename``
    contains 'Poster' (case-sensitive).

    Returns None when no parser is registered for ``year``.
    """
    for key, info in parsers.items():
        hit = (year in key) if isinstance(key, range) else (year == key)
        if hit:
            return info

    if year != 2023:
        return None

    parser_2023 = parser_23_poster if 'Poster' in filename else parser_23_non_poster
    return (parser_2023, None)

def process_pdf(file_path, parser, topic_assigner=None):
    """Parse a single PDF and optionally derive a 'header' column from its ids.

    Parameters
    ----------
    file_path : str
        Path of the PDF to open with ``fitz.open``.
    parser : callable
        Called with the opened document; its return value is used as the
        result (it must support ``df['id']`` / ``df['header'] = ...`` when a
        ``topic_assigner`` is given).
    topic_assigner : callable, optional
        Applied to every value of the 'id' column to produce 'header'.

    Returns
    -------
    The object returned by ``parser``, with 'header' added when requested.
    """
    file = fitz.open(file_path)
    try:
        df = parser(file)
    finally:
        # Always release the document, including when the parser raises.
        # Otherwise every processed PDF keeps a file handle open for the
        # lifetime of the kernel.
        file.close()

    if topic_assigner:
        df['header'] = df['id'].apply(topic_assigner)
    return df

def _save_dataframe(df, year, csv_name):
    """Stamp ``df`` with ``year`` and write it to ``CSV_FOLDER_PATH/csv_name``."""
    df['year'] = year
    csv_path = os.path.join(CSV_FOLDER_PATH, csv_name)
    df.to_csv(csv_path, index=False, escapechar='\\')
    logging.info(f'Saved DataFrame to {csv_path}')


def process_pdfs_in_folder(folder_path):
    """Parse every ``*.pdf`` in ``folder_path`` and write the results as CSV.

    The year is taken from the first run of four digits in the file name and
    selects the parser via ``get_parser``. Files without a year, or with a
    year that has no parser, are logged and skipped. Output goes to
    ``CSV_FOLDER_PATH`` under the PDF's base name.

    2021 is handled separately: ``parser_21`` yields several DataFrames,
    each saved to its own ``<name>-<suffix>.csv``.

    Parameters
    ----------
    folder_path : str
        Directory containing the PDFs.
    """
    # os.listdir order is arbitrary; sort so runs and their logs are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue

        match = re.search(r'(\d{4})', filename)
        if not match:
            logging.info(f'Skipping {filename} (No year found)')
            continue

        year = int(match.group(1))
        file_path = os.path.join(folder_path, filename)
        var_name = os.path.splitext(filename)[0]

        if year == 2021:
            logging.info(f'Processing {filename}')
            suffixes = ['Plenary', 'Platform', 'Talks', 'Presentations']
            file = fitz.open(file_path)
            try:
                dfs = parser_21(file)
                # Save while the document is still open in case the parser
                # produces its frames lazily.
                for df, suffix in zip(dfs, suffixes):
                    _save_dataframe(df, year, f'{var_name}-{suffix}.csv')
            finally:
                # The original never closed this handle.
                file.close()
            continue

        parser_info = get_parser(year, filename)
        if not parser_info:
            logging.info(f'No parser found for {filename} (Year: {year})')
            continue

        parser, topic_assigner = parser_info
        logging.info(f'Processing {filename}')
        df = process_pdf(file_path, parser, topic_assigner)
        _save_dataframe(df, year, f'{var_name}.csv')

In [3]:
# Run the pipeline: parse every PDF found in PDF_FOLDER_PATH and write the CSVs to CSV_FOLDER_PATH.
process_pdfs_in_folder(PDF_FOLDER_PATH)

2024-07-28 01:22:53,530 - INFO - Processing 2013-poster-abstracts.pdf
2024-07-28 01:23:25,921 - INFO - Saved DataFrame to /Users/shunji/ashg-topic-trend-analysis/data/csv/2013-poster-abstracts.csv
2024-07-28 01:23:25,923 - INFO - Processing 2016-plenary_platform_abstracts.pdf
2024-07-28 01:23:30,058 - INFO - Saved DataFrame to /Users/shunji/ashg-topic-trend-analysis/data/csv/2016-plenary_platform_abstracts.csv
2024-07-28 01:23:30,059 - INFO - Processing 2016-poster-abstracts.pdf
2024-07-28 01:24:05,416 - INFO - Saved DataFrame to /Users/shunji/ashg-topic-trend-analysis/data/csv/2016-poster-abstracts.csv
2024-07-28 01:24:05,416 - INFO - Processing 2018-poster-abstracts.pdf
2024-07-28 01:24:42,380 - INFO - Saved DataFrame to /Users/shunji/ashg-topic-trend-analysis/data/csv/2018-poster-abstracts.csv
2024-07-28 01:24:42,381 - INFO - Processing ASHG2023-PosterAbstracts.pdf
2024-07-28 01:25:22,690 - INFO - Saved DataFrame to /Users/shunji/ashg-topic-trend-analysis/data/csv/ASHG2023-PosterAbs

In [6]:
# Open the 2016 poster-abstract PDF directly so individual pages can be inspected below.
# NOTE(review): this document is never closed in the visible cells.
po_2016_pdf = fitz.open(os.path.join(PDF_FOLDER_PATH, '2016-poster-abstracts.pdf'))

In [7]:
# Dump the block/line/span structure of page index 37 (font, size, colour, bounding boxes per span).
po_2016_pdf[37].get_text('dict')

{'width': 612.0,
 'height': 792.0,
 'blocks': [{'number': 0,
   'type': 0,
   'bbox': (223.2689971923828,
    18.40472412109375,
    575.9984130859375,
    28.45941162109375),
   'lines': [{'spans': [{'size': 7.5,
       'flags': 16,
       'font': 'Arial-BoldMT',
       'color': 526601,
       'ascender': 0.9052734375,
       'descender': -0.2119140625,
       'text': 'Statistical Genetics and Genetic Epidemiology',
       'origin': (223.2689971923828, 25.460205078125),
       'bbox': (223.2689971923828,
        18.670654296875,
        388.7425842285156,
        27.049560546875)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (223.2689971923828,
      18.670654296875,
      388.7425842285156,
      27.049560546875)},
    {'spans': [{'size': 9.0,
       'flags': 16,
       'font': 'Arial-BoldMT',
       'color': 65793,
       'ascender': 0.9052734375,
       'descender': -0.2119140625,
       'text': '37',
       'origin': (565.989013671875, 26.55218505859375),
       'bbox':

In [8]:
# Scratch check: split a sample title on the NUL character embedded in it
# (presumably where the PDF had a hyphen, as in "Two-component" - TODO confirm).
text = 'Two\x00component mixture modelling approach integrating genetic and '
# NOTE(review): the trailing `*2` only repeats the resulting list; looks like leftover experimentation.
text.split('\x00')*2

['Two',
 'component mixture modelling approach integrating genetic and ',
 'Two',
 'component mixture modelling approach integrating genetic and ']