In [1]:
import pandas as pd
import re
from unidecode import unidecode
from tqdm import tqdm
tqdm.pandas()

from collections import Counter

from datetime import datetime
import networkx as nx

import sys
sys.path.append('../src')

from utils_tiramisu import *

# Explicit imports stay after the star import so they win over any same-named export.
import itertools
import random
from pathlib import Path

import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

`../cache/pdfs_word_excel_powerpoint_010924.parquet` is a Parquet file holding a pandas DataFrame with the combined text of the scanned/electronic PDFs and the MS Office documents. It has two columns: `text`, the raw text, and `nodeID`, the node ID of the split single-page PDF or MS Office document that the text came from.

In [None]:
# Cached text table; per the note above it has one row per node with `text` and `nodeID`.
together = pd.read_parquet(
    "../cache/pdfs_word_excel_powerpoint_010924.parquet"
)
# together = pd.merge(nhgri_text.reset_index(drop = True).reset_index(), nhgri_text_paths, on="nodeID")
# One row per split PDF page: the page's nodeID, its page number, the nodeID of the
# Document it is part of, and the original PDF's path.
# NOTE(review): return_from_neo4j is not defined in this notebook (presumably it comes
# from utils_tiramisu); its result is used below as a DataFrame.
map_nodeID_to_docID = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:PART_OF] -> (d:Document) 
where e.fileExtension = 'pdf' 
return c.nodeID as nodeID, c.page as page, d.nodeID as documentID, e.originalPath as path
""")
# Split PDF pages that were also converted to a PNG, with the original PDF's path.
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Word and PowerPoint files; these are referenced by their own nodeID (no page split).
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")


# nodeID -> original path / extension for every PDF page and MS document.
folder_structure = pd.concat([all_pdfs, all_ms])


# Lookups from a split-page nodeID to its page number and its parent documentID.
# `map_nodeID_to_docID` is rebound here from the query result (DataFrame) to a dict.
map_nodeID_to_page = map_nodeID_to_docID.set_index('nodeID').to_dict()['page']
map_nodeID_to_docID = map_nodeID_to_docID.set_index('nodeID').to_dict()['documentID']

# Nodes that have no Document mapping keep their own nodeID as docID and get page 0.
together['docID'] = together['nodeID'].apply(lambda node_id: map_nodeID_to_docID.get(node_id, node_id))
together['page'] = together['nodeID'].apply(lambda node_id: map_nodeID_to_page.get(node_id, 0))
# Inner join: rows whose nodeID is absent from folder_structure are dropped.
together = pd.merge(together, folder_structure, on='nodeID')

all_excel = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['xls', 'xlsx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Exclude spreadsheets. `.copy()` makes the filtered frame independent so the column
# assignment below does not raise SettingWithCopyWarning.
together = together.loc[~together.nodeID.isin(all_excel['nodeID'].to_list())].copy()

# Trailing space keeps words from adjacent pages apart once the pages are concatenated.
together['text'] = together['text'] + " "

# Concatenate page texts in page order into one row per document. The path is taken
# from the first page in that order; the previous `set` + `list(x)[0]` picked an
# arbitrary path (order could differ between runs) when a document had several.
together = (
    together.sort_values(['docID', 'page'])
    .groupby('docID')
    .agg({"text": "sum", "path": "first"})
    .reset_index()
)

together['text'] = together['text'].str.lower()
# Project label -> folders or single files (relative to the archive root) whose
# contents are attributed to that project.
project_folders = {
    "ENCODE": [
        "ENCODE/Participants",
        "ENCODE/MS",
        "ENCODE/SAP",
        "ENCODE/OC Information",
        "ENCODE/PressRelease",
        "ENCODE/ENCODE_2004",
        "ENCODE/publications",
        "ENCODE/Drafts",
        "ENCODE/Data Standards",
        "ENCODE/encode_align_sop.pdf",
        "ENCODE/ENCODE-PublicationGuidelines 3-29-06.doc",
        "ENCODE/Minutes",
        "ENCODE/CACR",
        "ENCODE/SAP call minutes 3-15-06.doc",
        "ENCODE/Data release",
        "ENCODE/Abstracts",
        "ENCODE/Presentations",
        "ENCODE/Scaling",
        "ENCODE/Meeting",
        "ENCODE/MS2",
        "ENCODE/WorkingGroups",
        "ENCODE/Documents",
        "ENCODE/criteria",
        "ENCODE/Web_site",
        "ENCODE/Hox.doc",
        "ENCODE/Policy",
    ],
    "modENCODE": [
        "ENCODE/modENCODE",
        "modENCODE",
    ],
    "HapMap": [
        'Haplotype Map Project',
    ],
    "HGP": [
        "Large scale sequence/human sequence",
        "Celera",
        "HGP History Summer 2011",
        "sequencingrampupfiles",
    ],
    "sequence": [
        "Large scale sequence/Box026-010.pdf",
        "Sequence target files",
    ],
    "ELSI": [
        "ELSI",
    ],
}

# Resolve every configured folder/file to an absolute Path once, instead of rebuilding
# the same Path objects for every document.
project_roots = {
    project: [Path("/tiramisu/" + subfolder) for subfolder in subfolders]
    for project, subfolders in project_folders.items()
}

# One (is_member, project, docID, path) record per document/project combination.
list_of_entities = []

for doc_id, doc_path in tqdm(zip(together['docID'], together['path']), total = together.shape[0]):
    resolved = Path(doc_path)
    ancestors = set(resolved.parents)
    for project, roots in project_roots.items():
        # A document belongs to a project when it sits anywhere below one of the
        # project's folders, or is itself one of the explicitly listed files.
        is_member = any(root in ancestors or root == resolved for root in roots)
        list_of_entities.append((is_member, project, doc_id, doc_path))

# `entity` is the membership flag and `text` the project label.
projects_df = pd.DataFrame(list_of_entities, columns = ["entity", "text", 'docID', 'path'])


In [3]:
def _member_doc_ids(label):
    """Return the unique docIDs of documents flagged as members of project `label`."""
    is_member = (projects_df.text == label) & (projects_df.entity)
    return projects_df.loc[is_member].docID.unique()


# One array of member docIDs per project label.
hgp = _member_doc_ids("HGP")
hapmap = _member_doc_ids("HapMap")
lsac = _member_doc_ids("sequence")
encode = _member_doc_ids("ENCODE")
modencode = _member_doc_ids("modENCODE")
elsi = _member_doc_ids("ELSI")

We load the email pairs detected in the other notebook, [email-pairs](email-pairs.ipynb).

In [4]:
# Cached email pairs; later cells read its documentID, To/From, *_org, *_category and date columns.
all_email_pairs = pd.read_parquet("../cache/all_email_pairs_240520.parquet")

In [5]:
# Flag each email pair with the project(s) its source document belongs to.
# Series.isin does one vectorised membership test per column, replacing the
# per-row `x in <ndarray>` check, which scanned the whole docID array for every row.
project_doc_ids = {
    "hgp": hgp,
    "hapmap": hapmap,
    "lsac": lsac,
    "encode": encode,
    "modencode": modencode,
    "elsi": elsi,
}
for flag_column, doc_ids in project_doc_ids.items():
    all_email_pairs[flag_column] = all_email_pairs['documentID'].isin(doc_ids)

In [None]:
# Display (rows, columns) after the project flag columns were added.
all_email_pairs.shape

In [None]:
# Display the column names available on the email-pair table.
all_email_pairs.columns

In [56]:
# Store the recipient and sender IDs as plain integers.
for id_column in ('To', 'From'):
    all_email_pairs[id_column] = all_email_pairs[id_column].astype(int)

In [64]:
# One row per person ID that appears as a recipient or a sender.
node_list = pd.DataFrame(
    set(all_email_pairs['To'].tolist()).union(set(all_email_pairs['From'].tolist())),
    columns = ['node'],
)

# Stack the recipient-side and sender-side columns so a single lookup covers both roles.
person_ids = np.hstack([all_email_pairs['To'], all_email_pairs['From']])
person_orgs = np.hstack([all_email_pairs['To_org'], all_email_pairs['From_org']])
person_categories = np.hstack([all_email_pairs['To_category'], all_email_pairs['From_category']])

org_by_person = dict(zip(person_ids, person_orgs))
# Built once here; previously this dict was rebuilt inside the lambda for every row.
category_by_org = dict(zip(person_orgs, person_categories))

node_list['org'] = node_list['node'].map(org_by_person)
# .map leaves an org with no matching key as NaN instead of raising KeyError (e.g. a NaN org).
node_list['category'] = node_list['org'].map(category_by_org)
node_list['ID'] = node_list['node']
node_list.to_csv('../models/email_clean_manual/nodes_240520_not_randomized.csv', index=False)

In [None]:
# Display (rows, columns) again before the IDs are randomized.
all_email_pairs.shape

We randomize the keys before publishing the supplementary data.

In [42]:
import random
import numpy as np

In [None]:
# Number of distinct person IDs across recipients and senders.
len(set(all_email_pairs.To.astype(int).to_list()) | set(all_email_pairs.From.astype(int).to_list()))

In [32]:
# Distinct person IDs across recipients and senders. The order of this list decides
# which randomized key each ID is paired with when the mapping is built.
original_list = list(set(all_email_pairs.To.astype(int).to_list() + all_email_pairs.From.astype(int).to_list()))

In [31]:
# Random permutation of anonymous integer keys, at least one per distinct person ID.
# The pool keeps the historical 0-499 range as a minimum but grows with the data, so
# IDs past the 500th no longer silently end up without a key when zipped.
# No seed is set, so a different permutation is drawn on every run.
n_keys = max(500, len(original_list))
new_keys = random.sample(range(0, n_keys), n_keys)

In [33]:
# Original person ID -> randomized key. zip() stops at the shorter input, so an ID
# positioned beyond len(new_keys) would get no entry in the mapping.
mapping = dict(zip(original_list, new_keys))

In [37]:
# Cast the ID columns to int so they line up with the int keys of `mapping`
# (this repeats the cast already applied in an earlier cell).
all_email_pairs['To'] = all_email_pairs['To'].astype(int)
all_email_pairs['From'] = all_email_pairs['From'].astype(int)

In [38]:
# Add the randomized recipient and sender IDs alongside the original ones.
all_email_pairs = all_email_pairs.assign(
    To_new=all_email_pairs['To'].map(mapping),
    From_new=all_email_pairs['From'].map(mapping),
)

In [63]:
# One row per randomized person ID that appears as a recipient or a sender.
node_list = pd.DataFrame(
    set(all_email_pairs['To_new'].tolist()).union(set(all_email_pairs['From_new'].tolist())),
    columns = ['node'],
)

# Stack the recipient-side and sender-side columns so a single lookup covers both roles.
person_ids = np.hstack([all_email_pairs['To_new'], all_email_pairs['From_new']])
person_orgs = np.hstack([all_email_pairs['To_org'], all_email_pairs['From_org']])
person_categories = np.hstack([all_email_pairs['To_category'], all_email_pairs['From_category']])

org_by_person = dict(zip(person_ids, person_orgs))
# Built once here; previously this dict was rebuilt inside the lambda for every row.
category_by_org = dict(zip(person_orgs, person_categories))

node_list['org'] = node_list['node'].map(org_by_person)
# .map leaves an org with no matching key as NaN instead of raising KeyError (e.g. a NaN org).
node_list['category'] = node_list['org'].map(category_by_org)
node_list['ID'] = node_list['node']
node_list.to_csv('../models/email_clean_manual/nodes_240520.csv', index=False)

In [53]:
def projects(row):
    """Return the single project label for one email-pair row.

    The boolean flags are checked in a fixed precedence order
    (hgp, elsi, hapmap, lsac); the first truthy flag decides the label.
    Rows with none of these flags set are labelled "Other".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the flag names 'hgp', 'elsi', 'hapmap', 'lsac'.

    Returns
    -------
    str
        One of "HGP", "ELSI", "HapMap", "LSAC" or "Other".
    """
    precedence = (
        ('hgp', "HGP"),
        ('elsi', 'ELSI'),
        ('hapmap', 'HapMap'),
        ('lsac', "LSAC"),
    )
    for flag, label in precedence:
        if row[flag]:
            return label
    return "Other"

In [54]:
# Label every email pair with its project, evaluated row by row.
all_email_pairs['project'] = all_email_pairs.apply(projects, axis = 1)

In [55]:
# Edge list: one row per (sender, recipient, project) combination. WEIGHT is the
# number of rows in that group with a non-null `date`.
edges = (
    all_email_pairs[['From_new', 'To_new', 'date', 'project']]
    .groupby(["From_new", "To_new", "project"])
    .count()
    .reset_index()
    .rename(columns={'From_new': 'SOURCE', 'To_new': 'TARGET', 'date': 'WEIGHT'})
)
edges.to_csv('../models/email_clean_manual/edges_240520.csv', index = False)

We do all of the network visualization in Gephi.