In [402]:
import sys

sys.path.append("./gustav/src")
from gustav import ncbi, nlm

import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np

import sys
sys.path.append('../src/')

from utils_tiramisu import *

from pathlib import Path

# this is the same TIRAMISU_PATH as shown in start_here.ipynb
# TODO: set this to your local TIRAMISU root before running the notebook. The value is
# deliberately not shipped with the notebook; an empty right-hand side is a SyntaxError
# that prevents the whole import cell from running, so use an explicit placeholder.
TIRAMISU_PATH = None

from scipy.stats import fisher_exact
from statsmodels.sandbox.stats.multicomp import multipletests
from tqdm.auto import tqdm
from collections import Counter

from datetime import datetime

# load biomedical mesh information

In [4]:
# Load the MeSH descriptor table and keep the rows whose qualifier is "MH" or "ENTRY".
# .copy() makes the filtered frame independent of the full table, so the column
# assignments below do not operate on a slice (avoids pandas' SettingWithCopyWarning).
mesh = nlm.mesh('descriptor')
mesh = mesh.loc[(mesh.qualifier == "MH") | (mesh.qualifier == "ENTRY")].copy()


def _invert_mesh_heading(value):
    """Rewrite a two-part comma heading "B, A" as "A B"; return any other value unchanged."""
    parts = value.split(",")
    if len(parts) == 2:
        return parts[1].strip() + " " + parts[0].strip()
    return value


mesh['value'] = mesh['value'].apply(_invert_mesh_heading)
# lower-case every term and append a single trailing space
mesh['value'] = mesh['value'].apply(lambda x: x.lower() + " ")

In [5]:
# Load the "ui2mn" table and join it onto the term table (inner join: mesh.UI == tree.ui),
# so every term row carries the `mn` values of its UI.
tree = nlm.mesh("ui2mn")
merged = mesh.merge(tree, left_on = "UI", right_on = "ui")

In [None]:
# Keep the UIs that have at least one `mn` starting with "E" or "G" and gather all of
# their terms into one list per UI.
in_e_or_g_branch = (merged.mn.str.startswith("E")) | (merged.mn.str.startswith("G"))
total_mesh = merged.loc[in_e_or_g_branch].groupby("UI").agg({"value": list}).reset_index()

# One compiled regex per UI (same order as the rows of total_mesh): an alternation of the
# UI's lower-cased, stripped terms, each wrapped in word boundaries.
mesh_patterns = []
for terms in tqdm(total_mesh['value'], total = total_mesh.shape[0]):
    alternatives = ["\\b" + re.escape(term.lower().strip()) + "\\b" for term in terms]
    mesh_patterns.append(re.compile("|".join(alternatives)))

`../cache/pdfs_word_excel_powerpoint_010924.parquet` is a Pandas DataFrame that contains the combined text of the scanned/electronic PDFs and the MS documents. It has two columns: `text`, the raw extracted text, and `nodeID`, the node ID of the corresponding split single-page PDF or MS document.

In [7]:

# this is the compilation of all of the extracted text
together = pd.read_parquet(
    "../cache/pdfs_word_excel_powerpoint_010924.parquet"
)
# together = pd.merge(nhgri_text.reset_index(drop = True).reset_index(), nhgri_text_paths, on="nodeID")

# For every single-page split of a PDF: its nodeID, its page, the nodeID of the Document
# it is part of, and the original path of the source PDF.
map_nodeID_to_docID = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:PART_OF] -> (d:Document) 
where e.fileExtension = 'pdf' 
return c.nodeID as nodeID, c.page as page, d.nodeID as documentID, e.originalPath as path
""")
# PDF splits that have a 'png' file converted from them, with the original path and
# extension of the source PDF.
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Word / PowerPoint files, identified by their own nodeID.
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")


# nodeID -> (path, fileExtension) lookup table for PDF splits and MS files together
folder_structure = pd.concat([all_pdfs, all_ms])


# Turn the query result into plain dicts keyed by nodeID.
# Note that map_nodeID_to_docID is rebound here from a DataFrame to a dict.
map_nodeID_to_page = map_nodeID_to_docID.set_index('nodeID').to_dict()['page']
# map_nodeID_to_path = map_nodeID_to_docID.set_index("nodeID").to_dict()['path']
map_nodeID_to_docID = map_nodeID_to_docID.set_index('nodeID').to_dict()['documentID']

# nodeIDs without a mapping (everything that is not a PDF split returned by the first
# query) keep their own nodeID as docID and get page 0.
together['docID'] = together['nodeID'].apply(lambda x: map_nodeID_to_docID[x] if x in map_nodeID_to_docID else x)
together['page'] = together['nodeID'].apply(lambda x: map_nodeID_to_page[x] if x in map_nodeID_to_page else 0)
# default (inner) merge: rows whose nodeID is not in folder_structure are dropped
together = pd.merge(together, folder_structure, left_on = 'nodeID', right_on = 'nodeID')

# Excel files are looked up only so that they can be excluded below.
all_excel = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['xls', 'xlsx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

together = together.loc[~together.nodeID.isin(all_excel['nodeID'].to_list())]

# trailing space so that the texts concatenated per document below do not fuse
# the last word of one page with the first word of the next
together['text'] = together['text'].apply(lambda x: x + " ")

# One row per document: texts concatenated in (docID, page) order ("sum" on strings
# concatenates), paths collected into a set.
together = together.sort_values(['docID', 'page']).groupby('docID').agg({"text": "sum", "path": set}).reset_index()

# NOTE(review): takes one element of the path set; if a document ever had more than one
# distinct path, which one is picked would be arbitrary -- confirm every docID has one path.
together['path'] = together['path'].apply(lambda x: list(x)[0])
together['text'] = together['text'].str.lower()

In [None]:
# Document x pattern hit matrix: matrix[d, p] is 1.0 when pattern p matches the text of
# document d and 0.0 otherwise (columns follow the order of mesh_patterns).
matrix = np.zeros((together.shape[0], len(mesh_patterns)))

# Extract the texts once. Iterating together.iterrows() inside the pattern loop builds a
# new Series for every row of every pattern without affecting the result.
document_texts = together['text'].to_list()

for i, pattern in tqdm(enumerate(mesh_patterns), total = len(mesh_patterns)):
    matrix[:, i] = [pattern.search(text) is not None for text in document_texts]

In [None]:
# Label the hit matrix with the UIs of total_mesh (one column per UI) and attach the
# docID of every row; the docIDs are aligned on the index of `together`.
matrix_df = pd.DataFrame(data = matrix, columns = total_mesh["UI"])

matrix_df.loc[:, 'docID'] = together['docID']

In [None]:
# For every document, list the UIs whose pattern matched.
# Only the UI columns are scanned, so the 'docID' column itself can never be reported as
# a hit, and the hits are read from a plain array instead of one Series lookup per cell.
ui_columns = matrix_df.columns.drop('docID')
hit_values = matrix_df[ui_columns].to_numpy()

mesh_per_document = [
    (doc_id, ui_columns[row_values == 1].to_list())
    for doc_id, row_values in tqdm(zip(matrix_df['docID'], hit_values), total = matrix_df.shape[0])
]

`../../pii_detection/knowledge_base/matched_orgs_240206.parquet` and `../../pii_detection/knowledge_base/matched_identifiers_240220.parquet` are the Pandas DataFrames that were saved during the entity recognition & disambiguation step in [start_here.ipynb](../start_here.ipynb). In that notebook, the latter is called `knowledge_base/matched_identifiers_240220.parquet`.

In [14]:
# people and orgs were already detected and disambiguated in earlier steps
# now we load the saved information


def _node_to_docID(node_id):
    """Return the documentID mapped to `node_id`, or `node_id` itself when it has no mapping."""
    return map_nodeID_to_docID.get(node_id, node_id)


def _resolved_person(row):
    """Use the raw `input` text wherever `matched` holds the "##PERSON##" placeholder."""
    if row['matched'] == "##PERSON##":
        return row['input']
    return row['matched']


keywords = pd.DataFrame(mesh_per_document, columns = ['docID', 'mesh'])

orgs = pd.read_parquet("../../pii_detection/knowledge_base/matched_orgs_240206.parquet")
orgs['docID'] = orgs['nodeID'].apply(_node_to_docID)

people = pd.read_parquet("../../pii_detection/knowledge_base/matched_identifiers_240220.parquet")
people['matched'] = people.apply(_resolved_person, axis = 1)
people['docID'] = people['nodeID'].apply(_node_to_docID)

# one list of matched entities per document, for organisations and people separately
orgs_per_document = orgs.groupby("docID").agg({"matched": list}).reset_index()
people_per_document = people.groupby("docID").agg({"matched": list}).reset_index()

# Outer merges keep every document that has an entry in any of the three tables;
# the two `matched` columns become `matched_org` and `matched_people`.
entities_per_document = pd.merge(orgs_per_document, people_per_document,
                                 on = 'docID', how = 'outer', suffixes = ("_org", "_people"))
all_merged = pd.merge(entities_per_document, keywords, on = "docID", how = "outer")

In [15]:

def get_keywords_for_project(project, type_of_keyword):
    """Return the flattened `type_of_keyword` entries of all documents assigned to `project`.

    Reads the notebook-level frames `projects_df` (rows with `text == project` and a true
    `entity` flag define the project's docIDs) and `all_merged` (whose `type_of_keyword`
    column is exploded into one flat list).
    """
    belongs_to_project = (projects_df.text == project) & (projects_df.entity)
    project_doc_ids = projects_df.loc[belongs_to_project].docID.unique()
    project_rows = all_merged.loc[all_merged.docID.isin(project_doc_ids)]
    return project_rows[type_of_keyword].explode().to_list()


def normalize_keywords(keywords):
    """Return the string entries of `keywords`, lower-cased and stripped.

    Entries that are not `str` are dropped.
    """
    return [keyword.lower().strip() for keyword in keywords if isinstance(keyword, str)]


# all the words in human genome project folder vs ___ project
def term_frequency(project1keywords, project2keywords):
    """Count the normalized keywords of two projects over their shared vocabulary.

    Returns a pair of `Counter`s, one per project. Both contain every keyword that occurs
    in either project; keywords missing from a project are stored with an explicit count
    of 0 so that iterating over either counter visits the full vocabulary.

    The zero-count keys are inserted in first-seen order (project 1's keywords, then
    project 2's). Iterating a `set` of strings instead would give an order that changes
    between Python processes (string hash randomisation), and callers rely on the key
    order being reproducible because results are later re-attached to terms by position.
    """
    project1keywords = normalize_keywords(project1keywords)
    project2keywords = normalize_keywords(project2keywords)

    project1counter = Counter(project1keywords)
    project2counter = Counter(project2keywords)

    # dict.fromkeys de-duplicates while preserving first-seen order
    vocabulary = dict.fromkeys(project1keywords + project2keywords)

    for word in vocabulary:
        project1counter.setdefault(word, 0)
        project2counter.setdefault(word, 0)
    return project1counter, project2counter

def contingency_table(project1, project2, type_of_keyword):
    """Run Fisher's exact test for every keyword of two projects.

    For each keyword the 2x2 table is
        [[count in project1,            count in project2],
         [all other keywords, project1, all other keywords, project2]]

    Returns a list of `(word, odds_ratio, p_value, total_count)` tuples, where
    `total_count` is the keyword's count summed over both projects.
    """
    project1keywords = get_keywords_for_project(project1, type_of_keyword)
    project2keywords = get_keywords_for_project(project2, type_of_keyword)

    project1counter, project2counter = term_frequency(project1keywords, project2keywords)

    # The corpus sizes do not change inside the loop; summing the counters once avoids
    # re-adding every count for every keyword.
    sizeCorpus1 = sum(project1counter.values())
    sizeCorpus2 = sum(project2counter.values())

    results = []
    for word in tqdm(project1counter.keys(), total = len(project1counter)):
        numWordCorpus1 = project1counter[word]
        numWordCorpus2 = project2counter[word]

        numAllOtherCorpus1 = sizeCorpus1 - numWordCorpus1
        numAllOtherCorpus2 = sizeCorpus2 - numWordCorpus2

        odds_ratio, p_value = fisher_exact([[numWordCorpus1, numWordCorpus2],
                                            [numAllOtherCorpus1, numAllOtherCorpus2]])
        results.append((word, odds_ratio, p_value, numWordCorpus1 + numWordCorpus2))
    return results

In [16]:
def prepare_table_for_power(project1, project2, type_of_keyword):
    """Build the per-keyword inputs for the power analysis of Fisher's exact test.

    Returns a list with one `(p1, p2, n1, n2, p_value)` tuple per keyword, in the key
    order of the first counter returned by `term_frequency`:
      p1, p2  -- the keyword's share of all keyword occurrences in project1 / project2
      n1, n2  -- the total number of keyword occurrences in project1 / project2
      p_value -- Fisher's exact test on the same 2x2 table as in `contingency_table`
    """
    project1keywords = get_keywords_for_project(project1, type_of_keyword)
    project2keywords = get_keywords_for_project(project2, type_of_keyword)

    project1counter, project2counter = term_frequency(project1keywords, project2keywords)

    # The corpus sizes are loop-invariant; compute them once instead of several times
    # per keyword.
    sizeCorpus1 = sum(project1counter.values())
    sizeCorpus2 = sum(project2counter.values())

    results = []
    for word in tqdm(project1counter.keys(), total = len(project1counter)):
        numWordCorpus1 = project1counter[word]
        numWordCorpus2 = project2counter[word]

        p1 = numWordCorpus1 / sizeCorpus1
        p2 = numWordCorpus2 / sizeCorpus2

        numAllOtherCorpus1 = sizeCorpus1 - numWordCorpus1
        numAllOtherCorpus2 = sizeCorpus2 - numWordCorpus2

        odds_ratio, p_value = fisher_exact([[numWordCorpus1, numWordCorpus2],
                                            [numAllOtherCorpus1, numAllOtherCorpus2]])

        results.append((p1, p2, sizeCorpus1, sizeCorpus2, p_value))
    return results

In [None]:


# Folders (or single files) that define each project, relative to the archive root.
project_folders = {
    "ENCODE": [
        "ENCODE/Participants", "ENCODE/MS", "ENCODE/SAP", "ENCODE/OC Information",
        "ENCODE/PressRelease", "ENCODE/ENCODE_2004", "ENCODE/publications", "ENCODE/Drafts",
        "ENCODE/Data Standards", "ENCODE/encode_align_sop.pdf", "ENCODE/ENCODE-PublicationGuidelines 3-29-06.doc",
        "ENCODE/Minutes", "ENCODE/CACR", "ENCODE/SAP call minutes 3-15-06.doc", "ENCODE/Data release",
        "ENCODE/Abstracts", "ENCODE/Presentations", "ENCODE/Scaling", "ENCODE/Meeting", "ENCODE/MS2",
        "ENCODE/WorkingGroups", "ENCODE/Documents", "ENCODE/criteria", "ENCODE/Web_site", "ENCODE/Hox.doc", "ENCODE/Policy"],
    "modENCODE": ["ENCODE/modENCODE", "modENCODE"],
    "HapMap": ['Haplotype Map Project'],
    "HGP": [
        "Large scale sequence/human sequence", "Celera", "HGP History Summer 2011", "sequencingrampupfiles"],
    "sequence": ["Large scale sequence/Box026-010.pdf", "Sequence target files"]
}

# Resolve every configured entry to its full path once, instead of rebuilding the Path
# objects for every document.
project_roots = {
    folder: [Path("/tiramisu/" + subfolder) for subfolder in subfolders]
    for folder, subfolders in project_folders.items()
}

# One (belongs, project, docID, path) tuple per document and project.
list_of_entities = []

for i, row in tqdm(together.iterrows(), total = together.shape[0]):
    document_path = Path(row['path'])
    enclosing_folders = document_path.parents
    for folder, roots in project_roots.items():
        # A document belongs to a project when it lies below one of the project's
        # folders, or when it is itself one of the listed files.
        belongs = any(root in enclosing_folders or root == document_path for root in roots)
        list_of_entities.append((belongs, folder, row['docID'], row['path']))


In [None]:
# One row per (document, project) pair; `entity` tells whether the document belongs to
# the project named in `text`.
projects_df = pd.DataFrame(data = list_of_entities, columns = ["entity", "text", 'docID', 'path'])
# number of documents outside / inside every project
projects_df.groupby(by = ["text", "entity"]).count()

In [None]:
# Set of docIDs assigned to every project.
project_doc_ids = projects_df.loc[projects_df.entity].groupby('text').agg({"docID": set})

# One boolean membership column per project. A single loop replaces five copies of the
# same statement, and isin() does the set lookup without a Python-level call per row.
for project in ['HGP', 'HapMap', 'sequence', 'ENCODE', 'modENCODE']:
    all_merged[project] = all_merged['docID'].isin(project_doc_ids.loc[project, 'docID'])


In [20]:
# The outer merges leave NaN where a document has no matches; fillna("") followed by
# list() turns those cells into empty lists and leaves existing lists unchanged.
for entity_column in ('matched_org', 'matched_people'):
    all_merged[entity_column] = all_merged[entity_column].fillna("").apply(list)

In [None]:
# All entities of a document in one list: organisations, then people, then MeSH UIs.
all_merged['total'] = [
    orgs_in_doc + people_in_doc + mesh_in_doc
    for orgs_in_doc, people_in_doc, mesh_in_doc in tqdm(
        zip(all_merged['matched_org'], all_merged['matched_people'], all_merged['mesh']),
        total = all_merged.shape[0],
    )
]

In [None]:
# First character of every `mn`, used to pick the UIs of the "E" and "G" branches.
tree['starting'] = tree['mn'].str[0]
E_labels = tree.loc[(tree.starting.isin(["E"]))].ui.unique()

# A row whose `mn` starts with "G" can never start with "E", so no extra exclusion is
# applied here. NOTE(review): a UI that has both an "E" and a "G" `mn` is therefore in
# both label sets and ends up as "phenomena" below (the later assignment wins) --
# confirm that this precedence is intended.
G_labels = tree.loc[(tree.starting.isin(["G"]))].ui.unique()

# set copies for O(1) membership tests in the loop over all matched UIs
E_label_set = set(E_labels)
G_label_set = set(G_labels)


# lower-cased entity -> one of "org", "people", "technique", "phenomena"
categories_dict = {}
# NOTE: despite its name this is the *intersection* of the two entity sets, i.e. the
# names that were matched both as an organisation and as a person.
union_people_org = set(all_merged['matched_org'].explode()).intersection(set(all_merged['matched_people'].explode()))
for i in tqdm(all_merged['matched_org'].explode().unique()):
    if isinstance(i, str):
        categories_dict[i.lower()] = "org"

for i in tqdm(all_merged['matched_people'].explode().unique()):
    # names that also occur as an organisation keep their "org" label
    if isinstance(i, str) and i not in union_people_org:
        categories_dict[i.lower()] = "people"

for i in tqdm(all_merged['mesh'].explode().unique()):
    if isinstance(i, str):
        if i in E_label_set:
            categories_dict[i.lower()] = "technique"

        if i in G_label_set:
            categories_dict[i.lower()] = "phenomena"

In [None]:
project_combinations = [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]
exploded = all_merged.explode('total')

# Keywords of every HGP-vs-project comparison, in the row order produced by
# contingency_table; used later to label the rows of the power tables.
# (No alpha is printed here: the Bonferroni-corrected alpha is only computed in the
# cells that call multipletests, so referencing it in this cell fails on a fresh kernel.)
all_terms ={}
for combo in tqdm(project_combinations):
    results = contingency_table(combo[0], combo[1], 'total')
    pvalues = pd.DataFrame(results, columns = ['word', 'ratio', 'pvalue', 'total'])
    all_terms[combo] = pvalues['word'].to_frame()

In [None]:
project_combinations = [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]
contain_df_power = {}
exploded = all_merged.explode('total')
# Bonferroni-corrected alpha of every comparison, in project_combinations order
alphas = []

for combo in tqdm(project_combinations):
    reference_project, compared_project = combo
    results = prepare_table_for_power(reference_project, compared_project, 'total')
    pvalues = pd.DataFrame(results, columns = ['p1', 'p2', 'n1', 'n2', 'pvalue'])

    # Bonferroni correction over all keywords of this comparison
    rejected, p_adjusted, _, alpha_corrected = multipletests(pvalues['pvalue'], method='bonferroni')
    alphas.append(alpha_corrected)

    pvalues['rejected'] = rejected
    pvalues['adjusted'] = p_adjusted

    contain_df_power[combo] = pvalues

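We use the following R code for the power analysis. Note that the `simulate` helper (defined in the third chunk) has to be run before the loop in the second chunk.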

```{r}
library(arrow)
library(statmod)
library(pbapply)
library(dplyr)
```


```{r}
# One row per compared project: the project name and the significance level used for
# its power calculation.
# NOTE(review): the alphas are hard-coded; presumably they were copied from the
# Bonferroni-corrected alphas computed in the Python notebook -- verify that each value
# is paired with the right project.
d <- data.frame(projects = c("modENCODE", "ENCODE", "HapMap", "sequence"),
                alphas =c(1.5383195397347939e-06,
 1.579180089697429e-06,
 2.284565475646532e-06,
 2.231345947875759e-06))
# For every project: read its input table, apply `simulate` to every row with the
# project's alpha, append the result as a `power` column and write the table back.
# `simulate` must already be defined when this loop runs.
for(i in seq_len(nrow(d))) {
  df <- read_parquet(paste("../../models/power_fishers/", d[i,]$projects, ".parquet", sep = ""))
  power <- pbapply(df, 1, simulate, alpha =d[i,]$alphas)
  
  powerData <- data.frame(power=power)
  allData <- cbind(df, power = powerData$power)
  
  write_parquet(allData, paste("../../models/power_fishers/", d[i,]$projects, "_power.parquet", sep = ""))
                            
  } 


```

```{r}


# Power of Fisher's exact test for one row of an input table.
#   x     -- a row; its first four entries are passed positionally to power.fisher.test
#            (assumed to be p1, p2, n1, n2 -- TODO confirm against the column order of
#            the parquet files)
#   alpha -- significance level forwarded to power.fisher.test; it has no default here,
#            so every caller has to supply it
simulate <- function(x, alpha) {
    return(power.fisher.test(x[1],x[2],x[3],x[4], alpha = alpha))
}



```

```{r}
# NOTE(review): `simulate` is called here without `alpha`, which has no default in its
# definition, so this call fails as soon as `alpha` is evaluated. This chunk and the
# ones after it look like a single-project version of the loop above -- confirm whether
# they are still needed.
power <- pbapply(df, 1, simulate)

```

```{r}

power
```
```{r}
powerData <- data.frame(power=power)
allData <- cbind(df, power = powerData$power)
```


```{r}
# NOTE(review): this writes whatever `allData` currently holds to the modENCODE output
# file. If the chunks are run in order after the loop, `df` belongs to the last project
# of the loop ("sequence"), so this would overwrite the modENCODE result -- confirm
# before running.
write_parquet(allData, "../../models/power_fishers/modENCODE_power.parquet")
```

In [26]:
# Write the power-analysis inputs of every comparison to disk, one parquet file per
# compared project (key[1] is the project that HGP is compared against).
# NOTE(review): the files are written as `<project>_new.parquet`, while the R chunk
# reads `<project>.parquet` and the next cell reads `<project>_power_new.parquet` --
# confirm how the file names are meant to line up.
for key, power_inputs in contain_df_power.items():
    power_inputs.to_parquet(f"../models/power_fishers/{key[1]}_new.parquet")

In [31]:
# Terms whose Fisher test reaches a power of at least 0.8 in any comparison.
all_powerful_terms = []

for project in ["modENCODE", "ENCODE", "HapMap", "sequence"]:
    power = pd.read_parquet(f"../models/power_fishers/{project}_power_new.parquet")
    # The power table carries no keyword column; the keywords are attached from all_terms
    # by index, so both tables must list the keywords in the same row order.
    power['terms'] = all_terms[("HGP", project)]
    sufficiently_powered = power.power >= 0.8
    all_powerful_terms.extend(power.loc[sufficiently_powered]['terms'].to_list())


In [47]:
# de-duplicate the powerful terms and drop the empty string
all_powerful_terms = [term for term in set(all_powerful_terms) if term != ""]

# SI Figure 14

In [None]:

# Share of each project's documents that contain at least one person, organisation,
# technique (MeSH) and phenomenon (MeSH).
percentage_of_docs_with_at_least_one = []
for project in ["HGP", "sequence", "HapMap", "ENCODE", "modENCODE"]:
    project_docs = all_merged.loc[all_merged[project]]
    n_docs = project_docs.shape[0]

    people_share = int((project_docs.matched_people.str.len() > 0).sum()) / n_docs
    org_share = int((project_docs.matched_org.str.len() > 0).sum()) / n_docs

    techniques_doc = 0
    phenomena_doc = 0
    for mesh_terms in project_docs['mesh'].to_list():
        categories_in_doc = {categories_dict.get(term.lower()) for term in mesh_terms}
        # categories_dict labels techniques as "technique" (singular). The two counts are
        # independent: a document with both kinds of term counts towards both.
        if "technique" in categories_in_doc:
            techniques_doc += 1
        if "phenomena" in categories_in_doc:
            phenomena_doc += 1

    print(techniques_doc, n_docs, project)
    print(phenomena_doc, n_docs, project)
    percentage_of_docs_with_at_least_one.append(
        [people_share, org_share, techniques_doc / n_docs, phenomena_doc / n_docs, project]
    )
at_least_one = pd.DataFrame(percentage_of_docs_with_at_least_one, columns = ["people", "org", "techniques", "phenomena", "project"])

In [None]:

# This is the first plotting cell of the notebook, so the plotting libraries are
# imported here; without these imports the cell fails on a fresh kernel.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})


matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False

matplotlib.rcParams['xtick.major.size'] = 2
matplotlib.rcParams['xtick.major.width'] = 0.5
matplotlib.rcParams['xtick.minor.size'] = 2
matplotlib.rcParams['xtick.minor.width'] = 0.5

matplotlib.rcParams['ytick.major.size'] = 2
matplotlib.rcParams['ytick.major.width'] = 0.5
matplotlib.rcParams['ytick.minor.size'] = 2
matplotlib.rcParams['ytick.minor.width'] = 0.5

fig, ax = plt.subplots(figsize=(3, 2), dpi=300)

# Plot order of the projects (x axis) and of the entity types (hue); one mapping serves
# both sort columns because the key function is applied to each of them.
plot_order = {"HGP": 0, "sequence": 1, "HapMap": 2, "ENCODE": 3, "modENCODE": 4, "phenomena" : 5, "techniques": 6, "org": 7, "people":8}
at_least_one_long = at_least_one.melt(id_vars = "project").sort_values(by=['project', "variable"], key=lambda x: x.map(plot_order))

sns.barplot(data = at_least_one_long, x = "project", y = "value", hue = "variable", ax = ax)
ax.legend(frameon = False, bbox_to_anchor = (1, 0.5) )

def formatter(x, pos):
    """Tick formatter: render the fraction `x` as a whole-number percentage string.

    `pos` is the tick position supplied by matplotlib; it is not used.
    """
    del pos
    # round() rather than int(): x * 100 is often a hair below the intended integer
    # (0.29 * 100 == 28.999999999999996), which int() would truncate to 28.
    return str(int(round(x * 100)))

# show the y axis (fractions) as percentages
ax.yaxis.set_major_formatter(formatter)

# black axis labels and ticks on both axes
ax.xaxis.label.set_color('black')
ax.yaxis.label.set_color('black')
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, colors='black')

ax.set_ylabel("Documents with at least one entity [%]")
ax.set_xlabel("")
sns.despine()

# thin lines for the two remaining spines
for side in ('bottom', 'left'):
    ax.spines[side].set_linewidth(0.5)

plt.savefig("../cache/documents_with_at_least_one_entity_240904.pdf", dpi = 300, bbox_inches = "tight")

In [None]:
project_combinations = [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]
exploded = all_merged.explode('total')

# set copy so the per-keyword "powerful" flag is a hash lookup, not a list scan
powerful_terms = set(all_powerful_terms)

# Per comparison: one row per keyword with odds ratio, raw and Bonferroni-adjusted
# p-value, log2 odds ratio, entity category and the power flag.
contain_df ={}
for combo in tqdm(project_combinations):
    results = contingency_table(combo[0], combo[1], 'total')
    pvalues = pd.DataFrame(results, columns = ['word', 'ratio', 'pvalue', 'total'])
    rejected, p_adjusted, _, alpha_corrected = multipletests(pvalues['pvalue'], method='bonferroni')

    pvalues['rejected'] = rejected
    pvalues['adjusted'] = p_adjusted

    # an odds ratio of 0 or inf gives a log-ratio of -inf / +inf
    pvalues['logratio'] = np.log2(pvalues['ratio'])
    pvalues['category'] = pvalues['word'].map(categories_dict)
    pvalues['powerful'] = pvalues['word'].isin(powerful_terms)
    contain_df[combo] = pvalues

In [None]:

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

sns.color_palette("Set1")

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams.update({"axes.labelsize": 7 * 3,
"xtick.labelsize": 7 * 3,
"ytick.labelsize": 7 * 3,
"legend.fontsize": 5,
"font.size":7 * 3})


matplotlib.rcParams['xtick.major.size'] = 2
matplotlib.rcParams['xtick.major.width'] = 0.5
matplotlib.rcParams['xtick.minor.size'] = 2
matplotlib.rcParams['xtick.minor.width'] = 0.5


matplotlib.rcParams['ytick.major.size'] = 2
matplotlib.rcParams['ytick.major.width'] = 0.5
matplotlib.rcParams['ytick.minor.size'] = 2
# set explicitly so this cell configures the same tick parameters as the other plot cells
matplotlib.rcParams['ytick.minor.width'] = 0.5


# 4 x 4 grid: one row per compared project, one column per entity category
layout = [
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
    [12, 13, 14, 15]
]

fig, axes = plt.subplot_mosaic(layout, figsize=(35,35))


for i, project in enumerate(["sequence", "HapMap", "ENCODE", "modENCODE"]):
    temp = contain_df[("HGP", project)]

    # the loop variable is called `category`, not `type`, so the builtin type() is not
    # shadowed for the rest of the notebook
    for j, category in enumerate(["phenomena", "technique", "org", "people"]):
        panel = axes[(4 * i) + j]
        in_category = temp.category == category
        significant = temp.rejected & temp.powerful

        # red: significant and sufficiently powered; grey: everything else
        sns.scatterplot(data = temp.loc[significant & in_category], x = 'total', y = 'logratio', color = 'red', ax = panel)
        sns.scatterplot(data = temp.loc[~significant & in_category], x = 'total', y = 'logratio', color = 'grey', ax = panel)
        panel.axhline(0, 0, 5500, c='k')

        panel.set_xscale("log")
        # "N=" is the share of the category's keywords whose log-ratio is infinite
        n_infinite = temp.loc[np.isinf(temp.logratio) & in_category].shape[0]
        n_in_category = temp.loc[in_category].shape[0]
        panel.set_title(project + "-" + category + " N=" + "{:.2f}".format(n_infinite / n_in_category), fontsize = 30)
    

In [None]:
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams.update({"axes.labelsize": 7 * 4,
"xtick.labelsize": 7 * 4,
"ytick.labelsize": 7 * 4,
"legend.fontsize": 5,
"font.size":7 *4 })

for i, project in enumerate(["sequence", "HapMap", "ENCODE", "modENCODE"]):
    # Work on a copy: the infinite log-ratios are replaced by +/-10000 for plotting only,
    # and the frames stored in contain_df must keep their original values.
    temp = contain_df[("HGP", project)].copy()
    temp['logratio'] = temp['logratio'].replace([np.inf, -np.inf], [10000, -10000])

    # `category` rather than `type`, so the builtin type() is not shadowed
    for j, category in enumerate(["phenomena", "technique", "org", "people"]):

        # three stacked panels sharing x: +inf band, finite values, -inf band
        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (7,7), sharex=True, gridspec_kw = {"height_ratios": [1, 4, 1]}, dpi = 300)

        fig.subplots_adjust(hspace=0)  # adjust space between axes

        in_category = temp.category == category
        significant = temp.loc[temp.rejected & temp.powerful & in_category]
        # NOTE(review): grey points are the non-rejected keywords only; keywords that are
        # rejected but not powerful are not drawn at all -- confirm this is intended.
        not_rejected = temp.loc[~temp.rejected & in_category]

        # the same two layers go on every panel; the y limits below select what is visible
        for panel in (ax1, ax2, ax3):
            sns.scatterplot(data = significant, x = 'total', y = 'logratio', color = 'red', ax = panel)
            sns.scatterplot(data = not_rejected, x = 'total', y = 'logratio', color = 'grey', ax = panel)
        ax2.axhline(0, 0, 10000, c='k')

        ax2.set_xscale("log")
        # zoom-in / limit the view to different portions of the data
        ax1.set_ylim(9998, 10002)  # outliers only
        ax2.set_ylim(-12 ,12)  # most of the data
        ax3.set_ylim(-10002, -9998)
        # hide the spines between the panels
        ax1.spines.bottom.set_visible(False)
        ax2.spines.top.set_visible(False)
        ax2.spines.bottom.set_visible(False)
        ax3.spines.top.set_visible(False)

        for band in (ax1, ax3):
            band.set_yticks([])
            band.set_yticklabels([])

        ax1.tick_params(labeltop=False,which = 'both' )  # don't put tick labels at the top
        ax2.tick_params(labeltop=False, which = 'both')
        ax2.xaxis.tick_bottom()

        # share of the category's keywords that are significant, powerful and infinite
        n_infinite_significant = significant.loc[significant.logratio.abs() == 10000].shape[0]
        n_in_category = temp.loc[in_category].shape[0]
        percentage = " inf:" + "{:}%".format(int((n_infinite_significant / n_in_category) * 100))

        for panel in (ax1, ax2, ax3):
            panel.spines['right'].set_linewidth(0)
            panel.spines['top'].set_linewidth(0)
        ax1.set_ylabel("inf")
        ax2.set_ylabel("")
        ax3.set_ylabel("-inf")

        print(f"{project} - {category} - {percentage}")
        ax1.set_title("")
#         ax1.set_title(f"{project} - {category}" + percentage)
        ax1.set_xlabel("")
        ax2.set_xlabel("")
        ax3.set_xlabel("")
        plt.savefig(f"../cache/{project}-{category}_new.png", bbox_inches = "tight")

In [174]:
# Keywords that are significant (rejected) and sufficiently powered in any comparison.
all_essential_terms = []

for combo in [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]:
    comparison = contain_df[combo]
    is_essential = (comparison.rejected) & (comparison.powerful)
    all_essential_terms.extend(comparison.loc[is_essential, 'word'].str.lower())

In [None]:
len(set(all_essential_terms))

In [None]:
# Long table with one row per (document, entity) plus the project membership flags.
project_flags = ["HGP", "HapMap", "ENCODE", "modENCODE", "sequence"]
exploded = all_merged.explode('total')
exploded = exploded[['docID'] + project_flags + ["total"]].melt(id_vars = ["docID"] + project_flags)
exploded['lower'] = exploded['value'].str.lower().str.strip()

# Rows: essential terms; columns: log2(documents in project / documents in HGP).
essential_terms = list(set(all_essential_terms))
matrix = np.zeros((len(essential_terms), 4))
for i, term in tqdm(enumerate(essential_terms), total = len(essential_terms)):
    term_rows = exploded.loc[(exploded['lower'] == term)]
    # a count of 0 is replaced by 0.1 so the ratio and its logarithm stay finite
    hgp_docs = term_rows.loc[term_rows['HGP']].docID.unique().shape[0] or .1

    for j, project in enumerate(["sequence", "HapMap", "ENCODE", "modENCODE"]):
        project_docs = term_rows.loc[(term_rows[project])].docID.unique().shape[0] or .1
        matrix[i][j] = np.log2(project_docs / hgp_docs)

# Figure 2B

In [82]:


# Per comparison and category: how many keywords exist ("total") and how many of them
# are both significant and sufficiently powered ("rejected").
to_concat = []
for combo in [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]:
    comparison = contain_df[combo]
    compared_project = combo[1]

    # number of keywords per category
    totals = comparison.groupby(['category']).count()['word'].to_frame()
    totals.columns = ["count"]
    totals = totals.reset_index()
    totals['project'] = compared_project
    totals['type'] = 'total'
    to_concat.append(totals)

    # number of keywords per category with powerful == True and rejected == True
    by_flags = comparison.groupby(['category', "powerful", "rejected"]).count().reset_index()
    significant = by_flags.loc[by_flags.powerful & by_flags.rejected][['category', 'word']]
    significant.columns = ["category", "count"]
    significant['type'] = 'rejected'
    significant['project'] = compared_project
    to_concat.append(significant)
total_metadata = pd.concat(to_concat)

In [None]:
def _count_of(counts, count_type):
    """Return the `count` of the row with the given `type`, or 0 when there is no such row.

    A category without any significant keyword has no "rejected" row at all, so indexing
    the first element unconditionally would raise an IndexError.
    """
    matching = counts.loc[counts['type'] == count_type]['count']
    return matching.iloc[0] if len(matching) > 0 else 0


# (project, category, share of the category's keywords that are significant and powerful)
metadata = []
for project in ["sequence", "HapMap", "ENCODE", "modENCODE"]:
    print(project)
    for category in ["org", "people", "phenomena", "technique"]:
        print(category)
        total = total_metadata.loc[(total_metadata.category == category) & (total_metadata['project'] == project)]

        n_total = _count_of(total, "total")
        n_rejected = _count_of(total, "rejected")
        print(n_total)
        print(n_rejected)
        # a category with no keywords at all has no defined share
        metadata.append((project, category, n_rejected / n_total if n_total else float("nan")))

In [84]:
metadata = pd.DataFrame(metadata, columns = ["project", "category", "percentage"])

In [None]:


sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})


matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False

matplotlib.rcParams['xtick.major.size'] = 2
matplotlib.rcParams['xtick.major.width'] = 0.5
matplotlib.rcParams['xtick.minor.size'] = 2
matplotlib.rcParams['xtick.minor.width'] = 0.5

matplotlib.rcParams['ytick.major.size'] = 2
matplotlib.rcParams['ytick.major.width'] = 0.5
matplotlib.rcParams['ytick.minor.size'] = 2
matplotlib.rcParams['ytick.minor.width'] = 0.5


fig, ax = plt.subplots(figsize=(3, 2), dpi=300)

# Sort by category first and project second in ONE call. Two chained single-column
# sorts do not guarantee this: the default sort of the second call is not stable, so it
# may scramble the project order established by the first.
plot_order = {"phenomena": 0, "technique": 1, "org": 2, "people": 3,
              "HGP": 0, "sequence": 1, "HapMap": 2, "ENCODE": 3, "modENCODE": 4}
metadata_sorted = metadata.sort_values(by=['category', 'project'], key=lambda x: x.map(plot_order))

sns.barplot(data = metadata_sorted, x = 'category', y = 'percentage', hue = 'project', palette = {"HGP": "#6EC3E7",\
                    "sequence":"#51AF4D","HapMap": "#E1BE15", "ENCODE": "#095393", "modENCODE": "#AD5D95"}, ax = ax)
ax.spines['right'].set_linewidth(0)
ax.spines['top'].set_linewidth(0)
ax.legend(frameon = False, bbox_to_anchor = (0.5, 0.5))
ax.set_xlabel("")
# labels follow the category order established by the sort above
ax.set_xticklabels(["Phenomena", "Techniques", "Organizations", "People"])
def formatter(x, pos):
    """Tick formatter: render the fraction `x` as a whole-number percentage string.

    `pos` is the tick position supplied by matplotlib; it is not used.
    """
    del pos
    # round() rather than int(): x * 100 is often a hair below the intended integer
    # (0.29 * 100 == 28.999999999999996), which int() would truncate to 28.
    return str(int(round(x * 100)))

# show the y axis (fractions) as percentages
ax.yaxis.set_major_formatter(formatter)

# black axis labels and ticks on both axes
for axis_name, axis in (('x', ax.xaxis), ('y', ax.yaxis)):
    axis.label.set_color('black')
    ax.tick_params(axis=axis_name, colors='black')

ax.set_ylabel("Significant entities [%]")
plt.savefig("../cache/figure_2_share_of_entities_new.pdf", transparent = True, dpi = 400, bbox_inches='tight')

In [177]:
# Three parallel lists with one entry per (comparison, essential keyword): the keyword,
# its category, and the project that HGP was compared against.
all_essential_terms = []
all_categories = []
all_projects = []
for combo in [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]:
    comparison = contain_df[combo]
    essential = comparison.loc[(comparison.rejected) & (comparison.powerful)]

    all_essential_terms.extend(essential['word'].str.lower())
    all_categories.extend(essential['category'].str.lower())
    all_projects.extend([combo[1]] * essential.shape[0])

# SI Figure 17

In [None]:
 exploded.lower.unique().shape[0]

In [None]:
len(set(all_essential_terms)) / exploded.lower.unique().shape[0]

In [178]:
# One row per (essential keyword, project); rows with an empty keyword are dropped.
all_essential_terms_df = pd.DataFrame({
    'term': all_essential_terms,
    'category': all_categories,
    'projects': all_projects,
})
has_term = all_essential_terms_df.term != ""
all_essential_terms_df = all_essential_terms_df.loc[has_term]


In [179]:

# `occ`: in how many distinct projects each (term, category) pair is essential.
# Note: this rebinds all_essential_terms_df, so the cell gives a different result when
# it is run a second time without re-running the previous cell.
all_essential_terms_df = (
    all_essential_terms_df
    .drop_duplicates()[['term', 'category']]
    .value_counts()
    .to_frame("occ")
    .reset_index()
)

In [None]:


matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 12,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})

fig, ax = plt.subplots(figsize=(3, 2), dpi=300)

# Legend / hue order of the categories. The key must be "technique" (singular), which is
# the label stored in the `category` column; with any other spelling the technique rows
# map to NaN and are sorted last.
category_order = {"phenomena": 5, "technique": 6, "org": 7, "people": 8}
# number of terms per (number of projects, category)
terms_per_occurrence = all_essential_terms_df.groupby(["occ", "category"]).count().reset_index()\
    .sort_values(by=["category"], key=lambda x: x.map(category_order))

# draw on the axes created above explicitly rather than on the implicit current axes
sns.barplot(data = terms_per_occurrence, x = 'occ', y = 'term', hue = "category", ax = ax)
# ax.set_xticklabels([])
ax.legend(frameon = False, bbox_to_anchor = (0.85, 0.5) )
ax.set_ylabel("Keyword frequency")
ax.set_xlabel("Number of projects")

ax.tick_params(axis='x', colors='black')
ax.yaxis.label.set_color('black')
ax.tick_params(axis='y', colors='black')
ax.spines['bottom'].set_linewidth(0.5)
ax.spines['left'].set_linewidth(0.5)

sns.despine()
plt.savefig("../cache/number_of_projects_overlap_keywords.pdf", bbox_inches = "tight", dpi = 300)

In [None]:
all_essential_terms_df.groupby(["occ", "category"]).count().reset_index()

In [None]:
all_essential_terms_df.groupby(["occ", "category"]).count().reset_index().term.sum()

In [None]:

# Long table with one row per (document, entity) plus the project membership flags.
project_flags = ["HGP", "HapMap", "ENCODE", "modENCODE", "sequence"]
exploded = all_merged.explode('total')
exploded = exploded[['docID'] + project_flags + ["total"]].melt(id_vars = ["docID"] + project_flags)
exploded['lower'] = exploded['value'].str.lower().str.strip()

# number of distinct documents per project
HGP = len(exploded.loc[(exploded['HGP'])]['docID'].unique())
sequence = len(exploded.loc[(exploded['sequence'])]['docID'].unique())
hapmap = len(exploded.loc[(exploded['HapMap'])]['docID'].unique())
encode = len(exploded.loc[(exploded['ENCODE'])]['docID'].unique())
modencode = len(exploded.loc[(exploded['modENCODE'])]['docID'].unique())

# matrix column order: HGP, sequence, HapMap, ENCODE, modENCODE
documents_per_project = [("HGP", HGP), ("sequence", sequence), ("HapMap", hapmap),
                         ("ENCODE", encode), ("modENCODE", modencode)]

# matrix[t, p]: share of project p's documents that mention essential term t
essential_terms = list(set(all_essential_terms))
matrix = np.zeros((len(essential_terms), 5))
for i, term in tqdm(enumerate(essential_terms), total = len(essential_terms)):
    term_rows = exploded.loc[(exploded['lower'] == term)]
    for column, (project, n_project_docs) in enumerate(documents_per_project):
        matrix[i][column] = term_rows.loc[(term_rows[project])].docID.unique().shape[0] / n_project_docs

In [183]:
import scipy.spatial as sp, scipy.cluster.hierarchy as hc

In [184]:
# Label the `matrix` array: one row per distinct essential term, one column per project.
project_names = ["HGP", "sequence", "HapMap", "ENCODE", "modENCODE"]
term_labels = list(set(all_essential_terms))
matrix_df = pd.DataFrame(data = matrix, index = term_labels, columns = project_names)

In [185]:
def zscore(x):
    """Standardise `x` to zero mean and unit standard deviation.

    Uses ``np.mean`` and ``np.std`` (population standard deviation, ddof=0).

    Parameters
    ----------
    x : array-like or pandas.Series
        Numeric values to standardise.

    Returns
    -------
    Same type as ``x - np.mean(x)``. When every value is identical the
    standard deviation is 0; the centred values (all zeros) are returned
    instead of dividing by zero, so downstream clustering never sees NaN.
    """
    centered = x - np.mean(x)
    spread = np.std(x)
    if spread == 0:
        return centered
    return centered / spread

def normalize_array(x):
    """Linearly rescale `x` so its minimum maps to -1 and its maximum to 1.

    Parameters
    ----------
    x : array-like or pandas.Series
        Numeric values to rescale.

    Returns
    -------
    The rescaled values. For constant input (maximum equals minimum) the
    range is 0; zeros are returned rather than dividing by zero.
    """
    # numpy reductions instead of the Python builtins: same result for 1-D
    # input, and they also accept multi-dimensional arrays.
    min_val = np.min(x)
    max_val = np.max(x)
    range_val = max_val - min_val

    if range_val == 0:
        return x - min_val

    normalized_arr = ((x - min_val) / range_val) * 2 - 1

    return normalized_arr

In [238]:


# Colour palette for the keyword clusters; a 1-based cluster id minus one indexes into this list.
# (The earlier 8-colour list was overwritten immediately and has been dropped.)
pal = ["#E1BE15", "#7F8671", "#51AF4D",   "#2B5B2A", "#050606", "#AD5D95", "#072D4D",  "#095393",  "#5B5894"]

# Figure 2C

In [None]:

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# Plot styling: white seaborn theme with tick marks on the bottom and left axes.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)  # was the string 'false'; pass a real boolean
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    'xtick.major.size': 2, 'xtick.major.width': 0.5,
    'xtick.minor.size': 2, 'xtick.minor.width': 0.5,
    'ytick.major.size': 2, 'ytick.major.width': 0.5,
    'ytick.minor.size': 2, 'ytick.minor.width': 0.5,
    'axes.labelsize': 10,
    'xtick.labelsize': 12,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'font.size': 7,
})

# Row-wise z-scores, computed once and shared by the linkage and the heat map.
zscored = matrix_df.apply(zscore, axis = 1)
dendrograms = hc.linkage(zscored, method='ward')

# Colour every row by its flat cluster (distance threshold 10, the same cut used to build `terms`).
# Taking the ids straight from the linkage keeps this cell runnable on a fresh kernel:
# `terms` / `get_color` are defined further down and themselves need `dendrograms`.
row_clusters = hc.fcluster(dendrograms, t = 10, criterion = 'distance')
row_colors = [pal[cluster_id - 1] for cluster_id in row_clusters]

g = sns.clustermap(zscored, row_linkage = dendrograms, row_cluster = True, col_cluster = False,
                   yticklabels = False, xticklabels = True, cmap = 'coolwarm',
                   row_colors = row_colors,
                   cbar_kws = dict(orientation="vertical", ticks = [-2, -1, 0, 1, 2]),
                   tree_kws = {"linewidth": 0.5, "color": 'k'}, center = 0, vmin = -2, vmax = 2,
                   colors_ratio = 0.015, figsize = (7, 15))
g.ax_cbar.set_position((0.12 , 0.1, .02, .08))

# Both dendrogram panels are hidden in this version of the figure.
g.ax_row_dendrogram.set_visible(False)
g.ax_col_dendrogram.set_visible(False)

# Shrink the panels: a quarter of the default height, and a narrower heat map.
hm_row = g.ax_col_dendrogram.get_position()
g.ax_col_dendrogram.set_position([hm_row.x0, hm_row.y0, hm_row.width * 0.1, hm_row.height * 0.25])
hm = g.ax_row_colors.get_position()
g.ax_row_colors.set_position([hm.x0, hm.y0, hm.width, hm.height * 0.25])
hm = g.ax_heatmap.get_position()
g.ax_heatmap.set_position([hm.x0, hm.y0, hm.width * 0.2, hm.height * 0.25])

# Column labels at the bottom; the "sequence" column is displayed as "LSAC".
g.ax_heatmap.xaxis.tick_bottom()
g.ax_heatmap.xaxis.label.set_color('black')
g.ax_heatmap.set_xticklabels(["HGP", "LSAC", "HapMap", "ENCODE", "modENCODE"])
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize = 16)
plt.show()

In [None]:
# Number of distinct entries in all_essential_terms.
len(set(all_essential_terms))

In [None]:
# Cut the Ward dendrogram into flat clusters at a distance threshold of 10.
clusters = hc.fcluster(dendrograms, t =10, criterion = 'distance')
# `clusters` follows the row order of the clustered matrix, so take the term
# labels from matrix_df itself instead of rebuilding them from a set.
terms = pd.DataFrame({"term": matrix_df.index.to_list(), "cluster": clusters})

# Lower-cased MeSH UI -> main heading ("MH") text. `.assign` builds a new frame,
# which avoids writing into a `.loc` slice of `mesh` (SettingWithCopyWarning).
temp = mesh.loc[mesh.qualifier == "MH"].assign(UI = lambda d: d['UI'].str.lower())
terms['mesh_term'] = terms['term'].map(temp.set_index("UI")['value'].to_dict())
terms['category'] = terms['term'].map(categories_dict)

In [219]:
def get_color(term):
    """Return the zero-based cluster index of `term`.

    Looks the term up in the module-level `terms` frame and returns the
    `cluster` value of the first matching row minus one, i.e. an index
    suitable for the `pal` colour list. Raises IndexError when no row of
    `terms` has this term.
    """
    matching_rows = terms[terms["term"] == term]
    first_match = matching_rows.iloc[0]
    return first_match["cluster"] - 1

In [None]:

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# Plot styling: white seaborn theme with tick marks on the bottom and left axes.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)  # was the string 'false'; pass a real boolean
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    'xtick.major.size': 2, 'xtick.major.width': 0.5,
    'xtick.minor.size': 2, 'xtick.minor.width': 0.5,
    'ytick.major.size': 2, 'ytick.major.width': 0.5,
    'ytick.minor.size': 2, 'ytick.minor.width': 0.5,
    'axes.labelsize': 10,
    'xtick.labelsize': 12,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'font.size': 7,
})

# Row-wise z-scores, computed once and shared by the linkage and the heat map.
zscored = matrix_df.apply(zscore, axis = 1)
dendrograms = hc.linkage(zscored, method='ward')

# Colour every row by its flat cluster (distance threshold 10, the same cut used to build `terms`).
# Reading the ids from the linkage avoids one `terms` scan per row and keeps working
# after the `term` column of `terms` has been rewritten for export.
row_clusters = hc.fcluster(dendrograms, t = 10, criterion = 'distance')
row_colors = [pal[cluster_id - 1] for cluster_id in row_clusters]

g = sns.clustermap(zscored, row_linkage = dendrograms, row_cluster = True, col_cluster = False,
                   yticklabels = False, xticklabels = True, cmap = 'coolwarm',
                   row_colors = row_colors,
                   cbar_kws = dict(orientation="vertical", ticks = [-2, -1, 0, 1, 2]),
                   tree_kws = {"linewidth": 0.5, "color": 'k'}, center = 0, vmin = -2, vmax = 2,
                   colors_ratio = 0.015, figsize = (7, 15))
g.ax_cbar.set_position((0.48 , 0.08, .02, .08))

# This version keeps the row dendrogram visible and hides the column one.
g.ax_row_dendrogram.set_visible(True)
g.ax_col_dendrogram.set_visible(False)

# Shrink the panels: a quarter of the default height, and a narrower heat map.
hm_row = g.ax_col_dendrogram.get_position()
g.ax_col_dendrogram.set_position([hm_row.x0, hm_row.y0, hm_row.width * 0.1, hm_row.height * 0.25])
hm = g.ax_row_colors.get_position()
g.ax_row_colors.set_position([hm.x0, hm.y0, hm.width, hm.height * 0.25])
hm = g.ax_heatmap.get_position()
g.ax_heatmap.set_position([hm.x0, hm.y0, hm.width * 0.2, hm.height * 0.25])
hm_row = g.ax_row_dendrogram.get_position()
g.ax_row_dendrogram.set_position([hm_row.x0, hm_row.y0, hm_row.width, hm_row.height * 0.25])

# Column labels at the bottom; the "sequence" column is displayed as "LSAC".
g.ax_heatmap.xaxis.tick_bottom()
g.ax_heatmap.xaxis.label.set_color('black')
g.ax_heatmap.set_xticklabels(["HGP", "LSAC", "HapMap", "ENCODE", "modENCODE"])
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize = 16)
plt.show()

In [None]:

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# NOTE(review): this cell repeats the preceding clustermap cell; only the colour-bar
# x position (0.42 instead of 0.48) differs. Consider one parameterised function.

# Plot styling: white seaborn theme with tick marks on the bottom and left axes.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)  # was the string 'false'; pass a real boolean
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    'xtick.major.size': 2, 'xtick.major.width': 0.5,
    'xtick.minor.size': 2, 'xtick.minor.width': 0.5,
    'ytick.major.size': 2, 'ytick.major.width': 0.5,
    'ytick.minor.size': 2, 'ytick.minor.width': 0.5,
    'axes.labelsize': 10,
    'xtick.labelsize': 12,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'font.size': 7,
})

# Row-wise z-scores, computed once and shared by the linkage and the heat map.
zscored = matrix_df.apply(zscore, axis = 1)
dendrograms = hc.linkage(zscored, method='ward')

# Colour every row by its flat cluster (distance threshold 10, the same cut used to build `terms`).
# Reading the ids from the linkage avoids one `terms` scan per row and keeps working
# after the `term` column of `terms` has been rewritten for export.
row_clusters = hc.fcluster(dendrograms, t = 10, criterion = 'distance')
row_colors = [pal[cluster_id - 1] for cluster_id in row_clusters]

g = sns.clustermap(zscored, row_linkage = dendrograms, row_cluster = True, col_cluster = False,
                   yticklabels = False, xticklabels = True, cmap = 'coolwarm',
                   row_colors = row_colors,
                   cbar_kws = dict(orientation="vertical", ticks = [-2, -1, 0, 1, 2]),
                   tree_kws = {"linewidth": 0.5, "color": 'k'}, center = 0, vmin = -2, vmax = 2,
                   colors_ratio = 0.015, figsize = (7, 15))
g.ax_cbar.set_position((0.42 , 0.08, .02, .08))

# The row dendrogram stays visible; the column one is hidden.
g.ax_row_dendrogram.set_visible(True)
g.ax_col_dendrogram.set_visible(False)

# Shrink the panels: a quarter of the default height, and a narrower heat map.
hm_row = g.ax_col_dendrogram.get_position()
g.ax_col_dendrogram.set_position([hm_row.x0, hm_row.y0, hm_row.width * 0.1, hm_row.height * 0.25])
hm = g.ax_row_colors.get_position()
g.ax_row_colors.set_position([hm.x0, hm.y0, hm.width, hm.height * 0.25])
hm = g.ax_heatmap.get_position()
g.ax_heatmap.set_position([hm.x0, hm.y0, hm.width * 0.2, hm.height * 0.25])
hm_row = g.ax_row_dendrogram.get_position()
g.ax_row_dendrogram.set_position([hm_row.x0, hm_row.y0, hm_row.width, hm_row.height * 0.25])

# Column labels at the bottom; the "sequence" column is displayed as "LSAC".
g.ax_heatmap.xaxis.tick_bottom()
g.ax_heatmap.xaxis.label.set_color('black')
g.ax_heatmap.set_xticklabels(["HGP", "LSAC", "HapMap", "ENCODE", "modENCODE"])
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontsize = 16)
plt.show()

In [282]:

# MeSH descriptor UIs to annotate on the label tracks.
# NOTE(review): 'd016680' is the only lower-case entry and "D049750" is listed twice.
# If tree.ui holds upper-case UIs the lower-case entry never matches - confirm whether that is intended.
labels = ["D023421", "D004351", "D006790", "D010802", "D040641", "D012098", "D049750", "D049750", "D044822",\
         "D016324", "D004251", "D015183", "D017403",\
         "D002874", "D022202", "D002877", "D015894", \
         "D010641", "D014644", "D020641", "D006239", "D005838", \
         "D055106", 'd016680',  \
         "D000073336", "D017421",\
         "D050436", "D011401", "D046228",\
          "D015870", "D059467", \
         "D012399","D053263", \
         "D016364", "D059646", "D003433", "D000081246", "D016384"]

selected_tree_rows = tree.loc[tree.ui.isin(labels)]

# UIs with at least one tree row whose `starting` value is "E".
E_labels = selected_tree_rows.loc[selected_tree_rows.starting.isin(["E"])].ui.unique()

# UIs with a "G" row that are not already in E_labels. The previous filter,
# `~tree.starting.isin(["E"])`, was tested on the very row that had just matched "G",
# so it could never exclude anything and a UI present under both letters landed in both lists.
G_labels = selected_tree_rows.loc[
    selected_tree_rows.starting.isin(["G"]) & ~selected_tree_rows.ui.isin(E_labels)
].ui.unique()

In [335]:
# Organisations to annotate, as (term as it appears in the matrix index, label shown in the figure).
org_label_pairs = [
    ("usda", "USDA"),
    ("fgi", "FGI"),
    ("nsf", "NSF"),
    ("nchgr", "NCHGR"),
    ("genbank", "GenBank"),
    ("embl", "EMBL"),
    ("bcm", "BCM"),
    ("congress", "Congress"),
    ("white house", "White House"),
    ("doe", "DOE"),
    ("lawrence liver more", "LLNL"),
    ("hugo", "HUGO"),
    ("celera", "Celera"),
    ("cshl", "CSHL"),
    ("chi nese academy of sciences", "Chinese Academy of Sciences"),
    ("unesco", "UNESCO"),
    ("whitehead", "Whitehead"),
    ("howard u", "Howard Univ."),
    ("the snps consortium", "SNP Consortium"),
    ("high international hapmap consortium", "HapMap Consortium"),
    ("ashg", "ASHG"),
    ("uwash", "UWash"),
    ("stanford", "Stanford"),
    ("national cell culture resource center", "NCCC"),
    ("nimblegen", "Roche Nimblegen"),
    ("niaid", "NIAID"),
    ("the broad institute", "Broad"),
    ("sanger", "Sanger"),
]
# Parallel lists, in the same order as the pairs above.
labels = [raw_term for raw_term, _ in org_label_pairs]
use_labels = [display_name for _, display_name in org_label_pairs]


In [336]:
# Terms in the row order of the clustered heat map; the new 0..n-1 index is the plotted row position.
reordered = matrix_df.iloc[g.dendrogram_row.reordered_ind].reset_index()['index'].to_frame()

# Normalised (lower-cased, stripped) term -> row position, built once instead of
# scanning the frame for every label. The first occurrence wins, like `.index[0]` did.
position_of = {}
for row_position, normalised_term in zip(reordered.index, reordered['index'].str.lower().str.strip()):
    position_of.setdefault(normalised_term, row_position)


def _dendrogram_position(term):
    """Return the heat-map row position of `term` (case-insensitive); KeyError if it is not a row."""
    try:
        return position_of[term.lower()]
    except KeyError:
        raise KeyError(f"{term!r} is not a row of matrix_df") from None


# (term, display label) pairs ordered by dendrogram position.
# The old sort key `x[1]` picked the display label out of the (term, label, position)
# triples, so organisations and people were ordered alphabetically instead of by position.
orgs = sorted(zip(labels, use_labels), key = lambda pair: _dendrogram_position(pair[0]))


labels = [  "j craig venter",  "james d watson", "harold e varmus",\
         "elke jordan", "eric s lander", \
         "francis s collins",\
          "eric d green"
        ]
use_labels = [ "Venter",  "Watson",  "Varmus",\
              "Elke Jordan", "Lander", \
             "Collins",\
              "Green"
             ]

people = sorted(zip(labels, use_labels), key = lambda pair: _dendrogram_position(pair[0]))

# MeSH UIs ordered by dendrogram position (these were already sorted by position before).
E_labels = sorted(E_labels, key = _dendrogram_position)

G_labels = sorted(G_labels, key = _dendrogram_position)

In [None]:
# Show the ordered (term, display label) pairs for the organisations.
orgs

In [None]:
# Four thin "track" axes (ax1..ax4), each followed by a taller invisible spacer axis.
# ax1 carries the people labels, ax2 the organisations, ax3 the E_labels and ax4 the G_labels.
fig, (ax1, space1, ax2, space2, ax3, space3, ax4, space4) = plt.subplots(
    8, 1, gridspec_kw={'height_ratios': [1, 2, 1, 2, 1, 2, 1, 2]}, figsize=(24, 10), dpi = 300)
track_axes = (ax1, ax2, ax3, ax4)
spacer_axes = (space1, space2, space3, space4)

# On every track, draw one horizontal segment per cluster (ascending cluster id),
# as long as the cluster has members and coloured with the cluster's palette entry.
lines = sorted(Counter(clusters).items())
cumsum = 0
for cluster_id, cluster_size in lines:
    x = np.arange(cumsum, cumsum + cluster_size)
    y = np.zeros(cluster_size)
    cumsum += cluster_size
    for track in track_axes:
        track.plot(x, y, color = pal[cluster_id - 1], solid_joinstyle = "miter", solid_capstyle = 'butt')

# Hide all frame lines; tracks keep an x axis through y = 0, spacers are switched off entirely.
for axis in track_axes + spacer_axes:
    for side in ('right', 'top', 'bottom', 'left'):
        axis.spines[side].set_linewidth(0)
for track in track_axes:
    track.get_yaxis().set_ticks([])
    track.spines['bottom'].set_position('zero')
for spacer in spacer_axes:
    spacer.axis('off')

# Lower-cased MeSH UI -> main heading text. `.copy()` avoids assigning into a slice of `mesh`.
temp = mesh.loc[mesh.qualifier == "MH"].copy()
temp['UI'] = temp['UI'].str.lower()
translate_mesh = temp.set_index("UI")['value'].to_dict()

# Terms in the row order of the clustered heat map; the index is the x position on the tracks.
reordered = matrix_df.iloc[g.dendrogram_row.reordered_ind].reset_index()['index'].to_frame()
normalised_terms = reordered['index'].str.lower().str.strip()


def _locate_term(term):
    """Return (x position, term as stored in the matrix index) for `term`, matched case-insensitively."""
    hits = reordered.loc[normalised_terms == term.lower()]
    return hits.index[0], hits['index'].iloc[0]


def _place_label(axis, position, text, colour, above):
    """Write a vertical label at `position`, on the upper (`above=True`) or lower side of the track.

    Only `horizontalalignment` is passed: supplying it together with its alias `ha`
    is redundant, and recent matplotlib versions reject the duplicate.
    """
    if above:
        axis.text(position, 0.02, text, fontsize=12, rotation=90, rotation_mode = "anchor",
                  horizontalalignment='left', verticalalignment='top', color = colour)
    else:
        axis.text(position, -0.02, text, fontsize=12, rotation=90, rotation_mode = "anchor",
                  horizontalalignment='right', verticalalignment='bottom', color = colour)


def _label_mesh_track(axis, mesh_uis, abbreviations):
    """Label `mesh_uis` on `axis`, alternating sides, and put a tick at every labelled position.

    `abbreviations` maps a lower-cased UI to a short label that replaces the MeSH heading.
    The UI is lower-cased before the lookup; the old literal comparisons
    (e.g. `label == 'd055106'`) could not match upper-case UIs.
    """
    tick_positions = []
    for i, ui in enumerate(mesh_uis):
        position, term = _locate_term(ui)
        text = abbreviations.get(ui.lower(), translate_mesh[ui.lower()])
        _place_label(axis, position, text, pal[get_color(term)], above = (i % 2 == 0))
        tick_positions.append(position)
    axis.set_xticks(tick_positions, [])
    axis.xaxis.set_tick_params(length=7)


# --- ax2: organisations. Upper labels get inward ticks on ax2, lower labels outward ticks on a secondary axis.
ax2.tick_params(axis="x", direction='in', length=7)
ax2_bottom = ax2.secondary_xaxis("bottom")
ax2_bottom.tick_params(axis="x", direction="out", length=7)
ticks = []
bottom_ticks = []
for i, (raw_term, display_name) in enumerate(orgs):
    position, term = _locate_term(raw_term)
    above = i % 2 == 0
    _place_label(ax2, position, display_name, pal[get_color(term)], above)
    (ticks if above else bottom_ticks).append(position)
ax2.set_xticks(ticks,  [])
ax2_bottom.set_xticks(bottom_ticks, [])

# --- ax3: E_labels, using the MeSH heading except for the abbreviation below.
use_labels = [f"{translate_mesh[i.lower()]}" for i in E_labels]
_label_mesh_track(ax3, E_labels, {"d055106": "GWAS"})

# --- ax1: people, each with a vertical guide line through the label position.
ticks = []
for i, (raw_term, display_name) in enumerate(people):
    position, term = _locate_term(raw_term)
    above = i % 2 == 0
    ax1.axline((position, 0.1 if above else -0.1), (position, 0.01))
    _place_label(ax1, position, display_name, pal[get_color(term)], above)
    ticks.append(position)
ax1.set_xticks(ticks,  [])
ax1.xaxis.set_tick_params(length=7)

# --- ax4: G_labels, using the MeSH heading except for the abbreviations below.
use_labels = [f"{translate_mesh[i.lower()]}" for i in G_labels]
_label_mesh_track(ax4, G_labels, {"d040641": "QTLs", "d022202": "BACs", "d018244": "YACs"})

plt.show()

In [253]:
# Look up each term's category; terms missing from categories_dict are labelled "org".
looked_up_category = terms['term'].map(categories_dict)
terms['category'] = looked_up_category.fillna("org")

In [None]:
# (rows, columns) of the terms table.
terms.shape

In [380]:
def prepare_for_export(x):
    """Return the text that should be exported for one row of the terms table.

    `x` is a row-like mapping with the keys 'term', 'category' and 'mesh_term'.

    * Seven explicitly listed people keep their name unchanged.
    * Every other row whose category is "people" is replaced by "PERSON".
    * "phenomena" and "technique" rows become "<TERM UPPER-CASED> (<mesh_term stripped>)".
    * Anything else is exported as the term itself.
    """
    kept_names = ("eric s lander", "francis s collins", "james d watson", 'elke jordan', 'harold e varmus',
                  'bill clinton', 'j craig venter')
    term = x['term']
    if term in kept_names:
        return term
    category = x['category']
    if category == "people":
        return "PERSON"
    if category in ("phenomena", "technique"):
        return term.upper() + " (" + x['mesh_term'].strip() + ")"
    return term
        

In [381]:
# Replace every term by its export text, row by row.
# NOTE: this overwrites the `term` column in place, so lookups by the original term
# (e.g. get_color) no longer match afterwards, and re-running the cell formats the text again.
terms['term'] = terms.apply(prepare_for_export, axis = 1)

In [382]:
# Drop rows with an empty term, collect the terms of every (category, cluster) pair
# in cluster order, then expand the lists again into one row per term.
non_empty_terms = terms.loc[terms.term != ""]
terms_per_group = non_empty_terms.sort_values('cluster').groupby(["category", "cluster"]).agg({"term": list})
terms = terms_per_group.reset_index().explode('term')

In [383]:
# Write the cluster / category / term table to the cache directory, without the index column.
export_columns = ["cluster", "category", "term"]
terms[export_columns].to_csv("../cache/significant_keywords.csv", index = False)

In [None]:
# Non-null counts of the remaining columns for each category.
terms.groupby("category").count()

In [None]:
# Rows of the terms table whose category is exactly "technique".
terms.loc[terms.category == "technique"]

In [None]:
# Number of entries in all_essential_terms (duplicates included).
len(all_essential_terms)

In [None]:
# For every HGP-vs-project comparison, collect the lower-cased words flagged both
# `rejected` and `powerful`, tagged with the project being compared against HGP.
comparisons = [("HGP", "sequence"), ("HGP", "HapMap"), ("HGP", "modENCODE"), ("HGP", "ENCODE")]
words_per_project = []
for pair in comparisons:
    comparison = contain_df[pair]
    flagged_words = comparison.loc[(comparison.rejected) & (comparison.powerful), 'word']
    words_per_project.append((flagged_words.str.lower().to_list(), pair[1]))

# One row per (word, project).
# NOTE: from here on `all_essential_terms` is a DataFrame, no longer the term list used earlier in the notebook.
all_essential_terms = pd.DataFrame(words_per_project, columns = ["word", "project"]).explode("word")
all_essential_terms['category'] = all_essential_terms['word'].map(categories_dict)
# `.copy()` makes the filtered frame independent, so the columns added to it
# in later cells do not trigger SettingWithCopyWarning.
all_essential_terms = all_essential_terms[all_essential_terms.category == "technique"].copy()
all_essential_terms.word.unique().shape

In [None]:
# MeSH descriptor records with lower-cased UIs, restricted to the essential
# technique terms and to rows with qualifier "DA".
# NOTE(review): "DA" rows are parsed as YYYYMMDD dates below; presumably the date the descriptor entered MeSH - confirm.
description = nlm.mesh('descriptor')
description['UI'] = description['UI'].str.lower()
is_essential_term = description.UI.isin(all_essential_terms['word'].to_list())
# `.copy()` so the column assignment below does not write into a slice of `description`.
dates_of_mesh = description.loc[is_essential_term & (description.qualifier == "DA")].copy()
# Keep only the year. pd.to_datetime parses the whole column at once and also
# copes with an empty selection, where `.apply(strptime)` followed by `.dt` fails.
dates_of_mesh['value'] = pd.to_datetime(dates_of_mesh['value'], format = "%Y%m%d").dt.year


In [None]:
# One row per (document, lower-cased entry of its `mesh` list) ...
mesh_per_document = all_merged.explode("mesh").assign(mesh = lambda frame: frame['mesh'].str.lower())
# ... inner-joined with the essential technique terms on the `mesh` / `word` values.
documents_with_essential_terms = mesh_per_document.merge(all_essential_terms, left_on = 'mesh', right_on = 'word')
# Distinct (mesh, document, project) combinations.
documents_with_essential_terms[['mesh', 'docID', 'project']].drop_duplicates()

In [None]:

# Per-node dates from the cached parquet file.
dates = pd.read_parquet("../cache/pdfs_word_excel_powerpoint_031924_with_dates_for_all.parquet")
# Distinct (mesh, document, project) rows, joined to the dates by docID == nodeID.
essential_term_documents = documents_with_essential_terms[['mesh', 'docID', 'project']].drop_duplicates()
dates_merged = essential_term_documents.merge(dates, left_on = "docID", right_on = "nodeID")
dates_merged['year'] = dates_merged['date'].dt.year
# Number of distinct MeSH terms that have at least one matched row.
dates_merged.mesh.unique().shape

In [426]:
# Earliest document year per MeSH term (the smallest `year` within each `mesh` group).
mesh_to_earliest_doc_year = dates_merged.groupby('mesh')['year'].min().to_dict()
# Year stored in `value` for each MeSH UI in dates_of_mesh.
mesh_to_entry_year = dates_of_mesh.set_index("UI")['value'].to_dict()

essential_words = all_essential_terms['word']
all_essential_terms['doc_year'] = essential_words.map(mesh_to_earliest_doc_year)
all_essential_terms['mesh_year'] = essential_words.map(mesh_to_entry_year)

In [427]:
# Years between the MeSH entry year and the earliest document year of each term
# (negative: the earliest document predates the MeSH entry).
all_essential_terms['diff'] = all_essential_terms['doc_year'].sub(all_essential_terms['mesh_year'])

In [None]:
# Number of distinct words whose earliest document year is not before their MeSH year (diff >= 0).
all_essential_terms.loc[all_essential_terms['diff'] >= 0 ].word.unique().shape

In [None]:
# Number of distinct words whose earliest document year precedes their MeSH year (diff < 0).
all_essential_terms.loc[all_essential_terms['diff'] < 0 ].word.unique().shape

In [432]:
# Start year assigned to each project (no entry for HGP).
start_of_each_project = dict(
    ENCODE=2003,
    modENCODE=2003,
    sequence=2002,
    HapMap=2002,
)

`../cache/separate_genomic_techniques_determined.csv` is provided in the repository. The next cell reads its `name`, `mesh_year`, `doc_year`, `count`, and `genomic` columns.

In [433]:
# Load the technique table shipped with the repository, keeping five columns.
technique_columns = ['name', 'mesh_year', 'doc_year', 'count', 'genomic']
determined_genetic_techniques = pd.read_csv("../cache/separate_genomic_techniques_determined.csv")[technique_columns]
# Years between the MeSH year and the document year of each technique.
determined_genetic_techniques['diff'] = determined_genetic_techniques['doc_year'].sub(determined_genetic_techniques['mesh_year'])

In [440]:
# Technique rows only. `.copy()` gives an independent frame, so the columns added
# below and in the following cells do not write into a slice of all_essential_terms.
just_techniques = all_essential_terms[
    all_essential_terms['category'] == "technique"
].copy()

# MeSH heading for each UI; raises KeyError for a word missing from translate_mesh.
just_techniques['name'] = just_techniques['word'].apply(lambda x: translate_mesh[x])

In [441]:
# Attach the `genomic` value of each technique, looked up by name in the provided table.
# NOTE(review): names without an exact match in the table come out as NaN - confirm every name matches.
genomic_by_name = determined_genetic_techniques.set_index("name")['genomic'].to_dict()
just_techniques['genomic'] = just_techniques['name'].map(genomic_by_name)

In [442]:
# Attach the start year of each row's project, looked up in start_of_each_project.
just_techniques['project_year'] = just_techniques['project'].map(start_of_each_project)

In [None]:
# Number of distinct technique words.
just_techniques.word.unique().shape

In [445]:
# `temp` is a second name for the same DataFrame object, not a copy:
# columns assigned through `temp` also appear on just_techniques.
temp = just_techniques

In [446]:
# Manual override: the first document year of MeSH term d000081246 is set to 2009.
# NOTE(review): the reason for this correction is not recorded in the notebook.
temp.loc[temp['word'] == "d000081246", 'doc_year'] = 2009
# Years between the project's start and the term's first document year
# (negative: the term appears in documents before the project started).
# Plain column arithmetic replaces the row-by-row `apply`.
temp['diff_project'] = temp['doc_year'] - temp['project_year']

In [None]:
# Number of distinct words among the rows where `genomic` is true.
temp.loc[temp.genomic].word.unique().shape

In [None]:
# Distinct genomic words whose first document year is earlier than the project start year.
temp.loc[(temp.genomic) & (temp.diff_project < 0)].word.unique().shape

In [None]:
# Distinct genomic words whose first document year is the project start year or later.
temp.loc[(temp.genomic) & (temp.diff_project >= 0)].word.unique().shape

# Figure 2D

In [None]:
# modENCODE: "#AD5D95"
# ENCODE: "#095393"
# LSAC: "#51AF4D"
# HapMap: "#E1BE15"

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# Plot styling: white seaborn theme with tick marks on the bottom and left axes.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)  # was the string 'false'; pass a real boolean
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    'xtick.major.size': 2, 'xtick.major.width': 0.5,
    'xtick.minor.size': 2, 'xtick.minor.width': 0.5,
    'ytick.major.size': 2, 'ytick.major.width': 0.5,
    'ytick.minor.size': 2, 'ytick.minor.width': 0.5,
    'axes.labelsize': 7,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'font.size': 7,
})

fig, ax = plt.subplots(figsize=(4, 2), dpi=300)

# Fixed top-to-bottom row order, shared by both layers. Without an explicit `order`
# each swarmplot call lays out only the projects present in its own subset, so the
# red points could land on a different row than the black points of the same project.
project_order = ["HapMap", "sequence", "ENCODE", "modENCODE"]

genomic_rows = temp.loc[temp.genomic]
is_highlighted = genomic_rows.word == "d055106"

# MeSH term d055106 is drawn in red, every other genomic technique in black.
# `color=` replaces `palette=[...]`, which needs a `hue` variable to be well defined.
for subset, colour in ((genomic_rows.loc[is_highlighted], "red"), (genomic_rows.loc[~is_highlighted], "k")):
    sns.swarmplot(data = subset, x = 'diff_project', y = 'project', order = project_order,
                  color = colour, size = 2, ax = ax)

# Remove a legend only if one was created (avoids creating an empty legend just to delete it).
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.tick_params(axis='x', colors='black')
ax.tick_params(axis='y', colors='black')
ax.yaxis.label.set_color('black')
ax.xaxis.label.set_color('black')
ax.spines['bottom'].set_linewidth(0.5)
ax.spines['left'].set_linewidth(0.5)
sns.despine()

# Dashed vertical line at 0, spanning the full axes height (axvline's ymin/ymax are axes fractions in [0, 1]).
ax.axvline(0, c = 'k', linewidth = 0.5, linestyle = '--')

ax.set_ylabel("")
ax.set_xlim(-40, 30)
ax.set_xlabel("Year of appearance relative to project", fontsize = 10)
ax.set_xticks(np.arange(-40, 30, 10))
plt.tight_layout()
plt.savefig('../cache/genomic_techniques_project_start.pdf')


