In [None]:
import json

import sys
sys.path.append('../src/')

from utils_tiramisu import *

import re
from dateutil import parser
import dateparser
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

from datetime import datetime

We first extract the dates of MS Office documents. Because the timestamps written when a file is created from a template or re-saved can be corrupted, we take the median of the dates found in the MS Office metadata.

In [None]:
def get_date(path):
    """Return the median date found in the metadata of an extracted MS Office document.

    Parameters
    ----------
    path : str
        Path to a JSON file whose top-level ``'metadata'`` mapping may contain
        the keys ``'dcterms:created'``, ``'custom:_DCDateCreated'`` and
        ``'dcterms:modified'``. Each value is either one date string or a list
        of date strings.

    Returns
    -------
    datetime.date
        The middle element (upper median for even counts) of all dates that
        could be parsed from those keys.

    Raises
    ------
    IndexError
        If no date could be parsed from the file.
    """
    with open(path, 'r') as f:
        metadata = json.load(f)['metadata']

    all_dates = []
    for key in ('dcterms:created', 'custom:_DCDateCreated', 'dcterms:modified'):
        if key not in metadata:
            continue
        raw = metadata[key]
        # Normalise the scalar case so every key goes through one code path.
        values = raw if isinstance(raw, list) else [raw]
        for value in values:
            try:
                all_dates.append(parser.parse(value).date())
            except (ValueError, OverflowError, TypeError):
                # Best effort: report the unparsable value and keep the rest,
                # instead of a bare `except` that hides every error.
                print(f"get_date: could not parse {key}={value!r} in {path}")

    if not all_dates:
        # Same exception type the original indexing raised, with a usable message.
        raise IndexError(f"no parsable date metadata found in {path}")
    return sorted(all_dates)[len(all_dates)//2]

In [None]:
# get date extractable documents from corpus

# MS Office files (Word / PowerPoint / Excel extensions) that sit directly in a folder.
# NOTE(review): `return_from_neo4j` is not defined in this notebook; presumably it is
# provided by `from utils_tiramisu import *` and returns a DataFrame — confirm.
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx', 'xlsx', 'xls'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as file_extension
""")

# PDFs: follows CONTAINS -> SPLIT_INTO -> CONVERT_TO and returns the nodeID of the
# SPLIT_INTO node (c) together with the path and extension of the original PDF (e).
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as file_extension
""")

# Combined nodeID -> path / extension lookup used by the merges further down.
folder_structure = pd.concat([all_pdfs, all_ms])

# Displayed as the cell output.
all_ms

The folders read below are created by running the text extraction in [start_here.ipynb](../start_here.ipynb).

In [None]:
# Collect the median metadata date and file extension of every MS Office document,
# keeping only documents dated 2015 or earlier.
all_dates, date_types = [], []
for node_id, extension in zip(all_ms['nodeID'], all_ms['file_extension']):
    doc_date = get_date(f"../text_extraction/ms_tika/{node_id}.json")

    if doc_date.year > 2015:
        continue
    all_dates.append(doc_date)
    date_types.append(extension)
    

We ignore documents dated 2016 or later; some of them were checked and found to carry corrupted dates in the MS files, possibly because staff re-saved them.

In [None]:
# One row per retained MS Office document, ordered chronologically.
# NOTE(review): this frame has no 'nodeID' column, but a later cell merges
# `date_df` on 'nodeID' — confirm how that column is meant to be added.
date_df = pd.DataFrame({"date": all_dates, "type": date_types}).sort_values('date')
date_df

In [None]:
# Collapse file extensions into application names.
# `replace` (unlike `map`) leaves already-converted values untouched, so
# re-running this cell does not turn the whole column into NaN.
date_df['type'] = date_df['type'].replace({"ppt": "powerpoint", "doc": "word", "pptx": "powerpoint", "docx": "word", "xls": "excel", "xlsx": "excel"})

We now extract the dates from PDF documents.

`../cache/pdfs_word_excel_powerpoint_010924.parquet` is a Pandas DataFrame that contains the combined texts of the scanned/electronic PDFs and MS documents. Its columns are `text`, the raw text, and `nodeID`, the node ID of either a split single-page PDF or an MS document.

In [None]:
# first load all the PDF text segregated by documents

# this is the compilation of all of the extracted text
together = pd.read_parquet(
    "../cache/pdfs_word_excel_powerpoint_010924.parquet"
)



# NOTE(review): `map_nodeID_to_docID` is used here before any visible definition.
# It is expected to be a DataFrame with 'nodeID', 'page' and 'documentID' columns,
# and it is rebound to a dict two lines below, so this cell cannot be re-run as is.
map_nodeID_to_page = map_nodeID_to_docID.set_index('nodeID').to_dict()['page']
# map_nodeID_to_path = map_nodeID_to_docID.set_index("nodeID").to_dict()['path']
map_nodeID_to_docID = map_nodeID_to_docID.set_index('nodeID').to_dict()['documentID']

# nodeIDs missing from the lookups keep their own nodeID as docID and get page 0.
together['docID'] = together['nodeID'].apply(lambda x: map_nodeID_to_docID[x] if x in map_nodeID_to_docID else x)
together['page'] = together['nodeID'].apply(lambda x: map_nodeID_to_page[x] if x in map_nodeID_to_page else 0)
# Inner join: rows whose nodeID is absent from folder_structure are dropped.
together = pd.merge(together, folder_structure, left_on = 'nodeID', right_on = 'nodeID')

# Trailing space so that page texts do not run together when concatenated below.
together['text'] = together['text'].apply(lambda x: x + " ")

# One row per docID: page texts concatenated in page order, paths collected as a set.
together = together.sort_values(['docID', 'page']).groupby('docID').agg({"text": "sum", "path": set}).reset_index()

# Keep a single path per document (an arbitrary element if the set has several).
together['path'] = together['path'].apply(lambda x: list(x)[0])
together['text'] = together['text'].str.lower()

In [None]:
# Formats matched by `patternone`:
# mm/dd/yyyy
# mm/dd/yy
# month day, year
# month abbreviation day, year
# mm.dd.yyyy (and mm-dd-yyyy)
## handles ordinal suffixes st/nd/rd/th
# Raw strings keep the regex backslashes (\/ \. \s \d) from being read as
# (invalid) Python string escapes.

patternone = re.compile(r"(((1[0-2]|0?[1-9])(\/|-|\.)(3[01]|[12][0-9]|0?[1-9])(\/|-|\.)(?:[0-9]{2})?[0-9]{2})|((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\s+\d{1,2}(st|nd|rd|th)?,?\s+\d{4}))",re.IGNORECASE )



# yyyy/mm/dd (years 1970-2099, with per-month day limits and a leap-year branch for 02/29)
patterntwo = re.compile(r"((19[7-9][0-9]|20[0-9]{2})/((0?[13578]|1[02])/(0?[1-9]|[12][0-9]|3[01])|(0?[469]|11)/(0?[1-9]|[12][0-9]|30)|0?2/(0?[1-9]|1[0-9]|2[0-8]))|(19([79][26]|8[048])|20([02468][048]|[13579][26]))/0?2/29)", re.IGNORECASE)



## day month, year (day may carry an ordinal suffix st/nd/rd/th)

patternthree = re.compile(r"((\d{1,2})(st|nd|rd|th)?\s+((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?),?(\s+\d{4})))", re.IGNORECASE)


In [None]:
def get_date_pdf(text):
    """Extract plausible dates from a document's text.

    Runs `patternone`, `patterntwo` and `patternthree` over `text` in that
    order, parses the full match of every hit with `dateparser.parse`, and
    returns the parsed values whose year lies in 1980-2020 inclusive.
    Matches that `dateparser` cannot parse are skipped.
    """
    found = []

    for pattern in (patternone, patterntwo, patternthree):
        for groups in pattern.findall(text):
            # groups[0] is the outermost capture group, i.e. the whole matched date
            parsed = dateparser.parse(groups[0])
            if parsed is None:
                continue

            # We do not expect dates after 2016, but the cut-off is kept at 2020
            # so that implausibly late extractions stay visible for review.
            if 1980 <= parsed.year <= 2020:
                found.append(parsed)

    return found

In [None]:
# Extract all candidate dates from each document's text.
# The extractor defined above is `get_date_pdf`; `get_dates` is not defined in this notebook.
together['dates'] = together['text'].progress_apply(get_date_pdf)

We get the dates from PDFs using the following rules:
- only take dates from the first two pages (this avoids references, and the date of a document is usually found on its first two pages)
- only consider documents whose dates fall within one month of one another. Since our granularity is month/year, we only care about month-level detail.
- take the median of the valid dates.

In [None]:
def within_one_month(listofdates):
    """Return True when the earliest and latest dates are less than two whole months apart.

    This is the condition the original code expressed with ``relativedelta``:
    zero elapsed years and at most one completed calendar month between the
    minimum and the maximum of `listofdates`. It is written with
    ``pd.DateOffset`` so the cell needs nothing beyond pandas, and as a single
    expression (the original two-line condition was a syntax error).

    Parameters
    ----------
    listofdates : non-empty sequence of date-like values accepted by ``pd.Timestamp``.

    Raises
    ------
    ValueError
        If `listofdates` is empty (raised by ``min``).
    """
    earliest = pd.Timestamp(min(listofdates))
    latest = pd.Timestamp(max(listofdates))

    # Fewer than two completed months have elapsed exactly when `latest` falls
    # before `earliest` shifted by two calendar months (end-of-month clipped).
    return latest < earliest + pd.DateOffset(months=2)

def get_median_date(date_list):
    """Return the middle element of `date_list` after sorting (upper median for even lengths)."""
    ordered = sorted(date_list)
    middle = len(ordered) // 2
    return ordered[middle]


In [None]:
# Build `all_documents_with_dates`: one row per document with a usable date.
# NOTE(review): this cell expects `together` to carry 'nodeID', 'documentID', 'page',
# 'fileExtension' and 'dates' columns and uses `nhgri_with_dates`; none of these are
# produced by the cells visible above — confirm the upstream state before running.
merged = pd.merge(together, folder_structure, on = 'nodeID')

pdfs_with_paths = merged.loc[merged.documentID.notna()]

together = pd.concat([merged.loc[merged.fileExtension != 'pdf'], pdfs_with_paths])
together['documentID'] = together.apply(lambda x: x['nodeID'] if x['documentID'] is None else x['documentID'], axis = 1)

together["text"] = together['text'].apply(lambda x: x + " ")
together = together.sort_values(['documentID', 'page'])


# take the dates in the first two pages
per_document = together.loc[together.fileExtension == 'pdf'].groupby('documentID').head(2)

per_document = per_document.reset_index().groupby('documentID').agg({"fileExtension": "first", "text": sum,  "dates" : list})

# Flatten the per-page date lists into one list per document
# (plain comprehension, so no un-imported `itertools.chain` is needed).
per_document['dates'] = per_document['dates'].apply(lambda x: [d for page_dates in x for d in page_dates])

# `.copy()` so the column assignments below write to an independent frame
# rather than to a slice of `per_document`.
valid_pdf = per_document.loc[(per_document.fileExtension == "pdf") & (per_document.dates.str.len() > 0)].copy()

valid_pdf['filtered_month'] = valid_pdf['dates'].apply(within_one_month)
valid_pdf['median_date'] = valid_pdf['dates'].apply(get_median_date)

valid_pdf_by_month = valid_pdf.loc[valid_pdf.filtered_month].reset_index()

valid_ms = pd.merge(date_df, nhgri_with_dates, on = 'nodeID', how = 'left')[['date', 'type', 'nodeID', 'text']]
valid_ms['date'] = valid_ms['date'].apply(lambda x: pd.Timestamp(x))

# Truncate PDF dates to the first day of their month.
# `datetime` is the class here (`from datetime import datetime`), so it is called
# directly; `datetime.datetime(...)` would raise AttributeError.
valid_pdf_by_month['date'] = valid_pdf_by_month['median_date'].apply(lambda x: \
                                        datetime(pd.Timestamp(x).year, pd.Timestamp(x).month, 1))

# Rename positionally: documentID -> nodeID, fileExtension -> type.
valid_pdf_by_month.columns = ['nodeID', 'type', 'text', 'dates', 'filtered_month', 'median_date', 'date']

all_documents_with_dates = pd.concat([valid_pdf_by_month[['type', 'text', 'date', 'nodeID']], valid_ms[['date', 'type', 'text', 'nodeID']]])



In [None]:
# share of documents with a valid date (a fraction between 0 and 1)
# NOTE(review): 22843 is presumably the total number of documents in the corpus — confirm.

len(all_documents_with_dates) / 22843

In [None]:

# Figure: stacked histogram of dated documents by file type (bottom panel).
# The top panel is only styled in this cell; nothing is drawn on it here.
from matplotlib.ticker import MaxNLocator, MultipleLocator, PercentFormatter

sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

SMALL_SIZE = 7
MEDIUM_SIZE = 10
BIGGER_SIZE = 12

matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize= SMALL_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=SMALL_SIZE)  # fontsize of the figure title

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex=False)
matplotlib.rcParams['axes.unicode_minus'] = False

# Tick geometry; `ytick.minor.width` is set as well, matching the other figure cells.
for tick_axis in ('xtick', 'ytick'):
    for tick_kind in ('major', 'minor'):
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.size'] = 2
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.width'] = 0.5

# `datetime` is the class (`from datetime import datetime`), so the limits are
# built by calling it directly; `datetime.date(1988, 1, 1)` raises TypeError.
x_limits = [datetime(1988, 1, 1), datetime(2020, 1, 1)]

fig, ax = plt.subplots(2, 1, figsize = (4, 4), dpi = 300, gridspec_kw={'height_ratios': [2, 1]} )

# Styling shared by both panels: no top/right spines, thin black left/bottom spines.
for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)
    axis.spines['bottom'].set_color('black')
    axis.spines['left'].set_color('black')
    axis.xaxis.label.set_color('black')
    axis.tick_params(axis='x', colors='black')
    axis.yaxis.label.set_color('black')
    axis.tick_params(axis='y', colors='black')
    axis.spines['bottom'].set_linewidth(0.5)
    axis.spines['left'].set_linewidth(0.5)
    axis.set_xlim(x_limits)

# Top panel: vertical grid lines only, no y ticks or y tick labels.
ax[0].grid(True, which = 'major', axis = 'x')
ax[0].set_yticks([])
ax[0].yaxis.set_tick_params(labelleft=False)

# Bottom panel: document counts in 35 evenly spaced bin edges between 1988 and 2020.
bins = pd.date_range(start='1988-1-1',
                  end='2020-1-1',
                  periods=35)
ax[1].set_ylabel("Documents")
ax[1].yaxis.set_major_locator(MaxNLocator(prune='lower'))
ax[1].locator_params(axis='y', nbins=5)

# NOTE(review): `np` and `cmap_key` are not defined in this notebook; presumably both
# come from `from utils_tiramisu import *` — confirm.
doc_types = ['pdf', 'word', 'powerpoint', 'excel']
dates_by_type = [
    np.array(all_documents_with_dates.loc[all_documents_with_dates.type == doc_type]['date'].to_list()).flatten()
    for doc_type in doc_types
]
ax[1].hist(dates_by_type, bins, stacked=True,
           color = [cmap_key(doc_type) for doc_type in doc_types], linewidth = 0.3)

fig.tight_layout()
# plt.savefig('../cache/pdf-powerpoint-word-time-period-with-parta_240313.pdf', transparent = True, bbox_inches='tight')
plt.show()

As a temporal control, we track simple terms whose periods of use have clear start and end dates.

In [None]:
# Alias lists for each tracked entity. Keys are the entity labels used in the
# plots below; values are the lower-case strings searched for in document text.
terms = {
    "nhgri": [
        "nhgri",
        "national human genome research institute",
    ],
    "nchgr": [
        "nchgr",
        "national center for human genome research",
    ],
    "ohgr": [
        "ohgr",
        "office of human genome research",
    ],
    # current director of NHGRI
    "green": [
        "eric green",
        "e green",
        "e. green",
        "green, eric",
        "green, e",
    ],
    # former director of NIH
    "varmus": [
        "varmus",
        "harold varmus",
        "h. varmus",
        "h varmus",
        "varmus, harold",
        "varmus, h",
        "varmus, h.",
    ],
    # former director of NHGRI, HGP
    "collins": [
        "francis collins",
        "f. collins",
        "f collins",
        "collins, francis",
        "collins, f",
        "collins, f.",
        "fc",
    ],
    # NOTE(review): the original comment here read "former director of NIH";
    # confirm the intended role description for Watson.
    "watson": [
        "watson",
        "j. watson",
        "j watson",
        "james watson",
        "watson, james",
        "watson, j",
        "watson, j.",
    ],
    # private effort to sequence a human genome
    "celera": [
        "celera",
    ],
    # the five main sequencing centers
    "g5": [
        "g5",
        "baylor college of medicine",
        "bcm",
        "broad institute",
        "broad/mit",
        "whitehead",
        "whitehead/mit",
        "joint genome institute",
        "jgi",
        "washu",
        "wustl",
        "washington university in st. louis",
        "washington university in st louis",
        "washington university at st louis",
        "washington university at st. louis",
        "sanger institute",
        "wellcome sanger",
    ],
}

In [None]:
# Keep documents that have text and whose date falls in 1988-2012 (inclusive).
has_text = all_documents_with_dates.text.notna()
all_documents_with_dates = all_documents_with_dates.loc[has_text]

in_study_window = all_documents_with_dates.date.dt.year.between(1988, 2012)
all_documents_with_dates = all_documents_with_dates.loc[in_study_window]

In [None]:
# For every document and every entity in `terms`, record whether any alias of
# that entity occurs in the document text.
# The patterns are compiled once here instead of being rebuilt for each row and term.
# NOTE(review): aliases are interpolated unescaped, so the '.' in e.g. "f. collins"
# matches any character; kept as in the original so results do not change.
term_patterns = {
    term: re.compile(r"\b(" + r"|".join(aliases) + r")\b")
    for term, aliases in terms.items()
}

list_of_entities = []

for _, row in tqdm(all_documents_with_dates.iterrows(), total = all_documents_with_dates.shape[0]):
    lowered_text = row['text'].lower()
    for term, pattern in term_patterns.items():
        # One (date, matched?, term) record per document/term pair.
        list_of_entities.append((row['date'], pattern.search(lowered_text) is not None, term))

# Column names are kept as downstream cells use them:
# 'entity' holds the boolean match flag and 'text' holds the term key.
entities_df = pd.DataFrame(list_of_entities, columns = ["date", "entity", "text"])
entities_df['year'] = entities_df.date.dt.year



In [None]:
# Figure: per-year percentage of documents mentioning each control term,
# in three stacked panels (institute names, directors, Celera vs. G5).
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

# Returns a palette object; on its own this call does not change the active palette.
sns.color_palette("Set1")

matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False

for tick_axis in ('xtick', 'ytick'):
    for tick_kind in ('major', 'minor'):
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.size'] = 2
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.width'] = 0.5

fig, ax = plt.subplots(3, 1, figsize = (60 * (1/2.54/10), 120 * (1/2.54/10)), dpi = 300 )

linewidth = 0.5


def _percent_true(x):
    """Percentage of True flags among the values aggregated for one year."""
    return sum(x==1)*100.0/len(x)


def _plot_term_share(axis, term_keys, label, color):
    """Draw the yearly match percentage for the rows whose term key is in `term_keys`."""
    subset = entities_df.loc[entities_df.text.isin(term_keys)]
    sns.lineplot(data = subset, x = 'year', y = 'entity', ax = axis, label = label, color = color,
                 estimator = _percent_true, linewidth = linewidth)


def _mark_year(axis, year):
    """Draw a dashed vertical reference line at `year`."""
    axis.axvline(x = year, ymin = 0, ymax = 1, color = 'black', alpha = 0.6,
                 linestyle = "dashed", linewidth = linewidth)


# Panel 0: institute names.
_plot_term_share(ax[0], ['ohgr'], "OHGR", "#FF8811")
_mark_year(ax[0], 1989)
_mark_year(ax[0], 1997)
_plot_term_share(ax[0], ['nchgr'], "NCHGR", "#C65B7C")
_plot_term_share(ax[0], ['nhgri'], "NHGRI", "#4B2142")

# Panel 1: directors.
_plot_term_share(ax[1], ['watson'], "Watson", "#FF8811")
_plot_term_share(ax[1], ['collins'], "Collins", "#C65B7C")
_plot_term_share(ax[1], ['green'], "Green", "#4B2142")
_mark_year(ax[1], 1993.33)
_mark_year(ax[1], 2008.67)

# Panel 2: Celera vs. the G5 sequencing centers.
_mark_year(ax[2], 2001.17)
_plot_term_share(ax[2], ['celera'], "Celera", "#FF8811")
# Of these keys, only 'g5' is defined in the `terms` cell of this notebook;
# the others are kept so the selection is unchanged.
_plot_term_share(ax[2], ['g5', 'broad', 'whitehead', 'washu', 'jgi', 'sanger'], "G5", "#C65B7C")

# Identical cosmetics on all three panels.
for axis in ax:
    axis.spines['right'].set_linewidth(0)
    axis.spines['top'].set_linewidth(0)

    axis.set_ylabel("")
    axis.set_ylim([0, 100])
    axis.set_xlim([1988, 2012])
    axis.set_xlabel("")

    axis.get_legend().remove()
    axis.minorticks_on()
    axis.yaxis.set_tick_params(which='minor', bottom=True)
    # axis.xaxis.tick_top()
    axis.xaxis.set_ticks([1990, 2000, 2010])
# plt.savefig('../figures/figure_2_temporal_controls_1.pdf', transparent=True, dpi = 300,  bbox_inches='tight')
plt.show()

This is the boxenplot of dates corresponding to each project in SI Figure 8.

In [None]:

# Folder names that map to a differently named project.
_PROJECT_BY_FOLDER = {
    'sequencingrampupfiles': "Human Genome Project",
    'Celera': "Human Genome Project",
    'Sequence target files': "LSAC",
    "Haplotype Map Project": "HapMap",
    "GWAS materials": "GWAS",
}


def projects(x):
    """Map a row's 'secondary' / 'folders' values to a project label.

    Rules, in priority order:
    1. 'secondary' equal to 'human sequence' -> "Human Genome Project".
    2. 'secondary' equal to "Box026-010.pdf" inside the "Large scale sequence"
       folder -> "LSAC".
    3. A folder listed in `_PROJECT_BY_FOLDER` -> its mapped project.
    4. Otherwise the folder name itself is returned unchanged.
    """
    secondary = x['secondary']
    if secondary == 'human sequence':
        return "Human Genome Project"

    folder = x['folders']
    if secondary == "Box026-010.pdf" and folder == "Large scale sequence":
        return "LSAC"

    return _PROJECT_BY_FOLDER.get(folder, folder)


In [None]:
# Plot colour for each project label returned by `projects`.
colors = {
    'ELSI': '#F04A3B',
    'GWAS': '#9FB13A',
    'HapMap': '#E1BE15',
    'LSAC': '#51AF4D',
    'ENCODE': '#095393',
    'modENCODE': '#AC5D95',
    'eMERGE': 'maroon',
    'Human Genome Project': 'gray',
    'H3Africa': '#06B4DB',
    'PAGE': '#4A4EA1',
}

In [None]:
# Attach folder paths to the dated documents and derive project / colour / year columns.
pdfs = all_documents_with_dates.loc[all_documents_with_dates.type == "pdf"]
merged = pd.concat([pdfs, all_documents_with_dates.loc[all_documents_with_dates.type != 'pdf']])
merged = pd.merge(merged, folder_structure, on = 'nodeID')


def _split_component(path, index):
    """Return the `index`-th '/'-separated component of `path`, or None if the path is too short."""
    pieces = path.split('/')
    return pieces[index] if len(pieces) > index else None


# NOTE(review): `Path` is not imported in this notebook; presumably it is pathlib.Path
# provided through `from utils_tiramisu import *` — confirm.
merged['all_folders'] = merged['path'].apply(lambda x: Path(x).parts)
# Short paths yield None instead of raising IndexError; such rows end up without a
# colour and are removed by the final filter of this cell.
merged['folders'] = merged['all_folders'].apply(lambda x: x[2] if len(x) > 2 else None)
# The original guard was `len(...) > 2`, which still raised IndexError for
# three-component paths when indexing position 3.
merged['secondary'] = merged['path'].apply(lambda x: _split_component(x, 3))
# Fifth component when present, otherwise fall back to the fourth (or None).
merged['tertiary'] = merged['path'].apply(
    lambda x: _split_component(x, 4) if len(x.split('/')) > 4 else _split_component(x, 3))
merged['project'] = merged.apply(projects, axis = 1)
merged['color'] = merged['project'].map(colors)

merged['year'] = merged.date.dt.year

# Display order of the projects in the plot below.
merged['order'] = merged['project'].map({
    "ELSI": 0,
    "Human Genome Project": 1,
    "HapMap": 3,
    "LSAC": 2,
    "ENCODE": 4,
    "modENCODE": 5,
    "eMERGE": 6,
    "PAGE": 7,
    "GWAS":8,
    "H3Africa":9})
merged = merged.sort_values('order')

# Drop rows whose project has no entry in `colors`.
merged = merged.loc[merged.color.notna()]

In [None]:


# Figure: boxenplot of document years for each project.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

# Returns a palette object; on its own this call does not change the active palette.
sns.color_palette("Set1")

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams.update({"axes.labelsize": 7,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 5,
"font.size":7})

for tick_axis in ('xtick', 'ytick'):
    for tick_kind in ('major', 'minor'):
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.size'] = 2
        matplotlib.rcParams[f'{tick_axis}.{tick_kind}.width'] = 0.5


fig, ax = plt.subplots(1, 1, figsize = (4 , 3), dpi = 300 )

# Rows with a project label; the palette is built from the colours in the
# order in which they appear in these rows.
with_project = merged.loc[merged.project.notna()]
project_palette = sns.color_palette(with_project.color.unique())

sns.boxenplot(with_project, x = 'year', y ='project', palette = project_palette, linewidth = 0.5,
                  flier_kws={'marker': 'o', 's' : 1}, dodge = False)

# No axis titles; hide top/right spines and draw thin black left/bottom spines.
ax.set_ylabel("")
ax.set_xlabel("")
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_color('black')
    ax.spines[side].set_linewidth(0.5)
ax.xaxis.label.set_color('black')
ax.tick_params(axis='x', colors='black')
ax.yaxis.label.set_color('black')
ax.tick_params(axis='y', colors='black')

plt.show()