In [None]:
import sys
sys.path.append('../src/')

from utils_tiramisu import *

from tqdm import tqdm

from pathlib import Path

# this is the same TIRAMISU_PATH as shown in start_here.ipynb
TIRAMISU_PATH = 

import pandas as pd

import pickle
import gensim
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)



In [None]:
import matplotlib

# Seaborn "white" theme with the bottom/left tick marks switched back on.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

# NOTE(review): the palette returned here is discarded. If Set1 was meant to
# become the default colour cycle, `sns.set_palette("Set1")` is the call that
# does that -- confirm intent before changing.
sns.color_palette("Set1")

# All matplotlib defaults in one place. `text.usetex` is given as a real
# boolean rather than the string 'false'.
matplotlib.rcParams.update({
    # fonts / text
    "font.family": 'Helvetica',
    "font.size": 7,
    "pdf.fonttype": 42,
    "text.usetex": False,
    "axes.unicode_minus": False,
    # label sizes
    "axes.labelsize": 7,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 5,
    # tick geometry (same for x/y and major/minor)
    "xtick.major.size": 2,
    "xtick.major.width": 0.5,
    "xtick.minor.size": 2,
    "xtick.minor.width": 0.5,
    "ytick.major.size": 2,
    "ytick.major.width": 0.5,
    "ytick.minor.size": 2,
    "ytick.minor.width": 0.5,
})

In [None]:
from openTSNE import TSNE
from openTSNE import affinity, initialization, TSNEEmbedding
from openTSNE.affinity import Affinities
import scipy as sp
from scipy import sparse

In [None]:
def mycallback(iteration, error, embedding):
    """Optimisation callback that records the progress of an embedding run.

    Appends a copy of ``embedding`` to the module-level list ``Zs``, ``error``
    to ``kls`` and ``iteration`` to ``n_iter``. Those three lists must exist
    at module level before the callback fires. Returns ``None``.
    """
    for history, value in ((Zs, embedding.copy()), (kls, error), (n_iter, iteration)):
        history.append(value)
    
    
def projects(x):
    """Map one row (anything indexable by 'secondary' and 'folders') to a project label.

    Rules, in priority order:
      1. ``secondary == 'human sequence'``           -> "Human Genome Project"
      2. folder "Large scale sequence"               -> "LSAC" only for the file
         "Box026-010.pdf", otherwise the empty string
      3. known folder names                          -> their project label
      4. anything else                               -> the folder name unchanged
    """
    secondary = x['secondary']
    if secondary == 'human sequence':
        return "Human Genome Project"

    folder = x['folders']
    if folder == 'Large scale sequence':
        return "LSAC" if secondary == "Box026-010.pdf" else ""

    folder_to_project = {
        'sequencingrampupfiles': "Human Genome Project",
        "eMERGE": "eMERGE",
        "PAGE": "PAGE",
        "ENCODE": "ENCODE",
        'modENCODE': 'modENCODE',
        'ELSI': 'ELSI',
        'Celera': "Human Genome Project",
        "H3Africa": "H3Africa",
        'Sequence target files': "LSAC",
        "Haplotype Map Project": "HapMap",
        "GWAS materials": "GWAS",
    }
    return folder_to_project.get(folder, folder)

# Plot colour for each project label returned by `projects`.
colors = {
    'ELSI': '#F04A3B',
    'GWAS': '#9FB13A',
    'HapMap': '#E1BE15',
    'LSAC': '#51AF4D',
    'ENCODE': '#095393',
    'modENCODE': '#AC5D95',
    'eMERGE': 'maroon',
    'Human Genome Project': 'black',
    'H3Africa': '#06B4DB',
    'PAGE': '#4A4EA1',
}

In [None]:
# Load the text filtered at fraction 0.1 and display how many distinct nodeIDs
# (documents) the rows group into.
together = pd.read_parquet(f'../models/stopwords_filtering/filtered_text_240320_{0.1}_removed_22070.parquet')

len(
    together[['text', 'nodeID']]
    .set_index('nodeID')
    .groupby('nodeID')
    .apply(lambda x : x.to_numpy().tolist())
    .to_dict()
)

In [None]:
# `return_from_neo4j` is not defined in this notebook; presumably it comes from
# the `utils_tiramisu` star import -- TODO confirm.
# NOTE(review): neither `all_pdfs` nor `all_ms` is read by the code visible in
# this notebook section; confirm they are still needed before keeping the queries.

# Pages split out of PDF files that were also converted to PNG: returns the
# split file's nodeID together with the original PDF's path and extension.
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Word / PowerPoint files contained directly in a folder.
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")


# For every stop-word-removal fraction: train Doc2Vec (min_count=2) on the
# filtered corpus, build the document-by-document similarity map, embed it with
# openTSNE and save every artifact (model, map, affinities, initialization,
# embedding, PNG) under ../models/stopwords_filtering/.
for fraction in tqdm([0, 0.01, 0.05, 0.1, 0.2, 0.5]):

    # this cached file is created in `doc2Vec_remove_stopwords.ipynb`
    together = pd.read_parquet(f'../models/stopwords_filtering/filtered_text_240320_{fraction}_removed_22070.parquet')

    # nodeID -> list of that document's rows; element 0 of each row is the text.
    to_put_into_gensim = together[['text', 'nodeID']].set_index('nodeID').groupby('nodeID').apply(lambda x : x.to_numpy().tolist()).to_dict()

    # One TaggedDocument per nodeID: the whitespace-split tokens of all of its
    # text rows, tagged with the nodeID.
    corpus_list = []
    for document, rows in tqdm(to_put_into_gensim.items(), total = len(to_put_into_gensim)):
        tokens = " ".join(row[0] for row in rows).split()
        corpus_list.append(gensim.models.doc2vec.TaggedDocument(tokens, [document]))

    model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=50, workers = 10)
    model.build_vocab(corpus_list)
    model.train(corpus_list, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(f'../models/stopwords_filtering/doc2vec-model-filtered-text-{fraction}-removed-on-010924-corpus_22070')

    # Similarity map: one column per document, holding the similarity of its
    # re-inferred vector to every trained document vector (rows are labelled
    # with the matched tags).
    dfs_to_concat = []
    for tagged in corpus_list:
        inferred_vector = model.infer_vector(tagged.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(corpus_list))
        dfs_to_concat.append(pd.DataFrame([sim for _, sim in sims],
                                          columns = [tagged.tags[0]],
                                          index = [docid for docid, _ in sims]))
    maps = pd.concat(dfs_to_concat, axis = 1)
    with open(f'../models/stopwords_filtering/map-filtered-text-{fraction}-removed-on-010924-corpus_22070.pkl', 'wb') as f:
        pickle.dump(maps, f)

    # Sort both axes, then put the inferred documents on the rows and expose
    # their tag as a `path` column.
    maps = maps.sort_index(axis = 0)
    maps = maps.sort_index(axis = 1)
    maps = maps.T
    maps= maps.rename_axis('path').reset_index()

    maps['all_folders'] = maps['path'].apply(lambda x: Path(x).parts)
    maps['folders'] = maps['all_folders'].apply(lambda x: x[2])
    # Put the similarity columns in row order, then split off the label columns
    # so that `maps` is purely numeric again.
    temp = maps.path.tolist() + ['folders', 'path', 'all_folders']
    maps = maps[temp]
    folders = maps.pop('folders')
    paths = maps.pop('path')
    all_folders= maps.pop('all_folders')
    # Symmetrize by mirroring the upper triangle (diagonal counted once).
    maps_2 = pd.DataFrame(np.triu(maps, k = 1) + np.triu(maps).T, index = maps.index, columns = maps.columns)

    A = affinity.Uniform(
        maps_2.to_numpy(),
        k_neighbors=10,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    sp.sparse.save_npz(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_affinities_P_22070", A.P)
    I = initialization.pca(maps_2.to_numpy(), random_state=42)
    np.save(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_initialization_22070", I)
    # Module-level histories filled by `mycallback`; reset for every fraction.
    Zs = []
    kls = []
    n_iter = []
    E = TSNEEmbedding(I, A, n_jobs=-1, random_state=42, verbose=True)

    # early exaggeration
    E = E.optimize(n_iter=125, exaggeration=12, momentum=0.5, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    # exaggeration annealing: 125 single iterations stepping linearly from 12
    # down to 1, recording a snapshot on every 50th step
    exs = np.linspace(12,1,125)
    for step, exaggeration in enumerate(exs, start=1):
        record = {'callbacks': mycallback, 'callbacks_every_iters': 1} if step % 50 == 0 else {}
        E = E.optimize(n_iter=1, exaggeration=exaggeration, momentum=0.8, n_jobs=-1, verbose=True, **record)

    # final optimization without exaggeration
    E = E.optimize(n_iter=2000, exaggeration=1, momentum=0.8, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    tsne=np.array(E)

    #save
    np.save(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne_22070', tsne)

    merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders.reset_index(drop = True), left_index = True, right_index = True),
                  paths.reset_index(drop = True), left_index = True, right_index = True)

    # Path components with any "---..." suffix removed, as the later plotting
    # cells of this notebook do; without it `projects` cannot match
    # "Box026-010.pdf". Components that do not exist yield None instead of
    # raising IndexError (the original guard tested `> 2` before reading [3]).
    path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
    merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
    merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

    merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

    # Projects without an entry in `colors` are drawn in grey.
    merged['color'] = merged['projects'].map(colors)
    merged['color'] = merged['color'].fillna("#B8BABC")

    fig, ax = plt.subplots(1, 1, figsize = (20 , 20), dpi = 300 )
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    # Grey points are plotted first, coloured projects second.
    # NOTE(review): `size=50` is seaborn's semantic `size` argument, not
    # matplotlib's marker-area `s`; confirm this is the intended marker sizing.
    sns.scatterplot(data = merged.loc[merged.color == "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth =0.5, size =50)
    sns.scatterplot(data = merged.loc[merged.color != "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    # The figure is saved without a legend.
    ax.legend().remove()
    sns.despine()
    # Hide every tick mark and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

    plt.savefig(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne_22070.png', transparent = True, bbox_inches = 'tight')
    # Release the 20x20 in, 300 dpi figure before the next fraction.
    plt.close(fig)

In [None]:
# Same pipeline as the min_count=2 run, but Doc2Vec is trained with
# min_count=5 and every artifact name carries the "-min-count-5" marker.
for fraction in tqdm([0, 0.01, 0.05, 0.1, 0.2, 0.5]):

    together = pd.read_parquet(f'../models/stopwords_filtering/filtered_text_240320_{fraction}_removed_22070.parquet')

    # nodeID -> list of that document's rows; element 0 of each row is the text.
    to_put_into_gensim = together[['text', 'nodeID']].set_index('nodeID').groupby('nodeID').apply(lambda x : x.to_numpy().tolist()).to_dict()

    # One TaggedDocument per nodeID: the whitespace-split tokens of all of its
    # text rows, tagged with the nodeID.
    corpus_list = []
    for document, rows in tqdm(to_put_into_gensim.items(), total = len(to_put_into_gensim)):
        tokens = " ".join(row[0] for row in rows).split()
        corpus_list.append(gensim.models.doc2vec.TaggedDocument(tokens, [document]))

    model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=5, epochs=50, workers = 10)
    model.build_vocab(corpus_list)
    model.train(corpus_list, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(f'../models/stopwords_filtering/doc2vec-model-filtered-text-{fraction}-removed-on-010924-corpus-min-count-5_22070')

    # Similarity map: one column per document, holding the similarity of its
    # re-inferred vector to every trained document vector (rows are labelled
    # with the matched tags).
    dfs_to_concat = []
    for tagged in corpus_list:
        inferred_vector = model.infer_vector(tagged.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(corpus_list))
        dfs_to_concat.append(pd.DataFrame([sim for _, sim in sims],
                                          columns = [tagged.tags[0]],
                                          index = [docid for docid, _ in sims]))
    maps = pd.concat(dfs_to_concat, axis = 1)
    with open(f'../models/stopwords_filtering/map-filtered-text-{fraction}-removed-on-010924-corpus-min-count-5_22070.pkl', 'wb') as f:
        pickle.dump(maps, f)

    # Sort both axes, then put the inferred documents on the rows and expose
    # their tag as a `path` column.
    maps = maps.sort_index(axis = 0)
    maps = maps.sort_index(axis = 1)
    maps = maps.T
    maps= maps.rename_axis('path').reset_index()

    maps['all_folders'] = maps['path'].apply(lambda x: Path(x).parts)
    maps['folders'] = maps['all_folders'].apply(lambda x: x[2])
    # Put the similarity columns in row order, then split off the label columns
    # so that `maps` is purely numeric again.
    temp = maps.path.tolist() + ['folders', 'path', 'all_folders']
    maps = maps[temp]
    folders = maps.pop('folders')
    paths = maps.pop('path')
    all_folders= maps.pop('all_folders')
    # Symmetrize by mirroring the upper triangle (diagonal counted once).
    maps_2 = pd.DataFrame(np.triu(maps, k = 1) + np.triu(maps).T, index = maps.index, columns = maps.columns)

    A = affinity.Uniform(
        maps_2.to_numpy(),
        k_neighbors=10,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    sp.sparse.save_npz(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_affinities_P-min-count-5_22070", A.P)
    I = initialization.pca(maps_2.to_numpy(), random_state=42)
    np.save(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_initialization-min-count-5_22070", I)
    # Module-level histories filled by `mycallback`; reset for every fraction.
    Zs = []
    kls = []
    n_iter = []
    E = TSNEEmbedding(I, A, n_jobs=-1, random_state=42, verbose=True)

    # early exaggeration
    E = E.optimize(n_iter=125, exaggeration=12, momentum=0.5, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    # exaggeration annealing: 125 single iterations stepping linearly from 12
    # down to 1, recording a snapshot on every 50th step
    exs = np.linspace(12,1,125)
    for step, exaggeration in enumerate(exs, start=1):
        record = {'callbacks': mycallback, 'callbacks_every_iters': 1} if step % 50 == 0 else {}
        E = E.optimize(n_iter=1, exaggeration=exaggeration, momentum=0.8, n_jobs=-1, verbose=True, **record)

    # final optimization without exaggeration
    E = E.optimize(n_iter=2000, exaggeration=1, momentum=0.8, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    tsne=np.array(E)

    #save
    np.save(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne-min-count-5_22070', tsne)

    merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders.reset_index(drop = True), left_index = True, right_index = True),
                  paths.reset_index(drop = True), left_index = True, right_index = True)

    # Path components with any "---..." suffix removed, as the later plotting
    # cells of this notebook do; without it `projects` cannot match
    # "Box026-010.pdf". Components that do not exist yield None instead of
    # raising IndexError (the original guard tested `> 2` before reading [3]).
    path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
    merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
    merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

    merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

    # Projects without an entry in `colors` are drawn in grey.
    merged['color'] = merged['projects'].map(colors)
    merged['color'] = merged['color'].fillna("#B8BABC")

    fig, ax = plt.subplots(1, 1, figsize = (20 , 20), dpi = 300 )
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    # Grey points are plotted first, coloured projects second.
    # NOTE(review): `size=50` is seaborn's semantic `size` argument, not
    # matplotlib's marker-area `s`; confirm this is the intended marker sizing.
    sns.scatterplot(data = merged.loc[merged.color == "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth =0.5, size =50)
    sns.scatterplot(data = merged.loc[merged.color != "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    # The figure is saved without a legend.
    ax.legend().remove()
    sns.despine()
    # Hide every tick mark and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

    plt.savefig(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne-min-count-5_22070.png', transparent = True, bbox_inches = 'tight')
    # Release the 20x20 in, 300 dpi figure before the next fraction.
    plt.close(fig)

In [None]:
# Re-run of the min_count=5 pipeline for fraction 0.5 only. The Doc2Vec model
# is loaded from disk instead of being retrained; the similarity map,
# affinities, initialization, embedding and PNG are recomputed and overwrite
# the files of the same name.
for fraction in tqdm([0.5]):

    together = pd.read_parquet(f'../models/stopwords_filtering/filtered_text_240320_{fraction}_removed_22070.parquet')

    # nodeID -> list of that document's rows; element 0 of each row is the text.
    to_put_into_gensim = together[['text', 'nodeID']].set_index('nodeID').groupby('nodeID').apply(lambda x : x.to_numpy().tolist()).to_dict()

    # One TaggedDocument per nodeID: the whitespace-split tokens of all of its
    # text rows, tagged with the nodeID.
    corpus_list = []
    for document, rows in tqdm(to_put_into_gensim.items(), total = len(to_put_into_gensim)):
        tokens = " ".join(row[0] for row in rows).split()
        corpus_list.append(gensim.models.doc2vec.TaggedDocument(tokens, [document]))

    # Load the previously saved model rather than training a new one.
    model = gensim.models.doc2vec.Doc2Vec.load(f'../models/stopwords_filtering/doc2vec-model-filtered-text-{fraction}-removed-on-010924-corpus-min-count-5_22070')

    # Similarity map: one column per document, holding the similarity of its
    # re-inferred vector to every trained document vector (rows are labelled
    # with the matched tags).
    dfs_to_concat = []
    for tagged in corpus_list:
        inferred_vector = model.infer_vector(tagged.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(corpus_list))
        dfs_to_concat.append(pd.DataFrame([sim for _, sim in sims],
                                          columns = [tagged.tags[0]],
                                          index = [docid for docid, _ in sims]))
    maps = pd.concat(dfs_to_concat, axis = 1)
    with open(f'../models/stopwords_filtering/map-filtered-text-{fraction}-removed-on-010924-corpus-min-count-5_22070.pkl', 'wb') as f:
        pickle.dump(maps, f)

    # Sort both axes, then put the inferred documents on the rows and expose
    # their tag as a `path` column.
    maps = maps.sort_index(axis = 0)
    maps = maps.sort_index(axis = 1)
    maps = maps.T
    maps= maps.rename_axis('path').reset_index()

    maps['all_folders'] = maps['path'].apply(lambda x: Path(x).parts)
    maps['folders'] = maps['all_folders'].apply(lambda x: x[2])
    # Put the similarity columns in row order, then split off the label columns
    # so that `maps` is purely numeric again.
    temp = maps.path.tolist() + ['folders', 'path', 'all_folders']
    maps = maps[temp]
    folders = maps.pop('folders')
    paths = maps.pop('path')
    all_folders= maps.pop('all_folders')
    # Symmetrize by mirroring the upper triangle (diagonal counted once).
    maps_2 = pd.DataFrame(np.triu(maps, k = 1) + np.triu(maps).T, index = maps.index, columns = maps.columns)

    A = affinity.Uniform(
        maps_2.to_numpy(),
        k_neighbors=10,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    sp.sparse.save_npz(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_affinities_P-min-count-5_22070", A.P)
    I = initialization.pca(maps_2.to_numpy(), random_state=42)
    np.save(f"../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_initialization-min-count-5_22070", I)
    # Module-level histories filled by `mycallback`; reset for every fraction.
    Zs = []
    kls = []
    n_iter = []
    E = TSNEEmbedding(I, A, n_jobs=-1, random_state=42, verbose=True)

    # early exaggeration
    E = E.optimize(n_iter=125, exaggeration=12, momentum=0.5, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    # exaggeration annealing: 125 single iterations stepping linearly from 12
    # down to 1, recording a snapshot on every 50th step
    exs = np.linspace(12,1,125)
    for step, exaggeration in enumerate(exs, start=1):
        record = {'callbacks': mycallback, 'callbacks_every_iters': 1} if step % 50 == 0 else {}
        E = E.optimize(n_iter=1, exaggeration=exaggeration, momentum=0.8, n_jobs=-1, verbose=True, **record)

    # final optimization without exaggeration
    E = E.optimize(n_iter=2000, exaggeration=1, momentum=0.8, n_jobs=-1, verbose=True, callbacks=mycallback, callbacks_every_iters=50)

    tsne=np.array(E)

    #save
    np.save(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne-min-count-5_22070', tsne)

    merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders.reset_index(drop = True), left_index = True, right_index = True),
                  paths.reset_index(drop = True), left_index = True, right_index = True)

    # Path components with any "---..." suffix removed. Components that do not
    # exist yield None instead of raising IndexError (the original guard tested
    # `> 2` before reading [3], and the tertiary fallback was unguarded).
    path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
    merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
    merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

    merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

    # Projects without an entry in `colors` are drawn in grey.
    merged['color'] = merged['projects'].map(colors)
    merged['color'] = merged['color'].fillna("#B8BABC")

    fig, ax = plt.subplots(1, 1, figsize = (20 , 20), dpi = 300 )
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    # Grey points are plotted first, coloured projects second.
    # NOTE(review): `size=50` is seaborn's semantic `size` argument, not
    # matplotlib's marker-area `s`; confirm this is the intended marker sizing.
    sns.scatterplot(data = merged.loc[merged.color == "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth =0.5, size =50)
    sns.scatterplot(data = merged.loc[merged.color != "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    # The figure is saved without a legend.
    ax.legend().remove()
    sns.despine()
    # Hide every tick mark and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

    plt.savefig(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne-min-count-5_22070.png', transparent = True, bbox_inches = 'tight')
    # Release the 20x20 in, 300 dpi figure once it has been written.
    plt.close(fig)

In [None]:
def plot_tsne(fraction, ax, variant=""):
    """Draw the saved t-SNE embedding for one removal fraction onto ``ax``.

    Parameters
    ----------
    fraction
        Stop-word-removal fraction; interpolated verbatim into the file names.
    ax
        Matplotlib axes to draw on.
    variant
        Marker inserted into the artifact file names. The default ``""`` reads
        the files of the min_count=2 run; ``"-min-count-5"`` reads the files of
        the min_count=5 run.

    Reads the pickled similarity map (only for its document labels) and the
    saved ``.npy`` embedding from ``../models/stopwords_filtering/``. Returns
    ``None``; the result is the drawing on ``ax``.
    """
    # pickle.load can execute arbitrary code: only open maps written by this notebook.
    with open(f'../models/stopwords_filtering/map-filtered-text-{fraction}-removed-on-010924-corpus{variant}_22070.pkl', 'rb') as f:
        maps = pickle.load(f)

    # Only the labels are needed. The cell that produced the embedding sorted
    # the map's labels before embedding, so row i of the saved array belongs to
    # the i-th label in sorted order.
    paths = pd.Series(sorted(maps.columns), name='path')
    folders = paths.apply(lambda x: Path(x).parts[2]).rename('folders')

    tsne = np.load(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne{variant}_22070.npy')
    merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders, left_index = True, right_index = True),
                  paths, left_index = True, right_index = True)

    # Path components with any "---..." suffix removed. Components that do not
    # exist yield None instead of raising IndexError (the original guard tested
    # `> 2` before reading [3], and the tertiary fallback was unguarded).
    path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
    merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
    merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

    merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

    # Projects without an entry in `colors` are drawn in grey.
    merged['color'] = merged['projects'].map(colors)
    merged['color'] = merged['color'].fillna("#B8BABC")
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    # Grey points are plotted first, coloured projects second.
    # NOTE(review): `size=50` is seaborn's semantic `size` argument, not
    # matplotlib's marker-area `s`; confirm this is the intended marker sizing.
    sns.scatterplot(data = merged.loc[merged.color == "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth =0.5, size =50)
    sns.scatterplot(data = merged.loc[merged.color != "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    # Panels are shown without a legend.
    ax.legend().remove()
    sns.despine()

    # Hide every tick mark and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)
    

In [None]:
# Silence everything below ERROR. `logging.basicConfig` does nothing once the
# root logger already has handlers (the imports cell configures it at DEBUG),
# so the level is also set on the root logger directly.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)

In [None]:
%%time 
# 2 x 3 grid of panels, one per removal fraction, keyed 0..5 in reading order.
layout = [[0, 1, 2], [3, 4, 5]]
fractions = [0, 0.01, 0.05, 0.1, 0.2, 0.5]

fig, axes = plt.subplot_mosaic(layout, figsize=(35,35))
for panel_key, fraction in enumerate(fractions):
    panel = axes[panel_key]
    plot_tsne(fraction, panel)
    panel.set_title(f"{int(fraction * 100)}% of tokens removed", fontsize = 40)
    panel.set_aspect('equal')
fig.tight_layout()
plt.show()

In [None]:
def plot_tsne(fraction, ax, variant="-min-count-5"):
    """Draw the saved t-SNE embedding for one removal fraction onto ``ax``.

    Parameters
    ----------
    fraction
        Stop-word-removal fraction; interpolated verbatim into the file names.
    ax
        Matplotlib axes to draw on.
    variant
        Marker inserted into the artifact file names. The default
        ``"-min-count-5"`` reads the files of the min_count=5 run; ``""`` reads
        the files of the min_count=2 run.

    Reads the pickled similarity map (only for its document labels) and the
    saved ``.npy`` embedding from ``../models/stopwords_filtering/``. Returns
    ``None``; the result is the drawing on ``ax``.
    """
    # pickle.load can execute arbitrary code: only open maps written by this notebook.
    with open(f'../models/stopwords_filtering/map-filtered-text-{fraction}-removed-on-010924-corpus{variant}_22070.pkl', 'rb') as f:
        maps = pickle.load(f)

    # Only the labels are needed. The cell that produced the embedding sorted
    # the map's labels before embedding, so row i of the saved array belongs to
    # the i-th label in sorted order.
    paths = pd.Series(sorted(maps.columns), name='path')
    folders = paths.apply(lambda x: Path(x).parts[2]).rename('folders')

    tsne = np.load(f'../models/stopwords_filtering/filtered-text-{fraction}-removed-on-010924-corpus_tsne{variant}_22070.npy')
    merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders, left_index = True, right_index = True),
                  paths, left_index = True, right_index = True)

    # Path components with any "---..." suffix removed. Components that do not
    # exist yield None instead of raising IndexError (the original guard tested
    # `> 2` before reading [3], and the tertiary fallback was unguarded).
    path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
    merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
    merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

    merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

    # Projects without an entry in `colors` are drawn in grey.
    merged['color'] = merged['projects'].map(colors)
    merged['color'] = merged['color'].fillna("#B8BABC")
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    # Grey points are plotted first, coloured projects second.
    # NOTE(review): `size=50` is seaborn's semantic `size` argument, not
    # matplotlib's marker-area `s`; confirm this is the intended marker sizing.
    sns.scatterplot(data = merged.loc[merged.color == "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth =0.5, size =50)
    sns.scatterplot(data = merged.loc[merged.color != "#B8BABC"], x = 'dim1', y = 'dim2', \
                    hue = 'projects', palette = merged.set_index('projects')['color'].to_dict(),\
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    # Panels are shown without a legend.
    ax.legend().remove()
    sns.despine()

    # Hide every tick mark and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)
    

In [None]:
%%time 
# 2 x 3 grid of panels, one per removal fraction, keyed 0..5 in reading order.
layout = [[0, 1, 2], [3, 4, 5]]
fractions = [0, 0.01, 0.05, 0.1, 0.2, 0.5]

fig, axes = plt.subplot_mosaic(layout, figsize=(35,35))
for panel_key, fraction in enumerate(fractions):
    panel = axes[panel_key]
    plot_tsne(fraction, panel)
    panel.set_title(f"{int(fraction * 100)}% of tokens removed", fontsize = 40)
    panel.set_aspect('equal')
fig.tight_layout()
plt.show()

In [None]:
# Rebuild the `merged` plotting table for fraction 0 (min_count=2 run) from the
# saved similarity map and embedding.
# pickle.load can execute arbitrary code: only open maps written by this notebook.
with open(f'../models/stopwords_filtering/map-filtered-text-{0}-removed-on-010924-corpus_22070.pkl', 'rb') as f:
    maps = pickle.load(f)
# Sort both axes, then put the inferred documents on the rows and expose their
# tag as a `path` column.
maps = maps.sort_index(axis = 0)
maps = maps.sort_index(axis = 1)
maps = maps.T
maps= maps.rename_axis('path').reset_index()

maps['all_folders'] = maps['path'].apply(lambda x: Path(x).parts)
maps['folders'] = maps['all_folders'].apply(lambda x: x[2])
# Put the similarity columns in row order, then split off the label columns so
# that `maps` is purely numeric again.
temp = maps.path.tolist()
temp.append('folders')
temp.append('path')
temp.append('all_folders')
maps = maps[temp]
folders = maps.pop('folders')
paths = maps.pop('path')
all_folders= maps.pop('all_folders')


tsne = np.load(f'../models/stopwords_filtering/filtered-text-{0}-removed-on-010924-corpus_tsne_22070.npy')
merged = pd.merge(pd.merge(pd.DataFrame(tsne, columns = ["dim1", "dim2"]), folders.reset_index(drop = True), left_index = True, right_index = True),
              paths.reset_index(drop = True), left_index = True, right_index = True)

# Path components with any "---..." suffix removed. Components that do not
# exist yield None instead of raising IndexError (the original guard tested
# `> 2` before reading [3], and the tertiary fallback was unguarded).
path_parts = merged['path'].apply(lambda x: [part.split('---')[0] for part in x.split('/')])
merged['secondary'] = path_parts.apply(lambda p: p[3] if len(p) > 3 else None)
merged['tertiary'] = path_parts.apply(lambda p: p[4] if len(p) > 4 else (p[3] if len(p) > 3 else None))

merged['projects'] = merged.apply(lambda x: projects(x), axis = 1)

# Projects without an entry in `colors` are drawn in grey.
merged['color'] = merged['projects'].map(colors)
merged['color'] = merged['color'].fillna("#B8BABC")

In [None]:
import matplotlib

# Plot styling: white seaborn theme with visible tick marks, Helvetica text,
# editable (TrueType) fonts in PDFs, small labels and thin ticks.
sns.set_style('white', rc={'xtick.bottom': True, 'ytick.left': True})

sns.color_palette("Set1")

matplotlib.rcParams.update({
    'font.family': 'Helvetica',
    'pdf.fonttype': 42,
    'text.usetex': 'false',
    'axes.unicode_minus': False,
    "axes.labelsize": 7,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 5,
    "font.size": 7,
    'xtick.major.size': 2,
    'xtick.major.width': 0.5,
    'xtick.minor.size': 2,
    'xtick.minor.width': 0.5,
    'ytick.major.size': 2,
    'ytick.major.width': 0.5,
    'ytick.minor.size': 2,
    'ytick.minor.width': 0.5,
})


fig, ax = plt.subplots(1, 1, figsize = (20 , 20), dpi = 300 )
ax.spines['left'].set_linewidth(0)
ax.spines['bottom'].set_linewidth(0)

ax.set_xticklabels([])
ax.set_yticklabels([])

# Grey (unassigned) points are drawn first so the coloured project points end
# up on top of them.
# NOTE(review): `size` is seaborn's semantic grouping argument, not the marker
# area (`s`); the constant 50 gives every point the same mapped size.
project_palette = merged.set_index('projects')['color'].to_dict()
is_background = merged.color == "#B8BABC"
for subset in (merged.loc[is_background], merged.loc[~is_background]):
    sns.scatterplot(data = subset, x = 'dim1', y = 'dim2',
                    hue = 'projects', palette = project_palette,
                    alpha = 1, ax = ax, linewidth = 0.5, size = 50)

plt.legend(title='Folder structure', loc='upper right',
           bbox_to_anchor=(1.15, 1.05), frameon = False, ncol = 1)
ax.set_ylabel(None)
ax.set_xlabel(None)
# The legend is removed again before saving, so the exported image has none.
ax.legend().remove()
sns.despine()
# Hide every tick mark (major and minor) and the x tick labels.
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.savefig('../cache/tsne_map_legend_240320.png', transparent = True, bbox_inches = 'tight')

In [None]:
# Names that each get their own highlight panel in the per-project figure.
# "ZLSAC" is special-cased there to select rows whose project is "LSAC".
project_folders = [
    "1000 Genomes",
    "American Gene-Environment Study (AGES)",
    "Human Genome Project",
    "ELSI",
    "ENCODE",
    "FlyBase Materials",
    "GAIN files for Jim",
    "GGR",
    "GWAS",
    "GenPhen",
    "Genes & Environment",
    "H3Africa",
    "HapMap",
    "Healthy People",
    "PAGE",
    "ZLSAC",
    "eMERGE",
    "modENCODE",
]
len(project_folders)

In [None]:
# Create layout: a 6 x 3 grid of panels numbered 0..17, one per project folder
layout = [[3 * row + col for col in range(3)] for row in range(6)]

fig, axes = plt.subplot_mosaic(layout, figsize=(50,50))

# Panel headings for the names that are shown with a longer description;
# any name not listed here is used as its own heading.
display_names = {
    "ZLSAC": "Sequence target files (LSAC)",
    "LSAC": "Sequence target files (LSAC)",
    "GAIN files for Jim": "GAIN files for [redacted] (Genetic Association Information Network)",
    "GGR": "GGR (Genomics of Gene Regulation)",
    "ELSI": "ELSI (Ethical, legal, and social implications research)",
    "ENCODE": "ENCODE (The Encyclopedia of DNA Elements)",
    "modENCODE": "modENCODE (model organisms ENCODE)",
    "GWAS": "GWAS materials (NHGRI-EBI GWAS Catalog)",
    "HapMap": "Haplotype Map Project (International HapMap Project)",
    "PAGE": "PAGE (Population Architecture Using Genomics and Epidemiology)",
}

for ax, project in zip(axes.values(), sorted(project_folders, key= str.casefold)):
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # Choose the rows to highlight: the Human Genome Project panel selects on
    # the "black" colour, "ZLSAC" selects project "LSAC", everything else
    # selects on the project name itself.
    if project == "Human Genome Project":
        highlighted = merged.color == "black"
    elif project == "ZLSAC":
        highlighted = merged.projects == "LSAC"
    else:
        highlighted = merged.projects == project

    # Grey background first, then the highlighted rows in red on top.
    # NOTE(review): `size` is seaborn's semantic grouping argument, not the
    # marker area (`s`).
    for subset, colour in ((merged.loc[~highlighted], "#d3d3d3"),
                           (merged.loc[highlighted], "red")):
        sns.scatterplot(data = subset, x = 'dim1', y = 'dim2',
                        color = colour,
                        alpha = 1, ax = ax, linewidth = 0.5, size = 2)

    heading = display_names.get(project, project)
    ax.set_title(f"{heading}\nN={merged.loc[highlighted].shape[0]}", fontsize = 40)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.set_aspect('equal')
    ax.legend().remove()
    sns.despine()
    # Hide every tick mark (major and minor) and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

fig.tight_layout()
plt.savefig("../figures/all_project_folders_tsne_240318.pdf", bbox_inches = "tight", dpi = 300)

In [None]:
# Display the list of project folder names for reference.
project_folders

In [None]:
# Names that each get their own highlight panel in the non-project figure.
# "Celera" is matched against `merged.folders` there; the rest are matched
# against `merged.projects`.
non_project_folders = [
    '1993 to 2003',
    '2003 to present',
    'Beijing Genomics Institute (BGI)',
    'Bioinformatics History',
    'Bioinformatics Materials',
    'Computational Analysis of non-coding DNA',
    'Epigenetics Roadmap',
    'Finishing Manual',
    'HGP History Summer 2011',
    'Libraries',
    'Mouse SNP Meeting Materials',
    'NHGRI Key early history',
    'QA, Accuracy Discussions, Pilots MG',
    'Yeast',
    'comparative and organismal sequencing',
    'files for image analysis',
    'human genome reference consortium',
    'model organism databases resource informatics',
    'Celera',
]

len(non_project_folders)


In [None]:
# Create layout: 19 panels on a 5 x 4 grid; the final slot stays empty
layout = [
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
    [12, 13, 14, 15],
    [16, 17, 18, '.'],
]

fig, axes = plt.subplot_mosaic(layout, figsize=(50,50))

for ax, project in zip(axes.values(), sorted(non_project_folders, key = str.lower)):
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # "Celera" is matched on the folder column; every other name is matched
    # on the project column.
    label_column = merged.folders if project == "Celera" else merged.projects
    highlighted = label_column == project

    # Grey background first, then the highlighted rows in red on top.
    # NOTE(review): `size` is seaborn's semantic grouping argument, not the
    # marker area (`s`).
    for subset, colour in ((merged.loc[~highlighted], "#d3d3d3"),
                           (merged.loc[highlighted], "red")):
        sns.scatterplot(data = subset, x = 'dim1', y = 'dim2',
                        color = colour,
                        alpha = 1, ax = ax, linewidth = 0.5, size = 2)

    ax.set_title(project + "\nN=" + str(merged.loc[highlighted].shape[0]), fontsize = 40)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.set_aspect('equal')
    ax.legend().remove()

    sns.despine()
    # Hide every tick mark (major and minor) and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

fig.tight_layout()
plt.savefig("../figures/all_nonproject_folders_tsne_240318.pdf", bbox_inches = "tight", dpi = 300)




In [None]:
# File extension: the text after the last '.' in the part of the path that
# precedes the first '---'.
path_before_marker = merged['path'].str.split('---', n = 1).str[0]
merged['extension'] = path_before_marker.str.rsplit('.', n = 1).str[-1]

In [None]:
# Rows that are coloured black, belong to one of project_folders, or have
# project "LSAC", minus the number of rows in the "Celera" folder.
int(
    (
        merged.color.eq('black')
        | merged.projects.isin(project_folders)
        | merged.projects.eq("LSAC")
    ).sum()
) - int(merged.folders.eq("Celera").sum())

In [None]:
# Rows that match none of the categories: not black, not in project_folders,
# not project "LSAC", and not in one of non_project_folders.
merged.loc[
    merged.color.ne('black')
    & ~merged.projects.isin(project_folders)
    & merged.projects.ne("LSAC")
    & ~merged.folders.isin(non_project_folders)
]

In [None]:
# (rows, columns) of the rows whose `folders` value is one of non_project_folders.
merged.loc[merged.folders.isin(non_project_folders)].shape

In [None]:
# (rows, columns) of the full merged table.
merged.shape

In [None]:
# Scratch arithmetic on hard-coded counts; the sum is 22070, the corpus size
# that appears in the model file names.
# NOTE(review): presumably the project / non-project row counts from the cells
# above -- confirm, and consider computing them from `merged` instead.
15117 + 6953

In [None]:
# Share of the 22070-document corpus represented by the hard-coded count 15117.
# NOTE(review): presumably the project-folder share -- confirm against the cells above.
15117/22070

In [None]:
# Create layout: five panels on a 2 x 3 grid; the last slot stays empty
layout = [
    [0, 1, 2],
    [3, 4, '.'],
]

fig, axes = plt.subplot_mosaic(layout, figsize=(35,35))

# One panel per file extension, in order of first appearance; zip stops at
# whichever of the axes or the extensions runs out first.
for ax, extension in zip(axes.values(), merged.extension.unique()):
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])

    has_extension = merged.extension == extension

    # Grey background first, then the rows with this extension in red on top.
    # NOTE(review): `size` is seaborn's semantic grouping argument, not the
    # marker area (`s`).
    for subset, colour in ((merged.loc[~has_extension], "#d3d3d3"),
                           (merged.loc[has_extension], "red")):
        sns.scatterplot(data = subset, x = 'dim1', y = 'dim2',
                        color = colour,
                        alpha = 1, ax = ax, linewidth = 0.5, size = 2)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.set_aspect('equal')
    ax.legend().remove()
    n_documents = merged.loc[has_extension].shape[0]
    ax.set_title(f"{extension}\nN={n_documents}", fontsize = 40)
    sns.despine()
    # Hide every tick mark (major and minor) and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

fig.tight_layout()
plt.show()




In [None]:
# Scratch arithmetic on five hard-coded counts; the sum is 22070, the corpus
# size that appears in the model file names.
# NOTE(review): presumably the per-extension N values from the panels above -- confirm.
1010 + 310 + 20310 + 364 + 76

In [None]:
# Read only the two columns needed for the date join, instead of loading every
# column of the parquet file and discarding the rest afterwards.
all_documents_with_dates = pd.read_parquet(
    '../cache/pdfs_word_excel_powerpoint_031924_with_dates_for_all.parquet',
    columns = ['nodeID', 'date'],
)

In [None]:
# The node identifier is the text after the last '---' in each path; it is the
# key used to attach a date to every row.
merged['nodeID'] = merged['path'].str.rsplit('---', n = 1).str[-1]
# Left join keeps every row of `merged`, including those without a date.
merged_with_dates = merged.merge(all_documents_with_dates, how = 'left', on = 'nodeID')
# Floor each date's year to the start of its decade (e.g. 1997 -> 1990).
merged_with_dates['decade'] = merged_with_dates['date'].apply(lambda when: 10 * (when.year // 10))

In [None]:
# Panel order: decades in ascending order, followed by the undated group.
# Sorting is done on the non-null values only, because sorted() gives an
# undefined order as soon as NaN is among the values.
decade_values = merged_with_dates.decade
decades = sorted(decade_values.dropna().unique())
if decade_values.isna().any():
    decades.append(np.nan)

# Create layout: two columns and as many rows as needed, so that no group is
# silently dropped by zip() when there are more groups than panels. Unused
# slots in the last row are left empty ('.'). With five groups this is the
# same grid as the former hard-coded [[0, 1], [2, 3], [4, '.']].
n_columns = 2
layout = [
    [slot if slot < len(decades) else '.' for slot in range(first, first + n_columns)]
    for first in range(0, len(decades), n_columns)
]

fig, axes = plt.subplot_mosaic(layout, figsize=(40,40))

for ax, decade in zip(axes.values(), decades):
    ax.spines['left'].set_linewidth(0)
    ax.spines['bottom'].set_linewidth(0)

    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # Rows to highlight: the undated rows for the NaN group, otherwise the
    # rows whose decade equals this panel's decade.
    if pd.isnull(decade):
        highlighted = merged_with_dates.decade.isna()
        heading = "No date inferred"
    else:
        highlighted = merged_with_dates.decade == decade
        heading = f"{int(decade)}s"
    ax.set_title(f"{heading}\nN={merged_with_dates.loc[highlighted].shape[0]}", fontsize = 40)

    # Grey background first, then the highlighted rows in red on top.
    # NOTE(review): `size` is seaborn's semantic grouping argument, not the
    # marker area (`s`).
    for subset, colour in ((merged_with_dates.loc[~highlighted], "#d3d3d3"),
                           (merged_with_dates.loc[highlighted], "red")):
        sns.scatterplot(data = subset, x = 'dim1', y = 'dim2',
                        color = colour,
                        alpha = 1, ax = ax, linewidth = 0.5, size = 2)

    ax.set_ylabel(None)
    ax.set_xlabel(None)
    ax.set_aspect('equal')
    ax.legend().remove()

    sns.despine()
    # Hide every tick mark (major and minor) and the x tick labels.
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False)

fig.tight_layout()
plt.show()




In [None]:
# Scratch arithmetic on five hard-coded counts.
# NOTE(review): presumably the per-decade N values from the panels above. The
# sum is 22066, four short of the 22070 corpus size -- check whether a group
# was left out of the figure.
724 + 4591 + 7555 + 603 + 8593

In [None]:
# Scratch check of how column-wise pd.concat lines up two frames whose index
# labels are in opposite order (rows are matched by index label, not position).
pd.concat([pd.DataFrame([0, 1, 2]), pd.DataFrame([0, 1, 2], index = [2, 1, 0])], axis = 1)