In [1]:
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from typing import Dict, List, Tuple

In [2]:
import sys
import os

# Add /app to the import path (only once) so the `src` package imported in the
# next cell can be found -- assumes the project is mounted at /app.
app = "/app"
if sys.path.count(app) == 0:
    sys.path.append(app)

In [3]:
from src.sampler import Sampler
from src.corpus import Corpus
from src.embed import OpenAIEmbedder, SBERTEmbedder, BarlowParagraphEmbedder, BarlowSentenceEmbedder
from src.utils import split_paragraphs, split_sentences
from src.config import config

In [4]:
# Show the loaded configuration: the default embedding model per backend and the `sections` setting.
config

Config(embedding_model={'openai': 'text-embedding-ada-002', 'sbert': 'all-mpnet-base-v2', 'barlow': '/app/pretrained/twin-lm-checkpoint-32000'}, sections='introduction+results+discussion+methods')

In [5]:
# import openai
# openai.Model.list()

In [6]:
# Choose the embedder for this run; keep exactly one assignment uncommented.
# Note: the default model from config.embedding_model can be overridden with model="...".
# embedder = SBERTEmbedder()
embedder = OpenAIEmbedder()
# embedder = BarlowParagraphEmbedder(model="/app/pretrained/twin-no-lm-checkpoint-32000")
# embedder = BarlowSentenceEmbedder()
print(embedder.model)  # echo the model name actually in use

text-embedding-ada-002


In [7]:
# DOIs of the preprints that make up the test corpus.
# Trailing comments record the number of reviews where it was noted.
doi_list = [
    "10.1101/2021.05.12.443743",  # 3 reviews
    "10.1101/2022.11.25.517987",  # 2 reviews
    "10.1101/2022.01.04.474903",  # 3 reviews
    "10.1101/2022.09.09.507210",  # 3 reviews
    "10.1101/2022.08.24.504515",
    "10.1101/2022.09.08.507099",
    "10.1101/2022.04.17.488591",
    "10.1101/2022.08.24.505080",
    "10.1101/2021.12.21.473685",
    "10.1101/2021.07.31.454568",
]

In [8]:
# first time: download the reviews and preprints, then save the corpus to disk
# corpus = Corpus(doi_list)
# corpus.save('test_corpus')

In [9]:
# Once saved, restore the corpus from disk.
# NOTE(review): this relies on 'test_corpus' already existing, i.e. on the
# commented-out build-and-save cell having been run once beforehand.
corpus = Corpus().from_dir('test_corpus')

In [22]:
# Build the sampler from the restored corpus, the selected embedder and the
# paragraph splitter (`split_sentences` is imported as well but not used here).
sampler = Sampler(corpus, embedder, split_paragraphs)

In [23]:
# Draw the similarity samples; the result is indexed by 'null' and 'enriched' in the cells below.
# NOTE(review): no random seed is set in this notebook -- if `sample` is stochastic, the
# counts and cutoffs below will vary between runs; confirm whether Sampler accepts a seed.
distro = sampler.sample(n_sample=5)

In [24]:
# Similarity scores of the 'null' sample, tagged with their distribution label.
null = pd.DataFrame().assign(similarity=distro['null'], distro='null')
N_null = len(null)  # number of null similarity values
N_null

5323

In [25]:
# Similarity scores of the 'enriched' sample, tagged with their distribution label.
enriched = pd.DataFrame().assign(similarity=distro['enriched'], distro='enriched')
N_enriched = len(enriched)  # number of enriched similarity values
N_enriched

4020

In [34]:
# Overlaid, percent-normalised histograms of the enriched and null similarity
# scores, drawn with one shared bin width so the two distributions line up.
BIN_SIZE = 0.0025  # histogram bin width, in similarity units

fig = go.Figure()
# Same trace order as before: enriched first, null drawn on top of it.
for label, frame in (("enriched", enriched), ("null", null)):
    fig.add_trace(
        go.Histogram(
            x=frame.similarity,
            histnorm="percent",
            name=label,
            xbins=go.histogram.XBins(size=BIN_SIZE),
        )
    )

# Overlay both histograms and label the axes so the figure stands on its own
fig.update_layout(
    title_text=f"Distribution with {embedder.model}",
    xaxis_title_text="similarity",
    yaxis_title_text="percent of samples",
    barmode='overlay',
    template="plotly_dark",
)
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.9)
fig.show()

In [35]:
# 99th percentile of the enriched similarity scores
cutoff_pos = enriched['similarity'].quantile(0.99)
cutoff_pos

0.9160430020093918

In [36]:
# 99th percentile of the null similarity scores
cutoff_neg = null['similarity'].quantile(0.99)
cutoff_neg

0.8105259239673615