In [2]:
# ! pip install git+https://github.com/openai/whisper.git
# ! pip install pytube
# ! pip install pypdf langchain

## 1. Papers

In [176]:
import os
import requests
from langchain.document_loaders import PyPDFLoader
import pandas as pd
from tqdm import tqdm

In [177]:
def get_inspire_hep_papers(author):
    inspire_url = "https://inspirehep.net/api/literature"
    params = {
        "q": f"author:{author}",
        "size": 1000,  # Increase this value if the author has more than 1000 papers
        "fields": "arxiv_eprints",
    }
    response = requests.get(inspire_url, params=params)
    data = response.json()
    papers = data["hits"]["hits"]
    return papers

def extract_arxiv_ids(papers):
    arxiv_ids = []
    for paper in papers:
        arxiv_eprints = paper['metadata'].get("arxiv_eprints", [])
        if arxiv_eprints:
            arxiv_id = arxiv_eprints[0]["value"]
            arxiv_ids.append(arxiv_id)
    return arxiv_ids

def download_arxiv_pdf(arxiv_id, output_dir="../data/pdfs"):
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{arxiv_id}.pdf".replace("/", "_"))

    with open(output_path, "wb") as f:
        f.write(response.content)
        
def get_pdf_filenames(directory):
    all_files = os.listdir(directory)
    pdf_files = [file for file in all_files if file.lower().endswith('.pdf')]
    return pdf_files

In [181]:
pdf_dir = '../data/pdfs/'

papers = get_inspire_hep_papers("Jesse.Thaler.1")
arxiv_ids = extract_arxiv_ids(papers)
[download_arxiv_pdf(arxiv_id) for arxiv_id in tqdm(arxiv_ids)];

In [180]:
source_type = []
source_location = []
text = []

for i, file in enumerate(tqdm(get_pdf_filenames(pdf_dir))):
    loader = PyPDFLoader("{}/{}".format(pdf_dir, file))
    pages = loader.load_and_split()
    
    text.append(''.join([page.page_content for page in pages]))
    source_type.append("paper")
    source_location.append("https://arxiv.org/abs/{}".format(file.replace('_', '/')))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142/142 [05:40<00:00,  2.40s/it]


In [182]:
data = [source_type, source_location, text]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['source_type', 'source_location', 'text']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

In [183]:
df.to_csv('../data/text/df_text.csv', index=False)

## 2. YouTube videos

## 3. Interviews

## 4. Articles/Websites/CV

In [96]:
from tqdm.notebook import tqdm
import whisper
import pytube
from pathlib import Path
import subprocess
import numpy as np

In [13]:
whisper_model = whisper.load_model("tiny.en").to('cuda')

In [14]:
options = whisper.DecodingOptions(language="en", without_timestamps=True)

In [15]:
url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
pytube_vid = pytube.YouTube(url)

video_path_local = Path(".").resolve() / (pytube_vid.video_id+".mp4")
pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)

'/n/holystore01/LABS/iaifi_lab/Users/smsharma/jesse-embedding/notebooks/dqxdPNzBY0I.mp4'

In [41]:
video_path_local = video_path_local.with_suffix(".wav")
result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])

In [30]:
transcription = whisper.transcribe(whisper_model, str(video_path_local))

In [45]:
openai.api_key = ""

In [94]:
def sliding_window(text, window_size, stride):
    tokens = text.split()
    window_start = 0
    while window_start < len(tokens):
        window_end = min(window_start + window_size, len(tokens))
        yield ' '.join(tokens[window_start:window_end])
        window_start += stride
        
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

def semantic_search(query_embedding, embeddings):
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    ranked_indices = np.argsort(-similarities)
    return ranked_indices

def answer_question(chunk, question, model="text-davinci-002", max_tokens=150, temperature=0.7):
    prompt = f"{chunk}\nQuestion: {question}\nAnswer:"
    response = openai.Completion.create(
        engine=model,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )

    answer = response.choices[0].text.strip()
    return answer

In [79]:
window_size = 1024
stride = 512

text_chunks = list(sliding_window(transcription['text'], window_size, stride))
emeddings = [get_embedding(text) for text in tqdm(text_chunks)]

  0%|          | 0/26 [00:00<?, ?it/s]

In [91]:
query = "will real time decision making be relevant for the LHC? what other experiments will it be relevant for?"
query_embedding = get_embedding(query)

In [92]:
ranked_indices = semantic_search(np.array(query_embedding), np.array(emeddings))
most_relevant_chunk = text_chunks[ranked_indices[0]]
answer = answer_question(most_relevant_chunk, query)

In [93]:
answer

'Real-time decision making may be relevant for the LHC in the context of triggering. Triggering is the process of deciding which collisions to save and which to discard. It may also be relevant for other experiments, such as LIGO, which is looking at how to control their gravitational wave observations.'