In [1]:
# ! pip install openai
# ! pip install git+https://github.com/openai/whisper.git
# ! pip install pytube
# ! pip install pypdf langchain

## 1. Papers

In [2]:
import os
import sys
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from utils.arxiv_utils import get_inspire_hep_papers, extract_arxiv_ids, download_arxiv_source, remove_latex_preamble
from utils.db_utils import update_dataframe, delete_files_except_extensions, get_filenames_with_extensions, scrape_website_text

%load_ext autoreload
%autoreload 2

In [3]:
# Data locations used throughout the notebook (relative paths)
pdf_dir = "../data/papers/"      # where downloaded paper sources / PDFs are stored
db_dir = "../data/db/"           # where the database CSVs are stored
txt_dir = "../data/interviews/"  # where the interview transcripts are stored

In [4]:
# Make sure the download directory exists. exist_ok=True keeps this cell safe
# to re-run, while still raising if the path exists but is not a directory
# (the previous try/except silently accepted that case).
os.makedirs(pdf_dir, exist_ok=True)

# Get papers from INSPIRE-HEP
# NOTE(review): the arguments look like an INSPIRE author identifier and a
# year cutoff — confirm against utils.arxiv_utils.get_inspire_hep_papers.
papers = get_inspire_hep_papers("Jesse.Thaler.1", 2021)

# Extract arXiv IDs from papers
arxiv_ids = extract_arxiv_ids(papers)

# Download papers (sources if available, otherwise PDFs).
# A plain loop is used because only the side effect matters; the return
# values of download_arxiv_source are not needed.
for arxiv_id in tqdm(arxiv_ids):
    download_arxiv_source(arxiv_id, output_dir=pdf_dir)

Directory '../data/papers/' already exists


  4%|▍         | 1/26 [00:05<02:16,  5.47s/it]

Found 1 .tex files in 2112.05722
They are: ['../data/papers/2112.05722/main.tex']
Using ../data/papers/2112.05722/main.tex as the main .tex file


  8%|▊         | 2/26 [00:22<05:00, 12.50s/it]

Found 1 .tex files in 2205.02857
They are: ['../data/papers/2205.02857/celestial_nongaussianity.tex']
Using ../data/papers/2205.02857/celestial_nongaussianity.tex as the main .tex file


 12%|█▏        | 3/26 [00:53<07:53, 20.60s/it]

Found 2 .tex files in 2205.06818
They are: ['../data/papers/2205.06818/main.tex', '../data/papers/2205.06818/Resummation EFPs/main.tex']
Using ../data/papers/2205.06818/main.tex as the main .tex file


 15%|█▌        | 4/26 [00:57<05:14, 14.28s/it]

Found 1 .tex files in 2211.13519
They are: ['../data/papers/2211.13519/main.tex']
Using ../data/papers/2211.13519/main.tex as the main .tex file


 19%|█▉        | 5/26 [01:10<04:47, 13.70s/it]

Found 2 .tex files in 2109.06065
They are: ['../data/papers/2109.06065/main.tex', '../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex']
Using ../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex as the main .tex file


 23%|██▎       | 6/26 [01:11<03:08,  9.44s/it]

Found 1 .tex files in 2109.13243
They are: ['../data/papers/2109.13243/paper.tex']
Using ../data/papers/2109.13243/paper.tex as the main .tex file


 27%|██▋       | 7/26 [01:17<02:36,  8.26s/it]

Found 1 .tex files in 2111.09914
They are: ['../data/papers/2111.09914/ridgepaperALEPH.tex']
Using ../data/papers/2111.09914/ridgepaperALEPH.tex as the main .tex file


 31%|███       | 8/26 [01:59<05:41, 19.00s/it]

Found 1 .tex files in 2205.04459
They are: ['../data/papers/2205.04459/open_data_topics_prd.tex']
Using ../data/papers/2205.04459/open_data_topics_prd.tex as the main .tex file


 35%|███▍      | 9/26 [02:00<03:45, 13.29s/it]

Found 1 .tex files in 2205.05084
They are: ['../data/papers/2205.05084/main.tex']
Using ../data/papers/2205.05084/main.tex as the main .tex file


 38%|███▊      | 10/26 [02:01<02:34,  9.66s/it]

Found 1 .tex files in 2205.02814
They are: ['../data/papers/2205.02814/quantum_thrust.tex']
Using ../data/papers/2205.02814/quantum_thrust.tex as the main .tex file


 42%|████▏     | 11/26 [02:03<01:51,  7.41s/it]

Found 1 .tex files in 2201.07800
They are: ['../data/papers/2201.07800/EEC_opendata_v6.tex']
Using ../data/papers/2201.07800/EEC_opendata_v6.tex as the main .tex file


 46%|████▌     | 12/26 [02:05<01:20,  5.76s/it]

Found 1 .tex files in 2107.08979
They are: ['../data/papers/2107.08979/main.tex']
Using ../data/papers/2107.08979/main.tex as the main .tex file


 50%|█████     | 13/26 [02:28<02:20, 10.84s/it]

Found 1 .tex files in 2205.10375
They are: ['../data/papers/2205.10375/qubo_lzero.tex']
Using ../data/papers/2205.10375/qubo_lzero.tex as the main .tex file


 54%|█████▍    | 14/26 [02:29<01:33,  7.77s/it]

Found 1 .tex files in 2205.03413
They are: ['../data/papers/2205.03413/main.tex']
Using ../data/papers/2205.03413/main.tex as the main .tex file


 58%|█████▊    | 15/26 [02:31<01:06,  6.07s/it]

Found 1 .tex files in 2101.07263
They are: ['../data/papers/2101.07263/main.tex']
Using ../data/papers/2101.07263/main.tex as the main .tex file


 62%|██████▏   | 16/26 [02:43<01:18,  7.85s/it]

Found 4 .tex files in 2301.08128
They are: ['../data/papers/2301.08128/epic.tex', '../data/papers/2301.08128/SciPostPhys_Template.tex', '../data/papers/2301.08128/old_JHEP/_paper.tex', '../data/papers/2301.08128/old_JHEP/jhepexample.tex']
Using ../data/papers/2301.08128/epic.tex as the main .tex file


 65%|██████▌   | 17/26 [02:44<00:51,  5.76s/it]

Found 12 .tex files in 2203.08805
They are: ['../data/papers/2203.08805/main.tex', '../data/papers/2203.08805/Sections/challenges.tex', '../data/papers/2203.08805/Sections/classification.tex', '../data/papers/2203.08805/Sections/data_generation.tex', '../data/papers/2203.08805/Sections/outlook.tex', '../data/papers/2203.08805/Sections/extra_text.tex', '../data/papers/2203.08805/Sections/object_reconstruction.tex', '../data/papers/2203.08805/Sections/quantum_computing.tex', '../data/papers/2203.08805/Sections/info_processing.tex', '../data/papers/2203.08805/Sections/VQCcircuit.tex', '../data/papers/2203.08805/Sections/introduction.tex', '../data/papers/2203.08805/Sections/quantum_inspired_algos.tex']
Using ../data/papers/2203.08805/Sections/classification.tex as the main .tex file


 69%|██████▉   | 18/26 [02:44<00:33,  4.22s/it]

Found 2 .tex files in 2105.04448
They are: ['../data/papers/2105.04448/math_commands.tex', '../data/papers/2105.04448/iclr2021_conference.tex']
Using ../data/papers/2105.04448/iclr2021_conference.tex as the main .tex file


 73%|███████▎  | 19/26 [02:55<00:43,  6.21s/it]

Found 3 .tex files in 2203.07460
They are: ['../data/papers/2203.07460/include_shortcuts.tex', '../data/papers/2203.07460/include_settings.tex', '../data/papers/2203.07460/scipost_new.tex']
Using ../data/papers/2203.07460/scipost_new.tex as the main .tex file


 77%|███████▋  | 20/26 [02:56<00:27,  4.57s/it]

Found 3 .tex files in 2209.07559
They are: ['../data/papers/2209.07559/main.tex', '../data/papers/2209.07559/settings-scipost.tex', '../data/papers/2209.07559/shortcuts.tex']
Using ../data/papers/2209.07559/main.tex as the main .tex file


 81%|████████  | 21/26 [02:59<00:21,  4.25s/it]

Found 1 .tex files in 2302.12266
They are: ['../data/papers/2302.12266/main.tex']
Using ../data/papers/2302.12266/main.tex as the main .tex file


 85%|████████▍ | 22/26 [04:46<02:19, 34.85s/it]

Found 42 .tex files in 2203.07622
They are: ['../data/papers/2203.07622/ILC2022report.tex', '../data/papers/2203.07622/authors/AuthorAddresses.tex', '../data/papers/2203.07622/authors/mymacros.tex', '../data/papers/2203.07622/authors/AuthorMacros.tex', '../data/papers/2203.07622/authors/Authors.tex', '../data/papers/2203.07622/chapters/gen-phys/gen-phys.tex', '../data/papers/2203.07622/chapters/gen-phys/figures/gen-phys.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wmass.tex', '../data/papers/2203.07622/chapters/PEW/PEW-WZmasses.tex', '../data/papers/2203.07622/chapters/PEW/PEW-summary.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wbfs.tex', '../data/papers/2203.07622/chapters/PEW/PEW-intro.tex', '../data/papers/2203.07622/chapters/PEW/PEW.tex', '../data/papers/2203.07622/chapters/PEW/PEW-ff.tex', '../data/papers/2203.07622/chapters/PEW/PEW-radreturn.tex', '../data/papers/2203.07622/chapters/farfuture/multi10.tex', '../data/papers/2203.07622/chapters/farfuture/farfuture.tex', '

 88%|████████▊ | 23/26 [04:59<01:25, 28.50s/it]

Found 5 .tex files in 2212.10659
They are: ['../data/papers/2212.10659/main.tex', '../data/papers/2212.10659/params_table.tex', '../data/papers/2212.10659/table.tex', '../data/papers/2212.10659/latent_table.tex', '../data/papers/2212.10659/timer.tex']
Using ../data/papers/2212.10659/main.tex as the main .tex file


 92%|█████████▏| 24/26 [05:07<00:44, 22.20s/it]

Found 1 .tex files in 2008.08596
They are: ['../data/papers/2008.08596/topics_draft.tex']
Using ../data/papers/2008.08596/topics_draft.tex as the main .tex file


 96%|█████████▌| 25/26 [05:08<00:15, 15.76s/it]

Found 1 .tex files in 2010.11998
They are: ['../data/papers/2010.11998/main_v2_arXiv.tex']
Using ../data/papers/2010.11998/main_v2_arXiv.tex as the main .tex file


100%|██████████| 26/26 [05:24<00:00, 12.49s/it]

Found 1 .tex files in 2101.08320
They are: ['../data/papers/2101.08320/merge_rewrite.tex']
Using ../data/papers/2101.08320/merge_rewrite.tex as the main .tex file





In [5]:
# WARNING: destructive. Deletes everything in the download directory
# except PDF and TeX files.
extensions_to_keep = ['.pdf', '.tex']
delete_files_except_extensions(pdf_dir, extensions_to_keep)

In [19]:
# List the paper files to ingest: TeX sources and PDFs in pdf_dir
# (exact matching rules live in utils.db_utils.get_filenames_with_extensions)
paper_extensions = ['.tex', '.pdf']
filenames = get_filenames_with_extensions(pdf_dir, paper_extensions)

In [20]:
# Extract the text of every paper file into two parallel lists:
# text[k] is the content of one file and source_type[k] its label ("paper").
source_type = []
text = []

for file in tqdm(filenames):
    # os.path.join avoids the doubled slash that "{}/{}" produced with a
    # directory that already ends in "/".
    path = os.path.join(pdf_dir, file)
    extension = os.path.splitext(file)[-1]
    try:
        if extension == '.pdf':
            # Concatenate the text of all chunks returned by the PDF loader
            pages = PyPDFLoader(path).load_and_split()
            content = ''.join(page.page_content for page in pages)
        elif extension == '.tex':
            # NOTE(review): iso-8859-1 never fails to decode, but UTF-8
            # sources will come out with garbled non-ASCII characters —
            # confirm this encoding choice is intended.
            with open(path, 'r', encoding='iso-8859-1') as f:
                content = remove_latex_preamble(f.read())
        else:
            # Neither PDF nor TeX: nothing to extract
            continue
    except Exception as exc:
        # Best-effort: report the failure (with its cause) and move on.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit stop the loop.
        print("Error with file {}: {!r}".format(file, exc))
        continue

    # Append to both lists together so they always stay aligned
    text.append(content)
    source_type.append("paper")

100%|██████████| 26/26 [00:00<00:00, 4898.13it/s]


In [21]:
# Create the DataFrame with one row per document.
# Building it from a dict of columns replaces the manual zip-transpose, and
# pandas raises if the two lists differ in length instead of silently
# truncating to the shorter one.
df = pd.DataFrame({'source_type': source_type, 'text': text})

In [22]:
# Update dataframe
df = update_dataframe('{}/df_text.csv'.format(db_dir), df)
df.to_csv('{}/df_text.csv'.format(db_dir), index=False)

## 2. YouTube videos

In [23]:
# from tqdm import tqdm
# import whisper
# import pytube
# from pathlib import Path
# import subprocess
# import numpy as np

In [24]:
# videos_dir = "../data/videos/"

# try:
#     os.makedirs(videos_dir)
# except FileExistsError:
#     print(f"Directory '{videos_dir}' already exists")

In [25]:
# # Get whisper model; download weights if necessary
# whisper_model = whisper.load_model("tiny.en").to('cpu')
# options = whisper.DecodingOptions(language="en", without_timestamps=True)

# url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
# pytube_vid = pytube.YouTube(url)
# video_path_local = Path(videos_dir).resolve() / (pytube_vid.video_id+".mp4")
# pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)
# video_path_local = video_path_local.with_suffix(".wav")
# result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])
# transcription = whisper.transcribe(whisper_model, str(video_path_local))

## 3. Interviews

In [26]:
# List the interview transcripts to ingest: .txt files in txt_dir
# (exact matching rules live in utils.db_utils.get_filenames_with_extensions)
transcript_extensions = ['.txt']
filenames = get_filenames_with_extensions(txt_dir, transcript_extensions)

In [27]:
# Read every interview transcript into two parallel lists:
# text[k] is the content of one file and source_type[k] its label ("interview").
source_type = []
text = []

for file in tqdm(filenames):
    try:
        # NOTE(review): no encoding is given, so the platform default is
        # used — confirm the transcripts' encoding and pin it explicitly.
        with open(os.path.join(txt_dir, file), 'r') as f:
            content = f.read()
    except (OSError, UnicodeError) as exc:
        # Best-effort: report unreadable / undecodable files and move on,
        # without swallowing KeyboardInterrupt like a bare except would.
        print("Error with file {}: {!r}".format(file, exc))
        continue

    # Append to both lists together so they always stay aligned
    text.append(content)
    source_type.append("interview")

100%|██████████| 6/6 [00:00<00:00, 5420.16it/s]


In [28]:
# Create the DataFrame with one row per transcript.
# Building it from a dict of columns replaces the manual zip-transpose, and
# pandas raises if the two lists differ in length instead of silently
# truncating to the shorter one.
df = pd.DataFrame({'source_type': source_type, 'text': text})

# Update dataframe
# to_csv does not create missing parent directories, so make sure db_dir exists.
os.makedirs(db_dir, exist_ok=True)
db_csv = os.path.join(db_dir, 'df_text.csv')

# NOTE(review): update_dataframe presumably combines the new rows with what is
# already stored at db_csv — confirm in utils.db_utils.
df = update_dataframe(db_csv, df)
df.to_csv(db_csv, index=False)


## 4. Website/CV

In [29]:
# Pages of the website to scrape, one URL per line
websites = [
    "https://jthaler.net/group",
    "https://jthaler.net/research",
    "https://jthaler.net/engagement",
    "https://jthaler.net/faq",
    "https://jthaler.net/cv",
    "https://jthaler.net/contact",
]

# Scrape each page in order, with a progress bar; one result per URL
text_website = [scrape_website_text(url) for url in tqdm(websites)]

100%|██████████| 6/6 [00:01<00:00,  4.75it/s]


In [30]:
# Create the DataFrame with one row per scraped page, all labelled "website".
# A dict of columns replaces the manual zip-transpose of two lists.
df = pd.DataFrame({
    'source_type': ["website"] * len(text_website),
    'text': text_website,
})

# Update dataframe
# to_csv does not create missing parent directories, so make sure db_dir exists.
os.makedirs(db_dir, exist_ok=True)
db_csv = os.path.join(db_dir, 'df_text.csv')

# NOTE(review): update_dataframe presumably combines the new rows with what is
# already stored at db_csv — confirm in utils.db_utils.
df = update_dataframe(db_csv, df)
df.to_csv(db_csv, index=False)

In [31]:
# Number of rows in the dataframe returned by the last update
df.shape[0]

38