In [3]:
# ! pip install git+https://github.com/openai/whisper.git
# ! pip install pytube
# ! pip install pypdf langchain

## 1. Papers

In [42]:
import os
import sys
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from utils.arxiv_utils import get_inspire_hep_papers, extract_arxiv_ids, download_arxiv_source, delete_files_except_extensions, get_filenames_with_extensions
from utils.db_utils import update_dataframe

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Locations (relative to this notebook) of the inputs and the output table.
# The trailing slashes are part of the values used by the cells below.
_data_root = '../data'
pdf_dir = f'{_data_root}/papers/'      # downloaded arXiv sources / PDFs
db_dir = f'{_data_root}/db/'           # CSV database of extracted text
txt_dir = f'{_data_root}/interviews/'  # plain-text interview transcripts

In [5]:

# Create the download directory, reporting (rather than failing) if it is
# already there.
try:
    os.makedirs(pdf_dir)
except FileExistsError:
    print(f"Directory '{pdf_dir}' already exists")

# Look up the author's INSPIRE-HEP record and pull the arXiv identifiers
# out of the returned papers.
papers = get_inspire_hep_papers("Jesse.Thaler.1")
arxiv_ids = extract_arxiv_ids(papers)

# Fetch every paper into pdf_dir (sources if available, otherwise PDFs).
for arxiv_id in tqdm(arxiv_ids):
    download_arxiv_source(arxiv_id, output_dir=pdf_dir)

# Prune everything that is not a PDF or TeX file, then list what is left.
delete_files_except_extensions(pdf_dir, ['.pdf', '.tex'])
filenames = get_filenames_with_extensions(pdf_dir, ['.tex', '.pdf'])


In [None]:
# Read every downloaded paper into memory as text.
# `source_type` and `text` are parallel lists: entry k of each refers to the
# same file. A file only contributes once its content was read successfully,
# so the two lists always stay the same length.
source_type = []
text = []

for file in tqdm(filenames):
    path = os.path.join(pdf_dir, file)
    extension = os.path.splitext(file)[-1]
    try:
        if extension == '.pdf':
            # load_and_split() returns a list of documents; their contents
            # are concatenated into one string per paper.
            pages = PyPDFLoader(path).load_and_split()
            content = ''.join(page.page_content for page in pages)
        elif extension == '.tex':
            # iso-8859-1 maps every byte to a character, so decoding itself
            # never raises regardless of the source's real encoding.
            with open(path, 'r', encoding='iso-8859-1') as f:
                content = f.read()
        else:
            # Neither PDF nor TeX: nothing to extract.
            continue
    except Exception as err:
        # Best-effort: report the failing file and why, then move on.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
        # still stop the loop.
        print("Error with file {}: {!r}".format(file, err))
        continue

    text.append(content)
    source_type.append("paper")

  0%|          | 0/125 [00:00<?, ?it/s]

Error with file hep-ph_0604192.pdf


100%|██████████| 125/125 [00:16<00:00,  7.56it/s]


In [None]:
# Column names for the DataFrame, in the same order as the lists in `data`
columns = ['source_type', 'text']
data = [source_type, text]

# zip(*data) pairs the k-th entry of every list, turning the per-column
# lists into one [source_type, text] row per document
transposed_data = [list(row) for row in zip(*data)]

# One row per document
df = pd.DataFrame(transposed_data, columns=columns)

In [None]:
# Pass the new rows and the CSV path through the project helper
# update_dataframe (presumably merges with what is already on disk --
# confirm in utils.db_utils), then write the result back without the index.
csv_path = f'{db_dir}/df_text.csv'
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

## 2. YouTube videos

In [6]:
# from tqdm import tqdm
# import whisper
# import pytube
# from pathlib import Path
# import subprocess
# import numpy as np

In [7]:
# videos_dir = "../data/videos/"

# try:
#     os.makedirs(videos_dir)
# except FileExistsError:
#     print(f"Directory '{videos_dir}' already exists")

In [8]:
# # Get whisper model; download weights if necessary
# whisper_model = whisper.load_model("tiny.en").to('cpu')
# options = whisper.DecodingOptions(language="en", without_timestamps=True)

# url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
# pytube_vid = pytube.YouTube(url)
# video_path_local = Path(videos_dir).resolve() / (pytube_vid.video_id+".mp4")
# pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)
# video_path_local = video_path_local.with_suffix(".wav")
# result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])
# transcription = whisper.transcribe(whisper_model, str(video_path_local))

## 3. Interviews

In [31]:
# List the interview transcripts; note this rebinds `filenames`, which the
# papers section used for the PDF/TeX files.
txt_extensions = ['.txt']
filenames = get_filenames_with_extensions(txt_dir, txt_extensions)

In [12]:
# Read every interview transcript into memory.
# `source_type` and `text` are parallel lists: entry k of each refers to the
# same file, and a file is only recorded once it was read successfully.
source_type = []
text = []

for file in tqdm(filenames):
    path = os.path.join(txt_dir, file)
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, so non-ASCII punctuation in the transcripts reads
        # the same on every machine.
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as err:
        # Best-effort: report the failing file and why, then move on.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
        # still stop the loop.
        print("Error with file {}: {!r}".format(file, err))
        continue

    text.append(content)
    source_type.append("interview")

100%|██████████| 6/6 [00:00<00:00, 7292.33it/s]


In [40]:
# Column names for the DataFrame, in the same order as the lists in `data`
columns = ['source_type', 'text']
data = [source_type, text]

# zip(*data) pairs the k-th entry of every list, turning the per-column
# lists into one [source_type, text] row per interview
transposed_data = [list(row) for row in zip(*data)]

# One row per interview
df = pd.DataFrame(transposed_data, columns=columns)

# Pass the new rows and the CSV path through the project helper
# update_dataframe (presumably merges with what is already on disk --
# confirm in utils.db_utils), then write the result back without the index.
csv_path = f'{db_dir}/df_text.csv'
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)


In [41]:
# Sanity check: reload the saved table from disk and display it.
pd.read_csv(f'{db_dir}/df_text.csv')

Unnamed: 0,source_type,text
0,paper,"%\RequirePackage{lineno}\n\documentclass[aps,p..."
1,paper,"\documentclass[aps,prd,floatfix,preprintnumber..."
2,paper,"\documentclass[letterpaper,11pt]{article}\n\pd..."
3,paper,"\documentclass[11pt,letterpaper]{article}\n\pd..."
4,paper,"\documentclass[aps,twocolumn,nofootinbib,super..."
...,...,...
125,interview,"The Future of Particle Physics is ""Open""\nGues..."
126,interview,Can we trust physics decisions made by machine...
127,interview,Can a Computer Devise a Theory of Everything?”...
128,interview,Jesse Thaler is a theoretical particle physici...


## 4. Website/CV