In [41]:
import os
import sys
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from utils.arxiv_utils import get_inspire_hep_papers, extract_arxiv_ids, download_arxiv_source, remove_latex_preamble
from utils.db_utils import update_dataframe, delete_files_except_extensions, get_filenames_with_extensions, scrape_website_text

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Data locations used throughout the notebook (relative paths, each ending in '/')

# Downloaded arXiv sources / PDFs
pdf_dir = '../data/papers/'

# CSV files that make up the text database
db_dir = '../data/db/'

# Plain-text interview transcripts
txt_dir = '../data/interviews/'

## 1. Papers

In [43]:
# Make sure the download directory exists. With exist_ok=True an existing
# directory is accepted silently, while a path that exists but is NOT a
# directory still raises FileExistsError instead of being reported as fine.
os.makedirs(pdf_dir, exist_ok=True)

# Get papers from INSPIRE-HEP, with a year cutoff
papers = get_inspire_hep_papers("Jesse.Thaler.1", year_cutoff=2018)

# Extract arXiv IDs from papers
arxiv_ids = extract_arxiv_ids(papers)

# Download papers (sources if available, otherwise PDFs).
# A plain loop is used because the call is made only for its side effects;
# no throwaway list of return values is built.
for arxiv_id in tqdm(arxiv_ids):
    download_arxiv_source(arxiv_id, output_dir=pdf_dir)

Directory '../data/papers/' already exists



  0%|                                                                 | 0/53 [00:00<?, ?it/s][A
  2%|█                                                        | 1/53 [00:16<14:39, 16.91s/it][A

Found 1 .tex files in 2008.08596
They are: ['../data/papers/2008.08596/topics_draft.tex']
Using ../data/papers/2008.08596/topics_draft.tex as the main .tex file



  4%|██▏                                                      | 2/53 [00:30<12:44, 15.00s/it][A

Found 1 .tex files in 2112.05722
They are: ['../data/papers/2112.05722/main.tex']
Using ../data/papers/2112.05722/main.tex as the main .tex file



  6%|███▏                                                     | 3/53 [01:08<21:03, 25.27s/it][A

Found 1 .tex files in 2205.02857
They are: ['../data/papers/2205.02857/celestial_nongaussianity.tex']
Using ../data/papers/2205.02857/celestial_nongaussianity.tex as the main .tex file



  8%|████▎                                                    | 4/53 [01:20<16:31, 20.24s/it][A

Found 1 .tex files in 1902.02346
They are: ['../data/papers/1902.02346/emd.tex']
Using ../data/papers/1902.02346/emd.tex as the main .tex file



  9%|█████▍                                                   | 5/53 [01:39<15:48, 19.75s/it][A

Found 1 .tex files in 1901.10652
They are: ['../data/papers/1901.10652/ABRA_Technical_PRD_2019_singleFile.tex']
Using ../data/papers/1901.10652/ABRA_Technical_PRD_2019_singleFile.tex as the main .tex file



 11%|██████▍                                                  | 6/53 [01:40<10:36, 13.53s/it][A

Found 2 .tex files in 1908.08949
They are: ['../data/papers/1908.08949/quantum_thrust_arXiv_v2.tex', '../data/papers/1908.08949/Qcircuit.tex']
Using ../data/papers/1908.08949/quantum_thrust_arXiv_v2.tex as the main .tex file



 13%|███████▌                                                 | 7/53 [02:56<25:49, 33.69s/it][A

Found 2 .tex files in 2205.06818
They are: ['../data/papers/2205.06818/main.tex', '../data/papers/2205.06818/Resummation EFPs/main.tex']
Using ../data/papers/2205.06818/main.tex as the main .tex file



 15%|████████▌                                                | 8/53 [03:01<18:30, 24.67s/it][A

Found 1 .tex files in 1911.09107
They are: ['../data/papers/1911.09107/main.tex']
Using ../data/papers/1911.09107/main.tex as the main .tex file



 17%|█████████▋                                               | 9/53 [03:03<12:49, 17.49s/it][A

Found 3 .tex files in 2003.07868
They are: ['../data/papers/2003.07868/jdefs.tex', '../data/papers/2003.07868/report.tex', '../data/papers/2003.07868/authors_all_alphabetical.tex']
Using ../data/papers/2003.07868/report.tex as the main .tex file



 19%|██████████▌                                             | 10/53 [03:33<15:20, 21.40s/it][A

Found 1 .tex files in 1908.08542
They are: ['../data/papers/1908.08542/mod_emd.tex']
Using ../data/papers/1908.08542/mod_emd.tex as the main .tex file



 21%|███████████▌                                            | 11/53 [03:38<11:24, 16.29s/it][A

Found 1 .tex files in 2004.04159
They are: ['../data/papers/2004.04159/eventgeometry.tex']
Using ../data/papers/2004.04159/eventgeometry.tex as the main .tex file



 23%|████████████▋                                           | 12/53 [03:41<08:25, 12.32s/it][A

Found 1 .tex files in 2101.07263
They are: ['../data/papers/2101.07263/main.tex']
Using ../data/papers/2101.07263/main.tex as the main .tex file



 25%|█████████████▋                                          | 13/53 [03:42<05:56,  8.91s/it][A

Found 1 .tex files in 1810.11032
They are: ['../data/papers/1810.11032/NavigatingCollinearSuperspace_v3.tex']
Using ../data/papers/1810.11032/NavigatingCollinearSuperspace_v3.tex as the main .tex file



 26%|██████████████▊                                         | 14/53 [03:43<04:20,  6.67s/it][A

Found 1 .tex files in 1911.04491
They are: ['../data/papers/1911.04491/efms.tex']
Using ../data/papers/1911.04491/efms.tex as the main .tex file



 28%|███████████████▊                                        | 15/53 [03:45<03:11,  5.04s/it][A

Found 1 .tex files in 2109.13243
They are: ['../data/papers/2109.13243/paper.tex']
Using ../data/papers/2109.13243/paper.tex as the main .tex file



 30%|████████████████▉                                       | 16/53 [03:57<04:28,  7.26s/it][A

Found 1 .tex files in 2211.13519
They are: ['../data/papers/2211.13519/main.tex']
Using ../data/papers/2211.13519/main.tex as the main .tex file



 32%|█████████████████▉                                      | 17/53 [04:01<03:45,  6.28s/it][A

Found 1 .tex files in 1906.00489
They are: ['../data/papers/1906.00489/ridgepaperALEPH.tex']
Using ../data/papers/1906.00489/ridgepaperALEPH.tex as the main .tex file



 34%|███████████████████                                     | 18/53 [04:13<04:42,  8.06s/it][A

Found 1 .tex files in 1902.04222
They are: ['../data/papers/1902.04222/mumu_v2p2.tex']
Using ../data/papers/1902.04222/mumu_v2p2.tex as the main .tex file



 36%|████████████████████                                    | 19/53 [04:15<03:32,  6.25s/it][A

Found 1 .tex files in 1812.05111
They are: ['../data/papers/1812.05111/quantile_v2.tex']
Using ../data/papers/1812.05111/quantile_v2.tex as the main .tex file



 38%|█████████████████████▏                                  | 20/53 [04:46<07:27, 13.57s/it][A

Found 2 .tex files in 2109.06065
They are: ['../data/papers/2109.06065/main.tex', '../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex']
Using ../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex as the main .tex file



 40%|██████████████████████▏                                 | 21/53 [04:57<06:54, 12.95s/it][A

Found 1 .tex files in 1810.12257
They are: ['../data/papers/1810.12257/ABRA_PRL_2018_singleFile.tex']
Using ../data/papers/1810.12257/ABRA_PRL_2018_singleFile.tex as the main .tex file



 42%|███████████████████████▏                                | 22/53 [05:08<06:20, 12.27s/it][A

Found 1 .tex files in 2111.09914
They are: ['../data/papers/2111.09914/ridgepaperALEPH.tex']
Using ../data/papers/2111.09914/ridgepaperALEPH.tex as the main .tex file



 43%|████████████████████████▎                               | 23/53 [06:53<20:00, 40.01s/it][A

Found 1 .tex files in 2205.04459
They are: ['../data/papers/2205.04459/open_data_topics_prd.tex']
Using ../data/papers/2205.04459/open_data_topics_prd.tex as the main .tex file



 45%|█████████████████████████▎                              | 24/53 [06:54<13:44, 28.43s/it][A

Found 1 .tex files in 1809.01656
They are: ['../data/papers/1809.01656/axion_interferometry.tex']
Using ../data/papers/1809.01656/axion_interferometry.tex as the main .tex file



 47%|██████████████████████████▍                             | 25/53 [06:56<09:31, 20.42s/it][A

Found 1 .tex files in 2205.05084
They are: ['../data/papers/2205.05084/main.tex']
Using ../data/papers/2205.05084/main.tex as the main .tex file



 49%|███████████████████████████▍                            | 26/53 [06:57<06:34, 14.60s/it][A

Found 1 .tex files in 1909.00009
They are: ['../data/papers/1909.00009/Circumnavigating_Collinear_Superspace_arXivv3.tex']
Using ../data/papers/1909.00009/Circumnavigating_Collinear_Superspace_arXivv3.tex as the main .tex file



 51%|████████████████████████████▌                           | 27/53 [07:01<05:00, 11.55s/it][A

Found 1 .tex files in 2205.02814
They are: ['../data/papers/2205.02814/quantum_thrust.tex']
Using ../data/papers/2205.02814/quantum_thrust.tex as the main .tex file



 53%|█████████████████████████████▌                          | 28/53 [07:04<03:38,  8.74s/it][A

Found 1 .tex files in 2201.07800
They are: ['../data/papers/2201.07800/EEC_opendata_v6.tex']
Using ../data/papers/2201.07800/EEC_opendata_v6.tex as the main .tex file



 55%|██████████████████████████████▋                         | 29/53 [07:09<03:09,  7.88s/it][A

Found 1 .tex files in 2107.08979
They are: ['../data/papers/2107.08979/main.tex']
Using ../data/papers/2107.08979/main.tex as the main .tex file



 57%|███████████████████████████████▋                        | 30/53 [08:04<08:24, 21.94s/it][A

Found 1 .tex files in 2205.10375
They are: ['../data/papers/2205.10375/qubo_lzero.tex']
Using ../data/papers/2205.10375/qubo_lzero.tex as the main .tex file



 58%|████████████████████████████████▊                       | 31/53 [08:05<05:43, 15.63s/it][A

Found 1 .tex files in 2205.03413
They are: ['../data/papers/2205.03413/main.tex']
Using ../data/papers/2205.03413/main.tex as the main .tex file



 60%|█████████████████████████████████▊                      | 32/53 [08:08<04:10, 11.93s/it][A

Found 1 .tex files in 2010.11998
They are: ['../data/papers/2010.11998/main_v2_arXiv.tex']
Using ../data/papers/2010.11998/main_v2_arXiv.tex as the main .tex file



 62%|██████████████████████████████████▊                     | 33/53 [08:30<04:56, 14.82s/it][A

Found 4 .tex files in 2301.08128
They are: ['../data/papers/2301.08128/epic.tex', '../data/papers/2301.08128/SciPostPhys_Template.tex', '../data/papers/2301.08128/old_JHEP/_paper.tex', '../data/papers/2301.08128/old_JHEP/jhepexample.tex']
Using ../data/papers/2301.08128/epic.tex as the main .tex file



 64%|███████████████████████████████████▉                    | 34/53 [08:32<03:26, 10.88s/it][A

Found 2 .tex files in 2105.04448
They are: ['../data/papers/2105.04448/math_commands.tex', '../data/papers/2105.04448/iclr2021_conference.tex']
Using ../data/papers/2105.04448/iclr2021_conference.tex as the main .tex file



 66%|████████████████████████████████████▉                   | 35/53 [10:06<10:44, 35.80s/it][A

Found 55 .tex files in 1803.07977
They are: ['../data/papers/1803.07977/LH17_intro.tex', '../data/papers/1803.07977/LH17.tex', '../data/papers/1803.07977/LH17_Conveners.tex', '../data/papers/1803.07977/LH17_authors.tex', '../data/papers/1803.07977/LH17_macros.tex', '../data/papers/1803.07977/Higgs_vbf_nnlo/vbf_nnlo.main.tex', '../data/papers/1803.07977/SM_Higgs_jet_R/Higgs_jet_R.main.tex', '../data/papers/1803.07977/SM_ewmerging_ttbar/ewmerging_ttbar.main.tex', '../data/papers/1803.07977/Higgs_twojet_VBF/twojet_VBF.main.tex', '../data/papers/1803.07977/MC_WWbb/wwbb_macros.tex', '../data/papers/1803.07977/MC_WWbb/WWbb.main.tex', '../data/papers/1803.07977/SM_loopnumerical/loopnumerical.main.tex', '../data/papers/1803.07977/Higgs_STXS/STXS.main.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.setup.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.MadGraphaMCNLO.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.introduction.tex', '../data/papers


 68%|██████████████████████████████████████                  | 36/53 [10:11<07:32, 26.60s/it][A

Found 12 .tex files in 2203.08805
They are: ['../data/papers/2203.08805/main.tex', '../data/papers/2203.08805/Sections/challenges.tex', '../data/papers/2203.08805/Sections/classification.tex', '../data/papers/2203.08805/Sections/data_generation.tex', '../data/papers/2203.08805/Sections/outlook.tex', '../data/papers/2203.08805/Sections/extra_text.tex', '../data/papers/2203.08805/Sections/object_reconstruction.tex', '../data/papers/2203.08805/Sections/quantum_computing.tex', '../data/papers/2203.08805/Sections/info_processing.tex', '../data/papers/2203.08805/Sections/VQCcircuit.tex', '../data/papers/2203.08805/Sections/introduction.tex', '../data/papers/2203.08805/Sections/quantum_inspired_algos.tex']
Using ../data/papers/2203.08805/Sections/classification.tex as the main .tex file



 70%|███████████████████████████████████████                 | 37/53 [14:28<25:30, 95.68s/it][A
 72%|████████████████████████████████████████▏               | 38/53 [14:29<16:52, 67.47s/it][A

Found 3 .tex files in 2209.07559
They are: ['../data/papers/2209.07559/main.tex', '../data/papers/2209.07559/settings-scipost.tex', '../data/papers/2209.07559/shortcuts.tex']
Using ../data/papers/2209.07559/main.tex as the main .tex file



 74%|█████████████████████████████████████████▏              | 39/53 [14:46<12:13, 52.37s/it][A

Found 3 .tex files in 2203.07460
They are: ['../data/papers/2203.07460/include_shortcuts.tex', '../data/papers/2203.07460/include_settings.tex', '../data/papers/2203.07460/scipost_new.tex']
Using ../data/papers/2203.07460/scipost_new.tex as the main .tex file



 75%|██████████████████████████████████████████▎             | 40/53 [14:52<08:19, 38.43s/it][A

Found 1 .tex files in 2302.12266
They are: ['../data/papers/2302.12266/main.tex']
Using ../data/papers/2302.12266/main.tex as the main .tex file



 77%|███████████████████████████████████████████▎            | 41/53 [18:20<17:50, 89.22s/it][A

Found 42 .tex files in 2203.07622
They are: ['../data/papers/2203.07622/ILC2022report.tex', '../data/papers/2203.07622/authors/AuthorAddresses.tex', '../data/papers/2203.07622/authors/mymacros.tex', '../data/papers/2203.07622/authors/AuthorMacros.tex', '../data/papers/2203.07622/authors/Authors.tex', '../data/papers/2203.07622/chapters/gen-phys/gen-phys.tex', '../data/papers/2203.07622/chapters/gen-phys/figures/gen-phys.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wmass.tex', '../data/papers/2203.07622/chapters/PEW/PEW-WZmasses.tex', '../data/papers/2203.07622/chapters/PEW/PEW-summary.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wbfs.tex', '../data/papers/2203.07622/chapters/PEW/PEW-intro.tex', '../data/papers/2203.07622/chapters/PEW/PEW.tex', '../data/papers/2203.07622/chapters/PEW/PEW-ff.tex', '../data/papers/2203.07622/chapters/PEW/PEW-radreturn.tex', '../data/papers/2203.07622/chapters/farfuture/multi10.tex', '../data/papers/2203.07622/chapters/farfuture/farfuture.tex', '


 79%|████████████████████████████████████████████▍           | 42/53 [18:50<13:05, 71.42s/it][A

Found 5 .tex files in 2212.10659
They are: ['../data/papers/2212.10659/main.tex', '../data/papers/2212.10659/params_table.tex', '../data/papers/2212.10659/table.tex', '../data/papers/2212.10659/latent_table.tex', '../data/papers/2212.10659/timer.tex']
Using ../data/papers/2212.10659/main.tex as the main .tex file



 81%|█████████████████████████████████████████████▍          | 43/53 [19:12<09:27, 56.75s/it][A

Found 1 .tex files in 1712.07124
They are: ['../data/papers/1712.07124/energyflow.tex']
Using ../data/papers/1712.07124/energyflow.tex as the main .tex file



 83%|██████████████████████████████████████████████▍         | 44/53 [19:54<07:49, 52.11s/it][A

Found 1 .tex files in 1810.05165
They are: ['../data/papers/1810.05165/efns.tex']
Using ../data/papers/1810.05165/efns.tex as the main .tex file



 85%|███████████████████████████████████████████████▌        | 45/53 [20:30<06:19, 47.41s/it][A

Found 1 .tex files in 2101.08320
They are: ['../data/papers/2101.08320/merge_rewrite.tex']
Using ../data/papers/2101.08320/merge_rewrite.tex as the main .tex file



 87%|████████████████████████████████████████████████▌       | 46/53 [20:31<03:54, 33.46s/it][A

Found 1 .tex files in 2007.11586
They are: ['../data/papers/2007.11586/main.tex']
Using ../data/papers/2007.11586/main.tex as the main .tex file



 89%|█████████████████████████████████████████████████▋      | 47/53 [20:58<03:08, 31.44s/it][A

Found 1 .tex files in 2004.06125
They are: ['../data/papers/2004.06125/emd_v3.tex']
Using ../data/papers/2004.06125/emd_v3.tex as the main .tex file



 91%|██████████████████████████████████████████████████▋     | 48/53 [21:00<01:52, 22.57s/it][A

Found 1 .tex files in 1709.08705
They are: ['../data/papers/1709.08705/trackObs.tex']
Using ../data/papers/1709.08705/trackObs.tex as the main .tex file



 92%|███████████████████████████████████████████████████▊    | 49/53 [21:37<01:47, 26.94s/it][A

Found 1 .tex files in 1805.11109
They are: ['../data/papers/1805.11109/gtam_v3p2.tex']
Using ../data/papers/1805.11109/gtam_v3p2.tex as the main .tex file



 94%|████████████████████████████████████████████████████▊   | 50/53 [21:39<00:58, 19.56s/it][A

Found 1 .tex files in 1802.00008
They are: ['../data/papers/1802.00008/jettopics.tex']
Using ../data/papers/1802.00008/jettopics.tex as the main .tex file



 96%|█████████████████████████████████████████████████████▉  | 51/53 [21:41<00:28, 14.24s/it][A

Found 1 .tex files in 1805.11622
They are: ['../data/papers/1805.11622/softdropisolation_v2.tex']
Using ../data/papers/1805.11622/softdropisolation_v2.tex as the main .tex file



 98%|██████████████████████████████████████████████████████▉ | 52/53 [21:56<00:14, 14.33s/it][A

Found 1 .tex files in 1804.03657
They are: ['../data/papers/1804.03657/rsd_v2.tex']
Using ../data/papers/1804.03657/rsd_v2.tex as the main .tex file



100%|████████████████████████████████████████████████████████| 53/53 [21:58<00:00, 24.88s/it][A

Found 1 .tex files in 1809.01140
They are: ['../data/papers/1809.01140/defineqg.tex']
Using ../data/papers/1809.01140/defineqg.tex as the main .tex file





In [44]:
# Prune the papers directory, keeping only files with these extensions
keep_extensions = ['.pdf', '.tex']
delete_files_except_extensions(pdf_dir, keep_extensions)

In [45]:
# Collect the TeX and PDF files found under the papers directory
paper_extensions = ['.tex', '.pdf']
filenames = get_filenames_with_extensions(pdf_dir, paper_extensions)

In [46]:
# Parallel lists: text[k] holds the extracted content of one paper and
# source_type[k] its label. Both are appended together so they stay aligned.
source_type = []
text = []

for file in tqdm(filenames):
    path = os.path.join(pdf_dir, file)
    extension = os.path.splitext(file)[-1]
    try:
        if extension == '.pdf':
            # Concatenate the text of every chunk the PDF loader returns
            pages = PyPDFLoader(path).load_and_split()
            content = ''.join(page.page_content for page in pages)
        elif extension == '.tex':
            # iso-8859-1 maps every byte to a character, so decoding cannot fail
            with open(path, 'r', encoding='iso-8859-1') as f:
                content = remove_latex_preamble(f.read())
        else:
            # Neither PDF nor TeX: nothing to extract
            continue
    except Exception as err:
        # Best-effort: report the failing file with the reason and move on.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel
        # interrupts stop the loop instead of being swallowed.
        print("Error with file {}: {}".format(file, err))
        continue

    text.append(content)
    source_type.append("paper")


  0%|                                                                 | 0/53 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████| 53/53 [00:14<00:00,  3.63it/s][A


In [49]:
# Column names for the DataFrame
columns = ['source_type', 'text']

# Stack the two parallel lists, then flip them so that every row
# reads [source_type, text]
data = [source_type, text]
transposed_data = [list(row) for row in zip(*data)]

# Build the DataFrame from the row-wise records
df = pd.DataFrame(data=transposed_data, columns=columns)

In [50]:
# Pass the new rows through update_dataframe together with the CSV path,
# then write the returned frame back to that same CSV (without the index)
csv_path = '{}/df_text.csv'.format(db_dir)
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

## 2. YouTube videos

In [51]:
# from tqdm import tqdm
# import whisper
# import pytube
# from pathlib import Path
# import subprocess
# import numpy as np

In [52]:
# videos_dir = "../data/videos/"

# try:
#     os.makedirs(videos_dir)
# except FileExistsError:
#     print(f"Directory '{videos_dir}' already exists")

In [53]:
# # Get whisper model; download weights if necessary
# whisper_model = whisper.load_model("tiny.en").to('cpu')
# options = whisper.DecodingOptions(language="en", without_timestamps=True)

# url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
# pytube_vid = pytube.YouTube(url)
# video_path_local = Path(videos_dir).resolve() / (pytube_vid.video_id+".mp4")
# pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)
# video_path_local = video_path_local.with_suffix(".wav")
# result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])
# transcription = whisper.transcribe(whisper_model, str(video_path_local))

## 3. Interviews

In [54]:
# Collect the .txt files found under the interviews directory
interview_extensions = ['.txt']
filenames = get_filenames_with_extensions(txt_dir, interview_extensions)

In [55]:
# Parallel lists: text[k] holds one transcript and source_type[k] its label.
source_type = []
text = []

for file in tqdm(filenames):
    try:
        # NOTE(review): no encoding is given, so the platform default is
        # used - confirm it matches how the transcripts were saved.
        with open("{}/{}".format(txt_dir, file), 'r') as f:
            content = f.read()
    except Exception as err:
        # Best-effort: report the failing file with the reason and move on.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel
        # interrupts stop the loop instead of being swallowed.
        print("Error with file {}: {}".format(file, err))
        continue

    text.append(content)
    source_type.append("interview")


100%|████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 2431.95it/s][A


In [56]:
# Column names for the DataFrame
columns = ['source_type', 'text']

# Stack the two parallel lists, then flip them so that every row
# reads [source_type, text]
data = [source_type, text]
transposed_data = [list(row) for row in zip(*data)]

# Build the DataFrame from the row-wise records
df = pd.DataFrame(data=transposed_data, columns=columns)

# Pass the new rows through update_dataframe together with the CSV path,
# then write the returned frame back to that same CSV (without the index)
csv_path = '{}/df_text.csv'.format(db_dir)
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)


## 4. Website/CV

In [57]:
# Pages of the website to scrape, one entry per URL
websites = [
    "https://jthaler.net/group",
    "https://jthaler.net/research",
    "https://jthaler.net/engagement",
    "https://jthaler.net/faq",
    "https://jthaler.net/cv",
    "https://jthaler.net/contact",
]

# One scrape result per URL, in the same order as `websites`
text_website = [scrape_website_text(url) for url in tqdm(websites)]


  0%|                                                                  | 0/6 [00:00<?, ?it/s][A
 17%|█████████▋                                                | 1/6 [00:00<00:02,  1.88it/s][A
 33%|███████████████████▎                                      | 2/6 [00:01<00:02,  1.79it/s][A
 50%|█████████████████████████████                             | 3/6 [00:01<00:01,  1.98it/s][A
 67%|██████████████████████████████████████▋                   | 4/6 [00:02<00:01,  1.90it/s][A
 83%|████████████████████████████████████████████████▎         | 5/6 [00:02<00:00,  1.67it/s][A
100%|██████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.97it/s][A


In [58]:
# Column names for the DataFrame
columns = ['source_type', 'text']

# Label every scraped page as "website", then flip the two parallel lists
# so that every row reads [source_type, text]
website_labels = ["website"] * len(text_website)
data = [website_labels, text_website]
transposed_data = [list(row) for row in zip(*data)]

# Build the DataFrame from the row-wise records
df = pd.DataFrame(data=transposed_data, columns=columns)

# Pass the new rows through update_dataframe together with the CSV path,
# then write the returned frame back to that same CSV (without the index)
csv_path = '{}/df_text.csv'.format(db_dir)
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

In [59]:
# Row count of the frame = number of context objects (papers, sites, etc)
df.shape[0]

65