In [3]:
import os
import sys
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from utils.arxiv_utils import get_inspire_hep_papers, extract_arxiv_ids, download_arxiv_source, remove_latex_preamble
from utils.db_utils import update_dataframe, delete_files_except_extensions, get_filenames_with_extensions, scrape_website_text

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Storage locations used by the cells below, given relative to the notebook.
# Each value keeps its trailing slash.
pdf_dir, db_dir, txt_dir = (
    '../data/papers/',      # downloaded papers (TeX sources or PDFs)
    '../data/db/',          # database CSVs
    '../data/interviews/',  # interview transcripts
)

## 1. Papers

In [5]:
# Make sure the download directory exists. exist_ok=True makes the cell safe to
# re-run, so no try/except around FileExistsError is needed (and nothing is
# printed when the directory is already there).
os.makedirs(pdf_dir, exist_ok=True)

# Get papers from INSPIRE-HEP, with a year cutoff
papers = get_inspire_hep_papers("Jesse.Thaler.1", year_cutoff=2018)

# Extract arXiv IDs from papers, then append two IDs by hand.
# NOTE(review): presumably these two are older papers that the year cutoff
# would otherwise exclude -- confirm.
arxiv_ids = extract_arxiv_ids(papers) + ["1402.2657", "1011.2268"]

# Download papers (sources if available, otherwise PDFs).
# A plain loop is used because the call is made for its side effect only;
# no throwaway list of return values is built and no trailing `;` is needed.
for arxiv_id in tqdm(arxiv_ids):
    download_arxiv_source(arxiv_id, output_dir=pdf_dir)

Directory '../data/papers/' already exists


  2%|█                                                        | 1/55 [00:10<09:51, 10.95s/it]

Found 1 .tex files in 1908.08542
They are: ['../data/papers/1908.08542/mod_emd.tex']
Using ../data/papers/1908.08542/mod_emd.tex as the main .tex file


  4%|██                                                       | 2/55 [00:13<05:28,  6.20s/it]

Found 1 .tex files in 2004.04159
They are: ['../data/papers/2004.04159/eventgeometry.tex']
Using ../data/papers/2004.04159/eventgeometry.tex as the main .tex file


  5%|███                                                      | 3/55 [00:16<03:48,  4.39s/it]

Found 1 .tex files in 2101.07263
They are: ['../data/papers/2101.07263/main.tex']
Using ../data/papers/2101.07263/main.tex as the main .tex file


  7%|████▏                                                    | 4/55 [00:16<02:30,  2.95s/it]

Found 1 .tex files in 1810.11032
They are: ['../data/papers/1810.11032/NavigatingCollinearSuperspace_v3.tex']
Using ../data/papers/1810.11032/NavigatingCollinearSuperspace_v3.tex as the main .tex file


  9%|█████▏                                                   | 5/55 [00:17<01:49,  2.18s/it]

Found 1 .tex files in 1911.04491
They are: ['../data/papers/1911.04491/efms.tex']
Using ../data/papers/1911.04491/efms.tex as the main .tex file


 11%|██████▏                                                  | 6/55 [00:24<03:01,  3.71s/it]

Found 1 .tex files in 2008.08596
They are: ['../data/papers/2008.08596/topics_draft.tex']
Using ../data/papers/2008.08596/topics_draft.tex as the main .tex file


 13%|███████▎                                                 | 7/55 [00:32<04:00,  5.02s/it]

Found 1 .tex files in 2112.05722
They are: ['../data/papers/2112.05722/main.tex']
Using ../data/papers/2112.05722/main.tex as the main .tex file


 15%|████████▎                                                | 8/55 [00:48<06:44,  8.61s/it]

Found 1 .tex files in 2205.02857
They are: ['../data/papers/2205.02857/celestial_nongaussianity.tex']
Using ../data/papers/2205.02857/celestial_nongaussianity.tex as the main .tex file


 16%|█████████▎                                               | 9/55 [00:57<06:38,  8.67s/it]

Found 1 .tex files in 1902.02346
They are: ['../data/papers/1902.02346/emd.tex']
Using ../data/papers/1902.02346/emd.tex as the main .tex file


 18%|██████████▏                                             | 10/55 [01:14<08:28, 11.29s/it]

Found 2 .tex files in 2205.06818
They are: ['../data/papers/2205.06818/main.tex', '../data/papers/2205.06818/Resummation EFPs/main.tex']
Using ../data/papers/2205.06818/main.tex as the main .tex file


 20%|███████████▏                                            | 11/55 [01:19<06:51,  9.36s/it]

Found 1 .tex files in 1911.09107
They are: ['../data/papers/1911.09107/main.tex']
Using ../data/papers/1911.09107/main.tex as the main .tex file


 22%|████████████▏                                           | 12/55 [01:25<05:55,  8.27s/it]

Found 1 .tex files in 1901.10652
They are: ['../data/papers/1901.10652/ABRA_Technical_PRD_2019_singleFile.tex']
Using ../data/papers/1901.10652/ABRA_Technical_PRD_2019_singleFile.tex as the main .tex file


 24%|█████████████▏                                          | 13/55 [01:25<04:11,  5.99s/it]

Found 2 .tex files in 1908.08949
They are: ['../data/papers/1908.08949/quantum_thrust_arXiv_v2.tex', '../data/papers/1908.08949/Qcircuit.tex']
Using ../data/papers/1908.08949/quantum_thrust_arXiv_v2.tex as the main .tex file


 25%|██████████████▎                                         | 14/55 [01:26<03:02,  4.45s/it]

Found 3 .tex files in 2003.07868
They are: ['../data/papers/2003.07868/jdefs.tex', '../data/papers/2003.07868/report.tex', '../data/papers/2003.07868/authors_all_alphabetical.tex']
Using ../data/papers/2003.07868/report.tex as the main .tex file


 27%|███████████████▎                                        | 15/55 [01:30<02:53,  4.34s/it]

Found 1 .tex files in 2211.13519
They are: ['../data/papers/2211.13519/main.tex']
Using ../data/papers/2211.13519/main.tex as the main .tex file


 29%|████████████████▎                                       | 16/55 [01:34<02:36,  4.01s/it]

Found 1 .tex files in 1906.00489
They are: ['../data/papers/1906.00489/ridgepaperALEPH.tex']
Using ../data/papers/1906.00489/ridgepaperALEPH.tex as the main .tex file


 31%|█████████████████▎                                      | 17/55 [01:36<02:12,  3.47s/it]

Found 1 .tex files in 1810.12257
They are: ['../data/papers/1810.12257/ABRA_PRL_2018_singleFile.tex']
Using ../data/papers/1810.12257/ABRA_PRL_2018_singleFile.tex as the main .tex file


 33%|██████████████████▎                                     | 18/55 [01:38<01:53,  3.07s/it]

Found 1 .tex files in 1902.04222
They are: ['../data/papers/1902.04222/mumu_v2p2.tex']
Using ../data/papers/1902.04222/mumu_v2p2.tex as the main .tex file


 35%|███████████████████▎                                    | 19/55 [01:44<02:24,  4.01s/it]

Found 1 .tex files in 1812.05111
They are: ['../data/papers/1812.05111/quantile_v2.tex']
Using ../data/papers/1812.05111/quantile_v2.tex as the main .tex file


 36%|████████████████████▎                                   | 20/55 [01:51<02:50,  4.86s/it]

Found 2 .tex files in 2109.06065
They are: ['../data/papers/2109.06065/main.tex', '../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex']
Using ../data/papers/2109.06065/UnveilingHiddenPhysicsAtTheLHC _v1/main.tex as the main .tex file


 38%|█████████████████████▍                                  | 21/55 [01:52<02:07,  3.74s/it]

Found 1 .tex files in 2109.13243
They are: ['../data/papers/2109.13243/paper.tex']
Using ../data/papers/2109.13243/paper.tex as the main .tex file


 40%|██████████████████████▍                                 | 22/55 [01:53<01:35,  2.89s/it]

Found 1 .tex files in 2205.03413
They are: ['../data/papers/2205.03413/main.tex']
Using ../data/papers/2205.03413/main.tex as the main .tex file


 42%|███████████████████████▍                                | 23/55 [01:55<01:25,  2.66s/it]

Found 1 .tex files in 2107.08979
They are: ['../data/papers/2107.08979/main.tex']
Using ../data/papers/2107.08979/main.tex as the main .tex file


 44%|████████████████████████▍                               | 24/55 [02:13<03:46,  7.29s/it]

Found 1 .tex files in 2205.10375
They are: ['../data/papers/2205.10375/qubo_lzero.tex']
Using ../data/papers/2205.10375/qubo_lzero.tex as the main .tex file


 45%|█████████████████████████▍                              | 25/55 [02:15<02:46,  5.57s/it]

Found 1 .tex files in 2201.07800
They are: ['../data/papers/2201.07800/EEC_opendata_v6.tex']
Using ../data/papers/2201.07800/EEC_opendata_v6.tex as the main .tex file


 47%|██████████████████████████▍                             | 26/55 [02:16<02:03,  4.27s/it]

Found 1 .tex files in 2010.11998
They are: ['../data/papers/2010.11998/main_v2_arXiv.tex']
Using ../data/papers/2010.11998/main_v2_arXiv.tex as the main .tex file


 49%|███████████████████████████▍                            | 27/55 [02:43<05:12, 11.17s/it]

Found 1 .tex files in 2205.04459
They are: ['../data/papers/2205.04459/open_data_topics_prd.tex']
Using ../data/papers/2205.04459/open_data_topics_prd.tex as the main .tex file


 51%|████████████████████████████▌                           | 28/55 [02:44<03:40,  8.16s/it]

Found 1 .tex files in 1909.00009
They are: ['../data/papers/1909.00009/Circumnavigating_Collinear_Superspace_arXivv3.tex']
Using ../data/papers/1909.00009/Circumnavigating_Collinear_Superspace_arXivv3.tex as the main .tex file


 53%|█████████████████████████████▌                          | 29/55 [02:46<02:42,  6.23s/it]

Found 1 .tex files in 2205.02814
They are: ['../data/papers/2205.02814/quantum_thrust.tex']
Using ../data/papers/2205.02814/quantum_thrust.tex as the main .tex file


 55%|██████████████████████████████▌                         | 30/55 [02:53<02:38,  6.36s/it]

Found 1 .tex files in 2111.09914
They are: ['../data/papers/2111.09914/ridgepaperALEPH.tex']
Using ../data/papers/2111.09914/ridgepaperALEPH.tex as the main .tex file


 56%|███████████████████████████████▌                        | 31/55 [02:54<01:56,  4.84s/it]

Found 1 .tex files in 1809.01656
They are: ['../data/papers/1809.01656/axion_interferometry.tex']
Using ../data/papers/1809.01656/axion_interferometry.tex as the main .tex file


 58%|████████████████████████████████▌                       | 32/55 [02:55<01:23,  3.62s/it]

Found 1 .tex files in 2205.05084
They are: ['../data/papers/2205.05084/main.tex']
Using ../data/papers/2205.05084/main.tex as the main .tex file


 60%|█████████████████████████████████▌                      | 33/55 [02:57<01:07,  3.06s/it]

Found 12 .tex files in 2203.08805
They are: ['../data/papers/2203.08805/main.tex', '../data/papers/2203.08805/Sections/challenges.tex', '../data/papers/2203.08805/Sections/classification.tex', '../data/papers/2203.08805/Sections/data_generation.tex', '../data/papers/2203.08805/Sections/outlook.tex', '../data/papers/2203.08805/Sections/extra_text.tex', '../data/papers/2203.08805/Sections/object_reconstruction.tex', '../data/papers/2203.08805/Sections/quantum_computing.tex', '../data/papers/2203.08805/Sections/info_processing.tex', '../data/papers/2203.08805/Sections/VQCcircuit.tex', '../data/papers/2203.08805/Sections/introduction.tex', '../data/papers/2203.08805/Sections/quantum_inspired_algos.tex']
Using ../data/papers/2203.08805/Sections/classification.tex as the main .tex file


 62%|██████████████████████████████████▌                     | 34/55 [03:04<01:34,  4.49s/it]

Found 4 .tex files in 2301.08128
They are: ['../data/papers/2301.08128/epic.tex', '../data/papers/2301.08128/SciPostPhys_Template.tex', '../data/papers/2301.08128/old_JHEP/_paper.tex', '../data/papers/2301.08128/old_JHEP/jhepexample.tex']
Using ../data/papers/2301.08128/epic.tex as the main .tex file


 64%|███████████████████████████████████▋                    | 35/55 [03:25<03:03,  9.19s/it]

Found 55 .tex files in 1803.07977
They are: ['../data/papers/1803.07977/LH17_intro.tex', '../data/papers/1803.07977/LH17.tex', '../data/papers/1803.07977/LH17_Conveners.tex', '../data/papers/1803.07977/LH17_authors.tex', '../data/papers/1803.07977/LH17_macros.tex', '../data/papers/1803.07977/Higgs_vbf_nnlo/vbf_nnlo.main.tex', '../data/papers/1803.07977/SM_Higgs_jet_R/Higgs_jet_R.main.tex', '../data/papers/1803.07977/SM_ewmerging_ttbar/ewmerging_ttbar.main.tex', '../data/papers/1803.07977/Higgs_twojet_VBF/twojet_VBF.main.tex', '../data/papers/1803.07977/MC_WWbb/wwbb_macros.tex', '../data/papers/1803.07977/MC_WWbb/WWbb.main.tex', '../data/papers/1803.07977/SM_loopnumerical/loopnumerical.main.tex', '../data/papers/1803.07977/Higgs_STXS/STXS.main.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.setup.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.MadGraphaMCNLO.tex', '../data/papers/1803.07977/SM_ew_comparison/ew_comparison.introduction.tex', '../data/papers

 65%|████████████████████████████████████▋                   | 36/55 [03:25<02:07,  6.69s/it]

Found 2 .tex files in 2105.04448
They are: ['../data/papers/2105.04448/math_commands.tex', '../data/papers/2105.04448/iclr2021_conference.tex']
Using ../data/papers/2105.04448/iclr2021_conference.tex as the main .tex file


 67%|█████████████████████████████████████▋                  | 37/55 [04:10<05:23, 17.96s/it]

Found 42 .tex files in 2203.07622
They are: ['../data/papers/2203.07622/ILC2022report.tex', '../data/papers/2203.07622/authors/AuthorAddresses.tex', '../data/papers/2203.07622/authors/mymacros.tex', '../data/papers/2203.07622/authors/AuthorMacros.tex', '../data/papers/2203.07622/authors/Authors.tex', '../data/papers/2203.07622/chapters/gen-phys/gen-phys.tex', '../data/papers/2203.07622/chapters/gen-phys/figures/gen-phys.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wmass.tex', '../data/papers/2203.07622/chapters/PEW/PEW-WZmasses.tex', '../data/papers/2203.07622/chapters/PEW/PEW-summary.tex', '../data/papers/2203.07622/chapters/PEW/PEW-Wbfs.tex', '../data/papers/2203.07622/chapters/PEW/PEW-intro.tex', '../data/papers/2203.07622/chapters/PEW/PEW.tex', '../data/papers/2203.07622/chapters/PEW/PEW-ff.tex', '../data/papers/2203.07622/chapters/PEW/PEW-radreturn.tex', '../data/papers/2203.07622/chapters/farfuture/multi10.tex', '../data/papers/2203.07622/chapters/farfuture/farfuture.tex', '

 69%|██████████████████████████████████████▋                 | 38/55 [04:16<04:04, 14.37s/it]

Found 5 .tex files in 2212.10659
They are: ['../data/papers/2212.10659/main.tex', '../data/papers/2212.10659/params_table.tex', '../data/papers/2212.10659/table.tex', '../data/papers/2212.10659/latent_table.tex', '../data/papers/2212.10659/timer.tex']
Using ../data/papers/2212.10659/main.tex as the main .tex file


 71%|███████████████████████████████████████▋                | 39/55 [04:19<02:57, 11.09s/it]

Found 1 .tex files in 2302.12266
They are: ['../data/papers/2302.12266/main.tex']
Using ../data/papers/2302.12266/main.tex as the main .tex file


 73%|████████████████████████████████████████▋               | 40/55 [04:25<02:21,  9.44s/it]

Found 3 .tex files in 2203.07460
They are: ['../data/papers/2203.07460/include_shortcuts.tex', '../data/papers/2203.07460/include_settings.tex', '../data/papers/2203.07460/scipost_new.tex']
Using ../data/papers/2203.07460/scipost_new.tex as the main .tex file


 75%|█████████████████████████████████████████▋              | 41/55 [04:25<01:35,  6.84s/it]

Found 3 .tex files in 2209.07559
They are: ['../data/papers/2209.07559/main.tex', '../data/papers/2209.07559/settings-scipost.tex', '../data/papers/2209.07559/shortcuts.tex']
Using ../data/papers/2209.07559/main.tex as the main .tex file


 78%|███████████████████████████████████████████▊            | 43/55 [05:28<03:26, 17.22s/it]

Found 1 .tex files in 2004.06125
They are: ['../data/papers/2004.06125/emd_v3.tex']
Using ../data/papers/2004.06125/emd_v3.tex as the main .tex file


 80%|████████████████████████████████████████████▊           | 44/55 [05:36<02:41, 14.70s/it]

Found 1 .tex files in 1712.07124
They are: ['../data/papers/1712.07124/energyflow.tex']
Using ../data/papers/1712.07124/energyflow.tex as the main .tex file


 82%|█████████████████████████████████████████████▊          | 45/55 [05:49<02:19, 13.99s/it]

Found 1 .tex files in 1810.05165
They are: ['../data/papers/1810.05165/efns.tex']
Using ../data/papers/1810.05165/efns.tex as the main .tex file


 84%|██████████████████████████████████████████████▊         | 46/55 [05:49<01:29,  9.96s/it]

Found 1 .tex files in 2007.11586
They are: ['../data/papers/2007.11586/main.tex']
Using ../data/papers/2007.11586/main.tex as the main .tex file


 85%|███████████████████████████████████████████████▊        | 47/55 [05:55<01:10,  8.80s/it]

Found 1 .tex files in 2101.08320
They are: ['../data/papers/2101.08320/merge_rewrite.tex']
Using ../data/papers/2101.08320/merge_rewrite.tex as the main .tex file


 87%|████████████████████████████████████████████████▊       | 48/55 [06:04<01:00,  8.66s/it]

Found 1 .tex files in 1805.11109
They are: ['../data/papers/1805.11109/gtam_v3p2.tex']
Using ../data/papers/1805.11109/gtam_v3p2.tex as the main .tex file


 89%|█████████████████████████████████████████████████▉      | 49/55 [06:06<00:40,  6.67s/it]

Found 1 .tex files in 1709.08705
They are: ['../data/papers/1709.08705/trackObs.tex']
Using ../data/papers/1709.08705/trackObs.tex as the main .tex file


 91%|██████████████████████████████████████████████████▉     | 50/55 [06:07<00:24,  4.97s/it]

Found 1 .tex files in 1805.11622
They are: ['../data/papers/1805.11622/softdropisolation_v2.tex']
Using ../data/papers/1805.11622/softdropisolation_v2.tex as the main .tex file


 93%|███████████████████████████████████████████████████▉    | 51/55 [06:09<00:16,  4.08s/it]

Found 1 .tex files in 1802.00008
They are: ['../data/papers/1802.00008/jettopics.tex']
Using ../data/papers/1802.00008/jettopics.tex as the main .tex file


 95%|████████████████████████████████████████████████████▉   | 52/55 [06:11<00:10,  3.56s/it]

Found 1 .tex files in 1809.01140
They are: ['../data/papers/1809.01140/defineqg.tex']
Using ../data/papers/1809.01140/defineqg.tex as the main .tex file


 96%|█████████████████████████████████████████████████████▉  | 53/55 [06:18<00:09,  4.55s/it]

Found 1 .tex files in 1804.03657
They are: ['../data/papers/1804.03657/rsd_v2.tex']
Using ../data/papers/1804.03657/rsd_v2.tex as the main .tex file


 98%|██████████████████████████████████████████████████████▉ | 54/55 [06:19<00:03,  3.49s/it]

Found 1 .tex files in 1402.2657
They are: ['../data/papers/1402.2657/softdrop_paper.tex']
Using ../data/papers/1402.2657/softdrop_paper.tex as the main .tex file


100%|████████████████████████████████████████████████████████| 55/55 [06:20<00:00,  6.92s/it]

Found 1 .tex files in 1011.2268
They are: ['../data/papers/1011.2268/paper.tex']
Using ../data/papers/1011.2268/paper.tex as the main .tex file





In [6]:
# Delete all files except PDFs and TeX files from the papers directory
# (the list of remaining files is loaded in the next cell).
# NOTE(review): this is destructive; presumably it also descends into the
# per-paper subdirectories under pdf_dir -- confirm in utils.db_utils before re-running.
delete_files_except_extensions(pdf_dir, ['.pdf', '.tex'])

In [7]:
# Collect the .tex and .pdf files found under the papers directory.
# NOTE(review): the entries are later joined onto pdf_dir, so they are
# presumably paths relative to it -- confirm.
filenames = get_filenames_with_extensions(pdf_dir, ['.tex', '.pdf'])

In [8]:
# Get text.
# Two parallel lists are filled: text[k] is the content of the k-th file that
# was read successfully and source_type[k] is its label (always "paper" here).
source_type = []
text = []

for file in tqdm(filenames):
    try:
        path = os.path.join(pdf_dir, file)
        extension = os.path.splitext(file)[-1]

        if extension == '.pdf':
            # Concatenate the text of every piece the loader returns into one string.
            pages = PyPDFLoader(path).load_and_split()
            content = ''.join(page.page_content for page in pages)
        elif extension == '.tex':
            # iso-8859-1 maps every byte to a character, so decoding never raises.
            # NOTE(review): UTF-8 sources will have their non-ASCII characters
            # mis-decoded -- confirm this is acceptable downstream.
            with open(path, 'r', encoding='iso-8859-1') as f:
                content = remove_latex_preamble(f.read())
        else:
            # Files with any other extension are ignored.
            continue
    except Exception as err:
        # Best effort: report the failure (with its cause) and move on.
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt and SystemExit stop the cell as expected.
        print("Error with file {}: {!r}".format(file, err))
        continue

    # Append to both lists together so they always stay the same length.
    text.append(content)
    source_type.append("paper")

100%|████████████████████████████████████████████████████████| 55/55 [00:14<00:00,  3.88it/s]


In [9]:
# Create the DataFrame directly from the two parallel lists, one column each.
# Passing a dict makes pandas raise if the lists differ in length, whereas the
# earlier zip-based transpose would silently drop the unmatched tail.
df = pd.DataFrame({'source_type': source_type, 'text': text})

In [10]:
# Update dataframe and write it back to the same CSV.
# NOTE(review): update_dataframe comes from utils.db_utils; presumably it
# combines `df` with whatever is already stored at the given path -- confirm.
os.makedirs(db_dir, exist_ok=True)  # to_csv does not create missing directories
csv_path = os.path.join(db_dir, 'df_text.csv')
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

## 2. YouTube videos

In [11]:
# from tqdm import tqdm
# import whisper
# import pytube
# from pathlib import Path
# import subprocess
# import numpy as np

In [12]:
# videos_dir = "../data/videos/"

# try:
#     os.makedirs(videos_dir)
# except FileExistsError:
#     print(f"Directory '{videos_dir}' already exists")

In [13]:
# # Get whisper model; download weights if necessary
# whisper_model = whisper.load_model("tiny.en").to('cpu')
# options = whisper.DecodingOptions(language="en", without_timestamps=True)

# url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
# pytube_vid = pytube.YouTube(url)
# video_path_local = Path(videos_dir).resolve() / (pytube_vid.video_id+".mp4")
# pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)
# video_path_local = video_path_local.with_suffix(".wav")
# result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])
# transcription = whisper.transcribe(whisper_model, str(video_path_local))

## 3. Interviews

In [14]:
# List the interview transcript files (.txt) found under txt_dir.
# This rebinds `filenames`, which earlier held the paper files.
filenames = get_filenames_with_extensions(txt_dir, ['.txt'])

In [15]:
# Read every interview transcript into two parallel lists: text[k] is the
# content of the k-th file read successfully, source_type[k] its label.
source_type = []
text = []

for file in tqdm(filenames):
    try:
        # Explicit encoding so the result does not depend on the platform default.
        # NOTE(review): assumes the transcripts are UTF-8 -- confirm.
        with open(os.path.join(txt_dir, file), 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as err:
        # Best effort: report the failure (with its cause) and move on.
        # Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt and SystemExit stop the cell as expected.
        print("Error with file {}: {!r}".format(file, err))
        continue

    # Append to both lists together so they always stay the same length.
    text.append(content)
    source_type.append("interview")

100%|████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3197.92it/s]


In [16]:
# Create the DataFrame directly from the two parallel lists, one column each.
# Passing a dict makes pandas raise if the lists differ in length, whereas the
# earlier zip-based transpose would silently drop the unmatched tail.
df = pd.DataFrame({'source_type': source_type, 'text': text})

# Update dataframe and write it back to the same CSV.
# NOTE(review): update_dataframe comes from utils.db_utils; presumably it
# combines `df` with whatever is already stored at the given path -- confirm.
os.makedirs(db_dir, exist_ok=True)  # to_csv does not create missing directories
csv_path = os.path.join(db_dir, 'df_text.csv')
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

## 4. Website/CV

In [17]:
# Pages to scrape, in the order their text will appear in text_website.
websites = [
    "https://jthaler.net/group",
    "https://jthaler.net/research",
    "https://jthaler.net/engagement",
    "https://jthaler.net/faq",
    "https://jthaler.net/cv",
    "https://jthaler.net/contact",
]

# First pass: fetch the text of every page.
scraped_pages = []
for website in tqdm(websites):
    scraped_pages.append(scrape_website_text(website))

# Second pass: turn newlines into commas, split on commas, drop the empty
# pieces and glue what is left back together with ", ".
text_website = []
for raw_page in scraped_pages:
    pieces = [piece for piece in raw_page.replace("\n", ",").split(",") if piece]
    text_website.append(", ".join(pieces))

100%|██████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.22it/s]


In [19]:
# Create the DataFrame directly: one row per scraped page, all labelled "website".
# The dict constructor replaces the list-of-lists transpose and its unused
# intermediate variables.
df = pd.DataFrame({
    'source_type': ["website"] * len(text_website),
    'text': text_website,
})

# Update dataframe and write it back to the same CSV.
# NOTE(review): update_dataframe comes from utils.db_utils; presumably it
# combines `df` with whatever is already stored at the given path -- confirm.
os.makedirs(db_dir, exist_ok=True)  # to_csv does not create missing directories
csv_path = os.path.join(db_dir, 'df_text.csv')
df = update_dataframe(csv_path, df)
df.to_csv(csv_path, index=False)

In [20]:
# Row count of the table returned by the most recent update_dataframe call.
# NOTE(review): presumably one row per context object accumulated in
# df_text.csv (papers, interviews, website pages) -- confirm.
len(df)  # Number of context objects (papers, sites, etc)

68