# Convert NYT to json

In [1]:
# Import
import tarfile
import os
import shutil
import glob
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# Extract a tar archive (e.g. a .tgz file), return the output folder name
def extract_zip(zip_name):
  """Extract a tar archive into a folder named after the archive.

  The output folder is the archive path without its extension
  (``data/corpus.tgz`` -> ``data/corpus``); a compound ``.tar.gz`` /
  ``.tar.bz2`` / ``.tar.xz`` suffix is removed as a whole. Any existing
  folder of that name is deleted first so stale files never survive a
  re-extraction.

  Args:
    zip_name: Path to a tar archive readable by ``tarfile.open``.

  Returns:
    The path of the folder the archive was extracted into.

  Raises:
    ValueError: If ``zip_name`` has no file extension, so no distinct
      output folder name can be derived from it.
  """
  # splitext only looks at the last path component, so dots in parent
  # directories (e.g. "./data/x.tgz" or "my.dir/x.tgz") are left alone.
  output_folder, extension = os.path.splitext(zip_name)
  if not extension:
    raise ValueError(
        "cannot derive an output folder from %r: no file extension" % (zip_name,))
  # "name.tar.gz" should extract to "name", not "name.tar".
  if extension.lower() in (".gz", ".bz2", ".xz") and output_folder.lower().endswith(".tar"):
    output_folder = output_folder[:-len(".tar")]
  # remove old output folder
  if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
  # The context manager closes the archive even if extraction fails.
  # NOTE: extractall trusts member paths; only use on trusted archives.
  with tarfile.open(zip_name) as archive:
    archive.extractall(output_folder)
  return output_folder

In [3]:
# Unpack the top-level corpus archive; main_folder is where it was extracted
main_folder = extract_zip(
  os.path.join("data", "nyt_corpus_LDC2008T19.tgz")
)

In [4]:
# Folder inside the extracted corpus that holds the per-year sub-folders
data_folder = os.path.join(
  main_folder,
  "nyt_corpus",
  "data",
)

In [5]:
# Sorted list of every nested archive one level below data_folder
# (data/<sub-folder>/<name>.tgz)
zip_files = sorted(glob.glob(os.path.join(data_folder, "*", "*.tgz")))

In [6]:
# Parallel accumulators: entry i of each list describes the same article
# (full-text markup, abstract markup, source XML path)
docs, summaries, paths = [], [], []

In [7]:
# Extract every nested archive and keep the articles that carry both an
# <abstract> element and a <block class="full_text"> element.
for zip_file in zip_files:
  month_f = extract_zip(zip_file)
  xmls = sorted(glob.glob(os.path.join(month_f, "*", "*", "*.xml")))
  for xml in xmls:
    root = ET.parse(xml).getroot()
    abstract = root.find(".//abstract")
    if abstract is None:
      # no abstract: the article is not recorded
      continue
    full_text_balise = root.find('.//block[@class="full_text"]')
    if full_text_balise is None:
      # no full text: the article is not recorded
      continue
    # Store the serialized XML markup of both elements; the three lists
    # are appended together so their indices stay aligned.
    paths.append(xml)
    summaries.append(ET.tostring(abstract, encoding="utf-8").decode("utf-8"))
    docs.append(ET.tostring(full_text_balise, encoding="utf-8").decode("utf-8"))

In [8]:
# Number of articles collected (those with both an abstract and a full-text block)
len(docs)

654872

In [9]:
# Gather the three parallel lists into one pandas DataFrame
# (columns: paths, docs, summaries)
df = pd.DataFrame(
  {
    "paths": paths,
    "docs": docs,
    "summaries": summaries,
  }
)

In [10]:
import re
from xml.sax.saxutils import unescape

# Matches any markup tag so that only the text content is kept
re_html = re.compile(r'<[^>]+>')


def _markup_to_text(markup):
  """Return the plain text of a serialized XML fragment.

  Tags are removed first and the XML entities are decoded afterwards:
  the serializer escaped "&", "<" and ">" in the text, so without this
  step the output would still contain "&amp;", "&lt;" and "&gt;".
  Decoding after stripping keeps an escaped "<...>" that was part of the
  text from being mistaken for a tag.
  """
  return unescape(re_html.sub('', markup))


df["docs"] = df["docs"].apply(_markup_to_text)
df["summaries"] = df["summaries"].apply(_markup_to_text)

In [11]:
# Write the DataFrame as JSON to "<main_folder>.json"
df.to_json(f"{main_folder}.json")