In [56]:
import pandas as pd
import glob
from datetime import datetime

from bs4 import BeautifulSoup

In [57]:
INPUT_PATH = "data/isw/raw"
OUTPUT_PATH = "data/isw/reports.csv"
BASE_URL = "https://www.understandingwar.org"

In [58]:
reports = glob.glob(f"{INPUT_PATH}/*.html")

In [59]:
data = []

for file in reports:
    file_name_parts = file.split("/")[-1].split("__")
    date = datetime.strptime(file_name_parts[0], "%d_%m_%Y")
    
    with open(file, "r") as cfile:
        html = BeautifulSoup(cfile.read())
        page_title = html.head.find("title").text
        url = BASE_URL + html.head.find("link", attrs={"rel": "canonical"}, href=True).attrs["href"]
        
        content_title = html.body.find("h1", attrs={"id": "page-title"}).text
        content_html = html.body.find("div", attrs={"class": "field-type-text-with-summary"})
        
        d = {
            "date": date,
            "url": url,
            "page_title": page_title,
            "content_title": content_title,
            "content_html": content_html
        }
        
        data.append(d)

In [60]:
df = pd.DataFrame.from_dict(data).sort_values(by=['date'])

In [61]:
df.head(10)

Unnamed: 0,date,url,page_title,content_title,content_html
138,2022-02-25,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"[[[ , <p align=""center"" style=""text-align: lef..."
126,2022-02-26,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"[[[ , <p align=""center"" style=""text-align: lef..."
119,2022-02-27,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"[[[ , <p style=""text-align: left;""><strong><sp..."
11,2022-02-28,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","[[[ , <p align=""center"" style=""text-align: lef..."
227,2022-03-01,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","[[[ , <p align=""center"" style=""text-align: lef..."
199,2022-03-02,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2","[[[ , <p align=""center"" style=""text-align: lef..."
213,2022-03-03,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 3...","Russian Offensive Campaign Assessment, March 3","[[[ , <p align=""center"" style=""text-align: lef..."
195,2022-03-04,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 4...","Russian Offensive Campaign Assessment, March 4","[[[ , <p align=""center"" style=""text-align: lef..."
179,2022-03-05,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 5...","Russian Offensive Campaign Assessment, March 5","[[[ , <p align=""center"" style=""text-align: lef..."
160,2022-03-06,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 6...","Russian Offensive Campaign Assessment, March 6","[[[ , <p align=""center"" style=""text-align: lef..."


In [62]:
df.to_csv(f"{OUTPUT_PATH}", sep=";", index=False)