# NU.nl comment harvester

You need the Python library selenium (pip install selenium) and the geckodriver: https://github.com/mozilla/geckodriver/releases

Instructions:

1. collect the list of relevant urls in a text file (see below)
2. run the first code block to load library imports and read the url file
3. set the start id in the `for` loop of code block 6 (the first id is 0)
4. run code block 6 and wait
5. the comments are stored in a set of files `nunl-[category]-[article_id]-[index].csv`, where `[index]` is the position of the url in the url list

Collect list of relevant URLs:

1. run code block 3 to open https://www.nu.nl/tag/Coronavirus
2. run code block 4 to get the desired time period on the page
3. copy-paste from browser window to LibreOffice Writer
4. save as Flat ODT xml file (.fodt)
5. `grep -Eo "https://www.nu.nl/[A-Za-z\-]+/[0-9]+/" [copied file].fodt|sort|uniq > [url file].txt`

Regarding steps 3 and 4: this can also be done with Microsoft Word: save the document as a `docx` file, open that file in a file extraction program (e.g., WinZip) and locate the relevant `.xml` file. A selenium/xpath solution is probably also possible for steps 3-5.

## code block 1

In [1]:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from IPython.display import clear_output
import pandas as pd
import re
import time

def id_from_article_url(article_url):
    """Return the numeric article id embedded in a NU.nl article url.

    The id is the first run of digits that is directly followed by a slash,
    e.g. "6036016" in "https://www.nu.nl/coronavirus/6036016/title.html".

    Args:
        article_url: the article url as a string.

    Returns:
        The article id as a string of digits.

    Raises:
        ValueError: if the url contains no "<digits>/" segment.
    """
    res = re.search(r"([0-9]+)/", article_url)
    if res is None:
        # Fail with a message that names the offending url instead of the
        # opaque AttributeError raised by calling .group() on None.
        raise ValueError(f"no article id found in url: {article_url!r}")
    return res.group(1)

def category_from_article_url(article_url):
    """Return the category (first path segment) of a NU.nl article url.

    For "https://www.nu.nl/coronavirus/6036016/title.html" this is
    "coronavirus".

    Args:
        article_url: the article url as a string; it must start with
            "https://www.nu.nl/" and contain a slash after the category.

    Returns:
        The category segment as a string.

    Raises:
        ValueError: if the url does not have the expected form.
    """
    # The dots are escaped so that only the literal host name matches; an
    # unescaped "." would accept any character in those positions.
    res = re.search(r"^https://www\.nu\.nl/([^/]+)/", article_url)
    if res is None:
        # Fail with a message that names the offending url instead of the
        # opaque AttributeError raised by calling .group() on None.
        raise ValueError(f"no category found in url: {article_url!r}")
    return res.group(1)

def talk_url_from_id(article_id):
    """Build the talk.nu.nl embed-stream url for an article id.

    Args:
        article_id: the article id (string or number); it is converted with
            str() and placed in the url-encoded asset_url parameter.

    Returns:
        The complete embed-stream url as a string.
    """
    head = "https://talk.nu.nl/embed/stream?asset_url=https%3A%2F%2Fwww.nu.nl%2Fartikel%2F"
    tail = "%2Fredirect.html&initialWidth=601&childId=coral_talk_wrapper"
    return "".join((head, str(article_id), tail))

# Path of the geckodriver binary that is passed to webdriver.Firefox.
DRIVER = "/usr/bin/geckodriver"

# Text file with one article url per line.
URLFILE = "nunl-urls.txt"
# The context manager closes the file even when reading or decoding fails.
with open(URLFILE, encoding="utf8") as urls_from_file:
    urls_all = urls_from_file.read().splitlines()

## code block 2

In [2]:
# Download metadata: date, title, abstract, content
import json
import csv
from selenium.webdriver.firefox.options import Options

# Index in urls_all to start from. A value above 0 resumes an earlier run:
# the csv file is then opened in append mode and no header row is written.
start_id = 0 # was 1106
write_mode = 'a' if start_id > 0 else 'w'

options = Options()
options.headless = True

# Both output files are opened with a context manager so that they are
# flushed and closed even when a page fails to load or parse mid-run.
with open('nunl_meta.csv', write_mode, encoding = 'utf8', newline = '') as file_out, \
        open('nunl_nometa.txt', 'a') as na_out:
    meta_out = csv.writer(file_out,
                          delimiter = ',',
                          quotechar = '"',
                          doublequote = True,
                          quoting = csv.QUOTE_NONNUMERIC)
    if start_id == 0:
        meta_out.writerow(['url','date','title','abstract','body'])

    for i in range(start_id, len(urls_all)):
        url = urls_all[i]
        clear_output(wait=True)
        print(i, "/", len(urls_all)-1, url)
        # A fresh headless browser is started for every url.
        driver = webdriver.Firefox(options=options, executable_path=DRIVER)
        try:
            driver.get(url)
            driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
            node_type = driver.find_element_by_xpath("//head/meta[@name='og:type']")
            txt_type = node_type.get_attribute('content')
            if "video" in txt_type:
                # Pages whose og:type contains "video", e.g.
                # <meta name="og:type" content="video.other" />,
                # are only logged to nunl_nometa.txt; no csv row is written.
                na_out.write(url+"\n")
            else:
                node_title = driver.find_element_by_xpath("//head/meta[@name='title']")
                txt_title = node_title.get_attribute('content')
                node_abstract = driver.find_element_by_xpath("//head/meta[@name='description']")
                txt_abstract = node_abstract.get_attribute('content')
                node_date = driver.find_element_by_xpath("//head/meta[@name='article:published_time']")
                txt_date = node_date.get_attribute('content')
                print(txt_date,txt_title)
                # The article text is read from the "articleBody" field of the
                # first JSON-LD script element on the page.
                node_content = driver.find_element_by_xpath("//script[@type='application/ld+json']")
                json_content = node_content.get_attribute('innerHTML')
                json_dict = json.loads(json_content)
                meta_out.writerow([url, txt_date, txt_title, txt_abstract, json_dict["articleBody"]])
        finally:
            # Always shut the browser down, otherwise every failed url leaves
            # a Firefox/geckodriver process behind.
            driver.quit()

0 / 0 https://www.nu.nl/coronavirus/6036016/recordaantal-dagelijkse-besmettingen-in-brazilie.html
2020-03-09T07:33:14+01:00 Dagelijks bijna 300 nieuwe besmettingen in België


## code block 3 (target url collection)

In [None]:
# Show all articles from the Corona tag page: start Firefox (no headless
# option is passed here), load the tag page and scroll down by the full
# document height. The resulting `driver` is reused by the following cells.
WEBPAGE = "https://www.nu.nl/tag/Coronavirus"
driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(WEBPAGE)
driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")

## code block 4 (target url collection)

In [None]:
# Click 'show more articles' repeatedly so that the tag page lists older
# articles; the loop below performs at most 30 clicks, raise the range to go
# further back in time.
# Afterwards copy and paste to extract urls.
for i in range(30):
    clear_output(wait=True)
    print(i)
    # find_elements returns an empty list when the link is absent, so the
    # loop can stop cleanly instead of crashing with NoSuchElementException.
    morearticles_links = driver.find_elements_by_xpath("//a[starts-with(@class,'block-more__link')]")
    if not morearticles_links:
        print("no more articles link found")
        break
    morearticles_link = morearticles_links[0]
    try:
        morearticles_link.click()
        # give the page a moment to load the next batch of articles
        time.sleep(1)
    except StaleElementReferenceException:
        # new xpath executed before DOM update from previous click finished
        print("stale element")
        break

In [None]:
# Collect the href of every anchor on the page that contains more than four
# slashes. An article url such as
# https://www.nu.nl/<category>/<id>/<slug>.html has five, while the tag page
# url https://www.nu.nl/tag/Coronavirus has only four.
urls_all = []
elementsA = driver.find_elements_by_xpath("//a")
for elementA in elementsA:
    url = elementA.get_attribute('href')
    # Anchors without an href yield None; test for that explicitly instead of
    # hiding it (and any real error) behind a bare except.
    if url is not None and url.count("/") > 4:
        urls_all.append(url)

In [None]:
# Quick check of the collected urls: the number of urls, then the second
# url and the last url, each on its own line.
print("\n".join(str(item) for item in (len(urls_all), urls_all[1], urls_all[-1])))

In [None]:
# Keep a real copy of the collected urls. A plain assignment would only give
# the same list object a second name, so any later in-place change to
# urls_all would silently change the backup as well.
backup = list(urls_all)

## code block 5 (skip)

In [None]:
# Show all reactions for a single page by clicking the 'meer reacties' button
# until no such button is left.
# Article page, kept for reference only (it is not opened):
#   https://www.nu.nl/coronavirus/6055448/ggd-deed-ruim-9000-coronatesten-sinds-maandag-tot-nu-toe-94-positief.html
# The talk.nu.nl embed url below is the page that is actually loaded.
WEBPAGE ="https://talk.nu.nl/embed/stream?asset_url=https%3A%2F%2Fwww.nu.nl%2Fartikel%2F6055448%2Fredirect.html&initialWidth=601&childId=coral_talk_wrapper&parentTitle=GGD%20deed%20ruim%209.000%20coronatesten%20sinds%20maandag%2C%20tot%20nu%20toe%2094%20positief%20%7C%20NU%20-%20Het%20laatste%20nieuws%20het%20eerst%20op%20NU.nl&parentUrl=https%3A%2F%2Fwww.nu.nl%2Fcoronavirus%2F6055448%2Fggd-deed-ruim-9000-coronatesten-sinds-maandag-tot-nu-toe-94-positief.html%23coral_talk_wrapper"

driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(WEBPAGE)

morereactions = True
while morereactions:
    reaction_buttons = driver.find_elements_by_xpath("//button[contains(text(),'meer reacties')]")
    print("number of buttons:", len(reaction_buttons))
    if not reaction_buttons:
        # nothing left to expand: leave the loop
        morereactions = False
        continue
    python_button = reaction_buttons[0]
    try:
        python_button.click()
    except StaleElementReferenceException:
        # the button was replaced by a DOM update from the previous click;
        # look it up again on the next pass
        pass

## code block 6

In [3]:
# Url lists written by earlier runs. Urls found in the old list are skipped by
# the harvesting loop; column "0" of the new list holds the urls to process.
URLFILEOLD = "nunl-urls-20200724.csv"
URLFILENEW = "nunl-urls-20200726.csv"

urls_all_old = pd.read_csv(URLFILEOLD, index_col=0)
urls_all = [*pd.read_csv(URLFILENEW, index_col=0)["0"]]

In [4]:
import os

# Regular expressions matched against the start of an element's class value.
COMMENT = r"^Comment__commentContainer"
AUTHORNAME = r"^AuthorName__name"
TIMESTAMP = r"^CommentTimestamp__timestamp"
TEXT = r"^Comment__content"
CLASS = "class"
TITLE = "title"
# Relative xpaths from a comment container to the ancestor elements that are
# recorded as the comment's own id and as its parent id.
XPATHID = "../../.."
XPATHPARENTID = "../../../../.."
# Seconds to wait after each click on a 'meer reacties' button.
WAIT = 10

# Build the set of already harvested urls once instead of converting the
# dataframe column to a list for every url.
urls_done = set(urls_all_old["0"])

for current_id, current_url in enumerate(urls_all):
    article_id = id_from_article_url(current_url)
    article_cat = category_from_article_url(current_url)

    print(current_id, "/", len(urls_all)-1, current_url, "id:", article_id, "category:", article_cat)

    outputFile = f"nunl-{article_cat}-{article_id}-{current_id}.csv"
    if os.path.exists(outputFile) or current_url in urls_done: continue
    # An empty placeholder file is written before scraping starts.
    # NOTE(review): if the run is interrupted the placeholder stays behind and
    # the os.path.exists check above skips this article on the next run, so
    # remove leftover empty files before resuming.
    pd.DataFrame({}).to_csv(outputFile)

    webPage = talk_url_from_id(article_id)

    driver = webdriver.Firefox(executable_path=DRIVER)
    try:
        driver.get(webPage)
        driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")

        # wait for spinner to finish
        while driver.find_elements_by_xpath("//div[starts-with(@class, 'Spinner__container')]"):
            time.sleep(1)

        # click more reactions button until all reactions are shown
        print("click more reaction buttons")
        morereactions = True
        while morereactions:
            reaction_buttons = driver.find_elements_by_xpath("//button[contains(text(),'meer reacties')]")
            print("number of buttons:", len(reaction_buttons))
            if reaction_buttons:
                python_button = reaction_buttons[0]
                try:
                    python_button.click()
                    time.sleep(WAIT)
                except StaleElementReferenceException:
                    # new xpath executed before DOM update from previous click
                    # finished; the button is looked up again on the next pass
                    pass
            else:
                morereactions = False
                print("no more reactions")

        # wait for spinner to finish
        while driver.find_elements_by_xpath("//div[starts-with(@class, 'Spinner__container')]"):
            time.sleep(1)

        # locate all reactions and store in csv file
        print("locate all reactions")
        data = []
        alldivs = driver.find_elements_by_xpath("//div[starts-with(@class, 'Comment__commentContainer')]")
        divnr = 0
        lenall = len(alldivs)
        for e in alldivs:
            divnr += 1
            # `or ""` keeps re.search from failing on a missing class value
            eClass = e.get_attribute(CLASS) or ""
            if not re.search(COMMENT, eClass):
                continue
            # NOTE(review): `.id` is the WebDriver element reference, not the
            # HTML id attribute; presumably only meant to link comments to
            # their parents within one file - confirm.
            eId = e.find_element_by_xpath(XPATHID).id
            parent = e.find_element_by_xpath(XPATHPARENTID).id
            authorName = ""
            timeStamp = ""
            text = ""
            for f in e.find_elements_by_xpath(".//*"):
                fClass = f.get_attribute(CLASS) or ""
                if fClass == eClass:
                    continue
                if re.search(AUTHORNAME, fClass):
                    authorName = f.text
                elif re.search(TIMESTAMP, fClass):
                    timeStamp = f.get_attribute(TITLE)
                elif re.search(TEXT, fClass):
                    text = f.text.replace("\n", " ")
                    # the scan of this comment's descendants stops at the
                    # first content element
                    break
            data.append({"id":eId,"name":authorName,"date":timeStamp,"text":text,"parent":parent})
            clear_output(wait=True)
            # same "index / last index" form as the progress line at the top of the loop
            print(current_id, "/", len(urls_all)-1, current_url, "id:", article_id, "category:", article_cat)
            print(f"processed: {len(data)} ({divnr} / {lenall} divs)")

        df = pd.DataFrame(data)
        df.to_csv(outputFile,index=False)
    finally:
        # Close the browser even when the run is interrupted or a page fails,
        # so no Firefox/geckodriver process is left behind.
        driver.quit()
    print("finished processing", current_url)

1247 / 4914 https://www.nu.nl/coronavirus/6051794/ruim-vijf-miljoen-door-overheid-aangekochte-mondkapjes-afgekeurd.html id: 6051794 category: coronavirus
processed: 93 (93 / 366 divs)


KeyboardInterrupt: 

## code block 7 (target urls save)

In [None]:
# Store every article url together with the name of its comment file, then
# read the csv back and show the first ten rows.
URLFILE = "nunl-urls-20200726.csv"

nunl_urls = []
for current_id, current_url in enumerate(urls_all):
    article_id = id_from_article_url(current_url)
    article_cat = category_from_article_url(current_url)

    # same file name pattern as used by the harvesting loop
    outputFile = f"nunl-{article_cat}-{article_id}-{current_id}.csv"
    nunl_urls.append((current_url, outputFile))

pd.DataFrame(nunl_urls).to_csv(URLFILE)
df = pd.read_csv(URLFILE, index_col=0)
df.head(10)

In [None]:
# Check whether a given comment file name occurs in column "1" of df.
# NOTE(review): the file names built elsewhere in this notebook start with
# "nunl-", so this literal may never match - confirm the intended name.
"economie-6066366-0.csv" in df["1"].tolist()

## Test code

In [None]:
# Debug version of the comment extraction: it inspects every div on the page
# that is open in `driver`, prints each class value and leaves the browser
# open afterwards. It relies on current_id, current_url and outputFile from
# an earlier cell.

# wait until no spinner element is left on the page
while driver.find_elements_by_xpath("//div[starts-with(@class, 'Spinner__container')]"):
    time.sleep(1)

# locate all reactions and store in csv file
print("locate all reactions")
data = []
alldivs = driver.find_elements_by_xpath("//div")
lenall = len(alldivs)
divnr = 0
for divnr, e in enumerate(alldivs, start=1):
    eClass = e.get_attribute(CLASS)
    print(eClass)
    if not re.search(COMMENT,eClass):
        continue
    print("comment found")
    eId = e.find_elements_by_xpath(XPATHID)[0].id
    parent = e.find_elements_by_xpath(XPATHPARENTID)[0].id
    authorName, timeStamp, text = "", "", ""
    for f in e.find_elements_by_xpath(".//*"):
        fClass = f.get_attribute(CLASS)
        if fClass == eClass:
            continue
        if re.search(AUTHORNAME,fClass):
            authorName = f.text
        elif re.search(TIMESTAMP,fClass):
            timeStamp = f.get_attribute(TITLE)
        elif re.search(TEXT,fClass):
            text = re.sub("\n"," ",f.text)
            # stop scanning once the content element has been read
            break
    data.append({"id":eId,"name":authorName,"date":timeStamp,"text":text,"parent":parent})
    clear_output(wait=True)
    print(current_id, "/", len(urls_all), current_url, "id:", id_from_article_url(current_url), "category:", category_from_article_url(current_url))
    print(f"processed: {len(data)} ({divnr} / {lenall} divs)")

df = pd.DataFrame(data)
df.to_csv(outputFile,index=False)
# the browser is deliberately left open here (no driver.quit())
print("finished processing", current_url)

In [None]:
# Prints a fixed string; presumably a quick check that the kernel responds.
print("test")

In [None]:
# Debug helper: open the talk page for the url at position current_id (which
# must already be set by an earlier cell) and leave the browser open.
current_url = urls_all[current_id]
article_id = id_from_article_url(current_url)
article_cat = category_from_article_url(current_url)

print(current_id, "/", len(urls_all)-1, current_url, "id:", article_id, "category:", article_cat)

# note: this file name has no "nunl-" prefix
outputFile = f"{article_cat}-{article_id}-{current_id}.csv"

webPage = talk_url_from_id(article_id)

driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(webPage)
driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
