# NU.nl comment harvester

You need the Python libraries selenium and pandas (pip install selenium pandas) and the geckodriver: https://github.com/mozilla/geckodriver/releases

Instructions:

1. point your web browser to a nu.nl news article
2. ~~open the comment section by clicking on "number reacties" below the article~~
3. ~~right click on a comment, choose "This frame" and then "Show only this frame" (tested in Firefox browser)~~
4. ~~a new web page opens: get the address of this webpage~~
5. ~~add the web page address in first code block below in a new line WEBPAGE="..." below the other ones~~
6. copy the url into the variable `current_url` in the first code block
7. run the first code block
8. a new window opens with the comments
9. open all comments by searching for "meer reacties" and clicking on the buttons
10. run the second code block and wait (this takes a lot of time)
11. the comments are stored in a file `[category]-[article_id].csv`

Collect list of relevant URLs:

1. go to https://www.nu.nl/tag/Coronavirus
2. manually click "Laad meer artikelen" until the desired time period is shown
3. copy-paste from browser window to LibreOffice Writer
4. save as Flat ODT xml file (.fodt)
5. `grep -Eo "https://www.nu.nl/[A-Za-z\-]+/[0-9]+/" [copied file].fodt|sort|uniq > [url file].txt`

Regarding steps 3 and 4: this can also be done with Microsoft Word. Open the `docx` file in a file extraction program (e.g., WinZip) and locate the relevant `.xml` file.

In [None]:
from selenium import webdriver
import re

def id_from_article_url(article_url):
    """Return the numeric article id contained in a nu.nl article URL.

    The id is first looked for as the trailing path segment
    (``https://www.nu.nl/<category>/<id>/``). If the URL does not end that
    way, e.g. because a slug follows the id
    (``.../<id>/some-title.html``) or the final slash is missing, the first
    path segment consisting only of digits is used instead.

    Args:
        article_url: URL of a nu.nl article.

    Returns:
        The article id as a string of digits.

    Raises:
        ValueError: if ``article_url`` contains no numeric path segment.
    """
    # Preferred form: the id is the last path segment, followed by "/".
    res = re.search(r"([0-9]+)/$", article_url)
    if res is None:
        # Fallback for URLs pasted by hand: first all-digit path segment.
        res = re.search(r"/([0-9]+)(?:/|$)", article_url)
    if res is None:
        raise ValueError(f"no article id found in URL: {article_url!r}")
    return res.group(1)

def category_from_article_url(article_url):
    """Return the category (first path segment) of a nu.nl article URL.

    Args:
        article_url: URL starting with ``https://www.nu.nl/<category>/``.

    Returns:
        The text between ``https://www.nu.nl/`` and the next ``/``.

    Raises:
        ValueError: if ``article_url`` does not start with
            ``https://www.nu.nl/<category>/``.
    """
    # The dots are escaped so that only the literal host name matches.
    res = re.search(r"^https://www\.nu\.nl/([^/]+)/", article_url)
    if res is None:
        raise ValueError(f"not a nu.nl article URL: {article_url!r}")
    return res.group(1)

def talk_url_from_id(article_id):
    """Build the talk.nu.nl embed-stream URL for the given article id.

    Args:
        article_id: article id; its string form is inserted into the
            percent-encoded ``asset_url`` query parameter.

    Returns:
        The URL of the comment stream page as a string.
    """
    # Percent-encoded form of https://www.nu.nl/artikel/<id>/redirect.html
    asset_url = f"https%3A%2F%2Fwww.nu.nl%2Fartikel%2F{article_id}%2Fredirect.html"
    return (
        "https://talk.nu.nl/embed/stream"
        f"?asset_url={asset_url}"
        "&initialWidth=601&childId=coral_talk_wrapper"
    )

# Path of the geckodriver executable handed to Selenium.
DRIVER = "/usr/local/bin/geckodriver"

# Text file with one article URL per line.
URLFILE = "urls_may_week1_uniq.txt"
# Read the list inside a context manager so the file handle is closed again.
with open(URLFILE, encoding="utf8") as urls_from_file:
    urls_all = urls_from_file.read().splitlines()
# Index in urls_all of the article to harvest.
current_id = 12

current_url = urls_all[current_id]
# or add manually here
# current_url = "https://www.nu.nl/[...]"

# article_id and article_cat are also used by the second code block to name
# the output file.
article_id = id_from_article_url(current_url)
article_cat = category_from_article_url(current_url)

print(current_url, "id:", article_id, "category:", article_cat)

WEBPAGE = talk_url_from_id(article_id)

# NOTE(review): the executable_path argument is the Selenium 3 way of passing
# the driver location; confirm the installed selenium version still accepts it.
driver = webdriver.Firefox(executable_path=DRIVER)
driver.get(WEBPAGE)
# Scroll down by the full page height so the bottom of the comment page is shown.
driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")

In [None]:
from IPython.display import clear_output
import pandas as pd
import re

# Patterns matched against the start of an element's "class" attribute.
COMMENT = r"^Comment__commentContainer"
AUTHORNAME = r"^AuthorName__name"
TIMESTAMP = r"^CommentTimestamp__timestamp"
TEXT = r"^Comment__content"
CLASS = "class"
TITLE = "title"
# Relative XPaths from a comment container to the ancestors whose Selenium
# element ids are stored as the comment's "id" and "parent" values.
XPATHID = "../../.."
XPATHPARENTID = "../../../../.."
OUTPUTFILE = f"{article_cat}-{article_id}.csv"

# Let the browser select the comment containers instead of fetching the class
# of every <div> on the page through a separate WebDriver call.
COMMENTPREFIX = COMMENT.lstrip("^")
COMMENTXPATH = "//div[starts-with(@class,'" + COMMENTPREFIX + "')]"
# Explicit column order, so the CSV has a header even when no comment is found.
COLUMNS = ["id", "name", "date", "text", "parent"]

# Compile the patterns once; they are applied to every descendant element.
authorNamePattern = re.compile(AUTHORNAME)
timeStampPattern = re.compile(TIMESTAMP)
textPattern = re.compile(TEXT)

# NOTE(review): find_elements_by_xpath is the Selenium 3 spelling; confirm the
# installed selenium version still provides it.
data = []
for e in driver.find_elements_by_xpath(COMMENTXPATH):
    # get_attribute may return None; fall back to "" so comparisons and
    # pattern searches below never receive None.
    eClass = e.get_attribute(CLASS) or ""
    eId = e.find_elements_by_xpath(XPATHID)[0].id
    parent = e.find_elements_by_xpath(XPATHPARENTID)[0].id
    authorName = ""
    timeStamp = ""
    text = ""
    for f in e.find_elements_by_xpath(".//*"):
        fClass = f.get_attribute(CLASS) or ""
        # Skip descendants that carry the same class as the container itself.
        if fClass == eClass:
            continue
        if authorNamePattern.search(fClass):
            authorName = f.text
        elif timeStampPattern.search(fClass):
            timeStamp = f.get_attribute(TITLE)
        elif textPattern.search(fClass):
            # Keep each comment on a single line in the CSV.
            text = f.text.replace("\n", " ")
            # The text is the last field collected for a comment; stop here.
            break
    data.append({"id":eId,"name":authorName,"date":timeStamp,"text":text,"parent":parent})
    # Replace the previous progress line instead of printing a new one.
    clear_output(wait=True)
    print("processed:",len(data))

df = pd.DataFrame(data, columns=COLUMNS)
df.to_csv(OUTPUTFILE,index=False)
driver.quit()