# Imports

The required packages to run this notebook.  
If any are missing use ``!{sys.executable} -m pip install <package>`` to install the package in the notebook environment

In [17]:
import os.path
import gc 
import pandas as pd
import csv
import sys
from ctypes import cdll, CDLL
from urllib import parse
#!{sys.executable} -m pip install beautifulsoup4
from bs4 import BeautifulSoup
#!{sys.executable} -m pip install nbimporter
import nbimporter
import ARCH_Helper as ah

# Download an archive
The script below makes a data folder with a subfolder for the dataset. Dataset option are "World", "German" or "Dutch", select these by altering the ``dataset`` variable. The script wil skip files that are already present on the system. If you want to redownload the files first remove the old files manually.  
If the download fails due to connection errors, rerun the script and will will notice that he file did not decompress and try again.  
Note: The html-file-information.csv can be very large and take several hours and fail. Consider putting the code below code in a loop if you want to run it over night, if the download fails it will try again and if successfull it will say it already has the desired file.

In [18]:
# Dataset option are "World", "German" or "Dutch"
Dataset = "Dutch"

!mkdir data
!mkdir data/{Dataset}
!mkdir data/{Dataset}/results

if Dataset == "German":
    if not os.path.exists("data/German/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainFrequencyExtraction/domain-frequency.csv.gz?access=VRQ4COI5RFEB6XTJZNQTBRLEZTTHJERL" --output data/German/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/German/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainGraphExtraction/domain-graph.csv.gz?access=JKEPGQ6MUC72JQB23IXOC4KOLGJYDSMN" --output data/German/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/German/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/css-file-information.csv.gz?access=I2WP4REJA3NOBU3TCAAL3OIGJKNXM46R" --output data/German/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/German/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/js-file-information.csv.gz?access=M3QSMFPLEHPZPWZIFSMZ6CT2OO7WYQ4M" --output data/German/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
        
    if not os.path.exists("data/German/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/html-file-information.csv.gz?access=E3BUHKL5P4Q3TD4LGXZV2AOMERYQ3GWL" --output data/German/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")
        
if Dataset == "World":
    if not os.path.exists("data/World/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainFrequencyExtraction/domain-frequency.csv.gz?access=SMSQY3G6IGKGRWVLGCWMA7DMCHBCKQ4K" --output data/World/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/World/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainGraphExtraction/domain-graph.csv.gz?sample=true&access=BZTA7LW5LUNMPGKWQMLKAUAXBV2E2AEC" --output data/World/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/World/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/css-file-information.csv.gz?access=UZ4PWVRXXWPF53BHM7TTYNK24P7YAIXQ" --output data/World/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/World/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/js-file-information.csv.gz?access=54X7IV7QOOAJWGHKPRGDI7HIR6W6GKPI" --output data/World/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
    
    if not os.path.exists("data/World/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/html-file-information.csv.gz?access=JQEDT3PRXOS6OXZ7LA5EC55HQJVUASDX" --output data/World/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")

if Dataset == "Dutch":
    if not os.path.exists("data/Dutch/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/DomainFrequencyExtraction/domain-frequency.csv.gz?access=QUQTRHVNIDKXN62S4D62XFHCFI7VDSH7" --output data/Dutch/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/Dutch/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/DomainGraphExtraction/domain-graph.csv.gz?access=JUNESOMDTBYDNCCGVJI5MJHW45KHAIJZ" --output data/Dutch/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/Dutch/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/css-file-information.csv.gz?access=IGBKSGXJBL3L4IJ2LXS5NPJTOY77JCT5" --output data/Dutch/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/Dutch/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/js-file-information.csv.gz?access=RPZKGWUTWJBIRKOKKHGECB7W4OHUATMO" --output data/Dutch/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
    
    if not os.path.exists("data/Dutch/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/html-file-information.csv.gz?access=52ZOSNQQMFWKW42WMQPNMGKYLEVUQHUC" --output data/Dutch/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")

unzip = "find data/" +Dataset +" -name '*.gz' -exec gunzip {} \;"
!{unzip}

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/Dutch’: File exists
mkdir: cannot create directory ‘data/Dutch/results’: File exists
domain-frequency.csv already exists
domain-frequency.csv already exists
css-file-information.csv already exists
js-file-information.csv already exists
html-file-information.csv already exists


# Extract wepages holding signs of commenting systems in their HTML

``detection_patterns`` is a csv file holding the commenting systems and their detection patterns

The ``chunksize`` variable determines how much memory is used. A larger chunksize allows for faster computation, but will crash when RAM is full. If the computation crashes try restarting the kernel and/or a lower chunksize. A chunksize of 1000 should be fine for most machines.

In [19]:
dataset_location = "data/"+Dataset +"/html-file-information.csv"
detection_patterns = pd.read_csv("Commenting-system-detection-patterns.csv")
chunksize = 10000

The code bellow loops through the known commenting systems and their detection patterns, saving them per commenting system in a CSV file at ``"data/"+Dataset +"/results/"+system+".csv"``

In [20]:
for system in list(detection_patterns['Commenting system'].unique()):
    system_df_list = []
    for index, row in detection_patterns[detection_patterns['Commenting system'] == system].iterrows():
        system_df = ah.filter_csv_content_regex(dataset_location, row['Regex'], chunksize)
        system_df[row['Snipit']]=True
        system_df_list.append(system_df)

    system_df = pd.concat(system_df_list)
    
    system_df["year"] = system_df["crawl_date"].div(10000).round().astype(int)
    system_df["domain"] = system_df['url'].apply(ah.to_domain)
    
    system_df = system_df.fillna(False)
    
    
    snippits = list(detection_patterns[detection_patterns['Commenting system'] == system]['Snipit'])
    result_scope = ['year','domain']
    result_scope.extend(snippits)
    
    system_results = pd.DataFrame(system_df[result_scope].value_counts().sort_index()).reset_index()
    system_results.rename(columns={0:'count'}, inplace=True)
    
    save_location = "data/"+Dataset +"/results/"+system+".csv"
    ah.save_dataframe(system_results, save_location)
    #print(system_results)

Display summeries for each of the commenting systems

In [29]:
for system in list(detection_patterns['Commenting system'].unique()):
    save_location = "data/"+Dataset +"/results/"+system+".csv"
    results = pd.read_csv(save_location)
    domains = list(results.domain.unique())
    
    #skip to next itteration if no domains found with system
    if domains == []:
        continue

    identifiers = results.columns.values.tolist()
    identifiers = [x for x in identifiers if x not in ['year','domain','count']]
    print("Overlap between identifiers for "+system+":")
    print()
    print(results[identifiers].value_counts())
    print()
    
    print()
    for domain in domains:
        years = list(results[results.domain == domain].year.unique())
        print(domain, "uses ", system," in:", years)
    
    print()
    print("Number of sites using "+system+" per year:")
    sites_per_year = pd.DataFrame(results[["year", "domain"]].drop_duplicates()["year"].value_counts().sort_index()).reset_index()
    sites_per_year.columns = ["year", "sites using "+system ]
    
    print(sites_per_year)
    print()

Overlap between identifiers for Disqus:

data-disqus-identifier  disqus.com/embed.js  disqus_thread
False                   False                True             173
                        True                 False             84
True                    False                False             58
dtype: int64


csmonitor.com uses  Disqus  in: [2010]
english.aljazeera.net uses  Disqus  in: [2011]
haarlemsdagblad.nl uses  Disqus  in: [2011, 2012, 2013, 2014]
independent.co.uk uses  Disqus  in: [2011]
nrc.nl uses  Disqus  in: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2019]
telegraph.co.uk uses  Disqus  in: [2011, 2012, 2014, 2015, 2017, 2018, 2019]
vn.nl uses  Disqus  in: [2011, 2012, 2013, 2014, 2015]
weblogs.nrc.nl uses  Disqus  in: [2011, 2012, 2013]
wired.com uses  Disqus  in: [2011, 2012, 2013]
aboutblank.nl uses  Disqus  in: [2012]
aljazeera.com uses  Disqus  in: [2012]
blogs.computerworld.com uses  Disqus  in: [2012, 2013]
boingboing.net uses  Disqus  in: [2012]
business.inquirer.

In [None]:
for col in results.columns:
    if col.isin(['year','domain','count']):
        continue
    print(col)

In [22]:
identifiers = results.columns.values.tolist()
identifiers = [x for x in identifiers if x not in ['year','domain','count']]
identifiers

['commento.js']

In [27]:
results = pd.read_csv("data/Dutch/results/Facebook Comments.csv")
identifiers = results.columns.values.tolist()
identifiers = [x for x in identifiers if x not in ['year','domain','count']]
print(results[identifiers].value_counts())

fbComments  fb-comments  fb:comments
False       True         False          144
            False        True            28
True        False        False            8
dtype: int64


# Extract wepages holding signs for the Disqus commenting system in their HTML

The code bellow looks through a dataset for any indicators for the presence of the disqus commenting system. The input for ```filter_csv_content_regex``` is the desired dataset file, the regular expression and the chunksize.

In [4]:
disqus_embed_df = ah.filter_csv_content_regex(dataset_location, "(?i)disqus\.com\/embed\.js", chunksize)
disqus_embed_df['disqus_embed']=True
print("Embed hits: ", len(disqus_embed_df))

Embed hits:  2314


In [5]:
disqus_identifier_df = ah.filter_csv_content_regex(dataset_location, "(?i)data-disqus-identifier", chunksize)
disqus_identifier_df['disqus_identifier']=True
print("Identifier hits: ", len(disqus_identifier_df))

Identifier hits:  2220


In [6]:
disqus_thread = ah.filter_csv_content_regex(dataset_location, "(?i)disqus_thread", chunksize)
disqus_thread['disqus_thread']=True
print("thread hits: ", len(disqus_thread))

thread hits:  4952


Only use domains within the research scope

In [7]:
if (Dataset == "World"):
    domains_in_scope = ["https://cnn.com","https://www.nytimes.com/","https://www.theguardian.com/","https://www.indiatimes.com/","https://www.foxnews.com/","https://www.washingtonpost.com/","https://usatoday.com","https://www.cnbc.com","http://www.chinadaily.com.cn/","https://www.thehindu.com/","https://www.wsj.com","https://nypost.com/","https://www.huffingtonpost.com","https://abcnews.go.com/","https://www.bbc.com/","https://www.dailymail.co.uk/","https://timesofindia.indiatimes.com/","https://www.independent.co.uk/","https://www.smh.com.au/","https://www.telegraph.co.uk/","https://www.latimes.com/","https://www.sfgate.com/","https://www.nbcnews.com/","https://buzzfeednews.com","https://aljazeera.com","https://rt.com","https://reuters.com","https://npr.org","https://sputniknews.com","https://cbsnews.com","https://cbc.ca","https://abc.net.au","https://time.com","https://mirror.co.uk","https://thesun.co.uk","https://euronews.com","https://ctvnews.ca","https://vox.com","https://scmp.com","https://france24.com","https://express.co.uk","https://news.sky.com","https://news24.com","https://globalnews.ca","https://channelnewsasia.com","https://rawstory.com","https://washingtontimes.com","https://todayonline.com","https://dailytelegraph.com.au","https://csmonitor.com"]
elif (Dataset == "German"):
    domains_in_scope = ["https://www.bild.de","https://www.waz.de","https://www.zeit.de/","https://www.sueddeutsche.de","https://www.welt.de/","https://rp-online.de/","https://www.faz.net/","https://www.augsburger-allgemeine.de/","https://www.allgaeuer-zeitung.de/","https://www.freiepresse.de","https://www.merkur.de","https://www.lvz.de","https://www.mz.de","https://www.wn.de","https://www.hna.de","https://www.rheinpfalz.de","https://www.volksstimme.de","https://www.handelsblatt.com","https://taz.de","https://www.nd-aktuell.de","https://jungefreiheit.de","https://www.freitag.de","http://www.das-parlament.de","https://www.juedische-allgemeine.de","https://www.fr.de","https://www.abendblatt.de","https://www.morgenpost.de","https://www.tagesspiegel.de","https://www.bz-berlin.de","https://www.ftd.de","https://www.ksta.de/","https://www.stuttgarter-zeitung.de","https://www.badische-zeitung.de","https://www.tz.de","https://www.mopo.de","https://www.berliner-zeitung.de","https://www.haz.de","https://www.stuttgarter-nachrichten.de","http://www.spiegel.de/","https://www.deutschlandfunk.de/","https://www.focus.de/","https://www.stern.de/","https://www.zdf.de/","https://wdr.de/","https://www.ndr.de/","https://www.br.de/","https://www.swr.de/","https://www.mainpost.de/","https://www.nwzonline.de/","https://mads.de/"]
elif (Dataset == "Dutch"):
    domains_in_scope = ["https://fd.nl/","https://www.ad.nl","https://www.nd.nl/","https://www.nrc.nl/","https://www.rd.nl/","https://www.telegraaf.nl/","https://www.trouw.nl","https://www.volkskrant.nl/","https://www.ewmagazine.nl/","https://www.bd.nl","https://www.bndestem.nl","https://www.destentor.nl","https://www.dvhn.nl","https://www.ed.nl","https://www.gelderlander.nl","https://www.haarlemsdagblad.nl/","https://www.lc.nl/","https://www.leidschdagblad.nl/","https://www.limburger.nl","https://www.noordhollandsdagblad.nl","https://www.parool.nl","https://www.pzc.nl","https://www.tubantia.nl","https://jeugdjournaal.nl/","https://nieuws.nl/","https://nos.nl/","https://www.1limburg.nl/","https://www.businessinsider.nl/","https://www.dutchnews.nl/","https://www.ewmagazine.nl/","https://www.geenstijl.nl/","https://www.kliknieuws.nl/","https://www.metronieuws.nl/","https://www.nhnieuws.nl/","https://www.ninefornews.nl/","https://www.nu.nl/","https://www.quotenet.nl","https://www.rtlnieuws.nl","https://www.tpo.nl","https://www.welingelichtekringen.nl","https://www.hartvannederland.nl/","https://www.bnr.nl/","https://www.dagelijksestandaard.nl/","http://www.joop.nl/","https://www.hpdetijd.nl/","https://www.groene.nl/","https://www.oneworld.nl/","https://www.vn.nl/","https://sargasso.nl/","https://www.powned.tv/"]

domains_in_scope = list(map(ah.to_domain, domains_in_scope))
disqus_embed_df = ah.filter_dataframe_domains(disqus_embed_df, domains_in_scope).reset_index(drop=True)
disqus_identifier_df = ah.filter_dataframe_domains(disqus_identifier_df, domains_in_scope).reset_index(drop=True)
disqus_thread = ah.filter_dataframe_domains(disqus_thread, domains_in_scope).reset_index(drop=True)

In [8]:
print("Embed hits: ", len(disqus_embed_df))
print("Identifier hits: ", len(disqus_identifier_df))
print("thread hits: ", len(disqus_thread))

Embed hits:  2223
Identifier hits:  2186
thread hits:  4742


Merge individual searches

In [9]:
disqus_df = disqus_embed_df.merge(disqus_identifier_df, on=["crawl_date","url","filename","extension","mime_type_web_server","mime_type_tika","md5", "sha1","content"], how='outer')
disqus_df = disqus_df.merge(disqus_thread, on=["crawl_date","url","filename","extension","mime_type_web_server","mime_type_tika","md5", "sha1","content"], how='outer')
disqus_df

Unnamed: 0,crawl_date,url,filename,extension,mime_type_web_server,mime_type_tika,md5,sha1,content,disqus_embed,disqus_identifier,disqus_thread
0,20120206,http://www.noordhollandsdagblad.nl/nieuws/econ...,Beurzen-omlaag-door-zorgen-eurocrisis,xhtml,text/html,application/xhtml+xml,b9b10e9eca24f0329de1d7cb4c3fffa8,12f313ba32f370f412c0c4c7617b06afe1f9d223,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",True,True,True
1,20120204,http://www.noordhollandsdagblad.nl/nieuws/econ...,Beurzen-omlaag-door-zorgen-eurocrisis,xhtml,text/html,application/xhtml+xml,188d13258e259435e1b2750a02dda384,6749339a84c9186f595e30270b6a5ead3e10ee57,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",True,True,True
2,20120205,http://www.noordhollandsdagblad.nl/nieuws/spor...,Wozniacki-in-tranen-door-polsblessure,xhtml,text/html,application/xhtml+xml,f261eca98ba0940997a6baaaf47a08ad,3d171a3453aeea3cf9e1805c2636e914be9f8771,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",True,True,True
3,20120202,http://www.noordhollandsdagblad.nl/nieuws/spor...,Wozniacki-in-tranen-door-polsblessure,xhtml,text/html,application/xhtml+xml,c36adfe393ce032d0811983d58bfb182,b9d9291aaa684a928ae6d1f00dc80dfe6e6c8bf4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",True,True,True
4,20120205,http://www.noordhollandsdagblad.nl/nieuws/spor...,Turnsters-grijpen-naast-teamticket-Spelen,xhtml,text/html,application/xhtml+xml,087c4585080ab94be031d1db5301c02d,7ff02aba73b2e97ff3cd25ac189089f06b69976e,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
4738,20120225,http://www.vn.nl/boeken/schrijver/de-giftige-w...,,html,text/html,text/html,361938b309a683eb920f4cbce3537595,1b5fbe72d6931a04351dca483944385fcdf8017e,<!DOCTYPE html>\n\n<!--[if lt IE 7 ]> <html cl...,,,True
4739,20130311,http://www.vn.nl/boeken/schrijver/ramsey-nasr-...,,html,text/html,text/html,38e6efdf68f6736a3c5fdbd69620ba52,5450e586db039893a8baf7759758335e156ddab1,<!DOCTYPE html>\n\n<!--[if lt IE 7 ]> <html cl...,,,True
4740,20120512,http://www.vn.nl/boeken/theater-2/actrice-anni...,,html,text/html,text/html,eac516c29ce5b2023d5c0601c9c572c1,6ba7b047ab7578a7a533a63691e819e1fe952d27,<!DOCTYPE html>\n\n<!--[if lt IE 7 ]> <html cl...,,,True
4741,20120512,http://www.vn.nl/boeken/theater-2/wij-zijn-all...,,html,text/html,text/html,8693694157e079bc8dd6daa6c03c2460,550d7f1232ba618de845f64ac6b5888f142f41fa,<!DOCTYPE html>\n\n<!--[if lt IE 7 ]> <html cl...,,,True


Fill in the NaN hits caused by the merging of the results.

In [10]:
disqus_df = disqus_df.fillna(False)

Count the pages with Disqus in their HTML

In [11]:
len(disqus_df)

4743

Note that this finds more hits than grep. It seems the csv file is too large for grep. the same query in awk does give the expected number of results

```grep -i -n 'disqus\.js' C/Users/rjans/Desktop/ARCH/Data/disqus.csv```  
vs  
```awk -e '/disqus\.js/ {print $0}' C/Users/rjans/Desktop/ARCH/Data/html-file-information-German.csv```

Show the unique domains

In [12]:
print(disqus_df['url'].apply(ah.to_domain).unique())

['noordhollandsdagblad.nl' 'nos.nl' 'dutchnews.nl' 'leidschdagblad.nl'
 'nieuws.nl' 'haarlemsdagblad.nl' 'hpdetijd.nl' 'vn.nl'
 'welingelichtekringen.nl' 'quotenet.nl' 'oneworld.nl' 'ninefornews.nl'
 'nrc.nl' 'sargasso.nl']


# Display results

Add year and domain collumns

In [13]:
disqus_df["year"] = disqus_df["crawl_date"].div(10000).round().astype(int)
disqus_df["domain"] = disqus_df['url'].apply(ah.to_domain)

Display value_counts as dataframe

In [14]:
#pd.set_option('display.max_rows', 500)
results = pd.DataFrame(disqus_df[['year','domain','disqus_embed','disqus_identifier','disqus_thread']].value_counts().sort_index()).reset_index()
results.rename(columns={0:'count'}, inplace=True)
results

Unnamed: 0,year,domain,disqus_embed,disqus_identifier,disqus_thread,count
0,2011,haarlemsdagblad.nl,False,True,True,53
1,2011,nrc.nl,False,False,True,6
2,2011,vn.nl,False,False,True,14
3,2011,vn.nl,False,True,True,11
4,2011,vn.nl,True,False,True,17
...,...,...,...,...,...,...
103,2019,nrc.nl,False,False,True,5
104,2019,welingelichtekringen.nl,False,True,True,188
105,2019,welingelichtekringen.nl,True,True,True,1
106,2020,nieuws.nl,False,False,True,59


In [15]:
disqus_embed_hits = results[results.disqus_embed == True]
disqus_identifier_hits = results[results.disqus_identifier == True]
disqus_thread_hits = results[results.disqus_thread == True]

disqus_embed_and_identifier_hits = ah.dataframe_intersection(disqus_embed_hits,disqus_identifier_hits)
disqus_embed_and_thread_hits = ah.dataframe_intersection(disqus_embed_hits,disqus_thread_hits)
disqus_identifier_and_thread_hits = ah.dataframe_intersection(disqus_identifier_hits,disqus_identifier_hits)

print("disqus_embed_hits: ", len(disqus_embed_hits))
print("disqus_identifier_hits: ", len(disqus_identifier_hits))
print("disqus_thread_hits: ", len(disqus_thread_hits))

print("disqus_embed_and_identifier_hits: ", len(disqus_embed_and_identifier_hits))
print("disqus_embed_and_thread_hits: ", len(disqus_embed_and_thread_hits))
print("disqus_identifier_and_thread_hits: ", len(disqus_identifier_and_thread_hits))

disqus_embed_hits:  45
disqus_identifier_hits:  54
disqus_thread_hits:  107
disqus_embed_and_identifier_hits:  25
disqus_embed_and_thread_hits:  45
disqus_identifier_and_thread_hits:  54


Display presence of Disqus per domain per year

In [16]:
domains = list(results.domain.unique())

for domain in domains:
    years = list(results[results.domain == domain].year.unique())
    print(domain, "uses Disqus is in:", years)

haarlemsdagblad.nl uses Disqus is in: [2011, 2012, 2013, 2014]
nrc.nl uses Disqus is in: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2019]
vn.nl uses Disqus is in: [2011, 2012, 2013, 2014, 2015]
leidschdagblad.nl uses Disqus is in: [2012, 2013, 2014, 2015]
noordhollandsdagblad.nl uses Disqus is in: [2012, 2013, 2014, 2015, 2016]
nos.nl uses Disqus is in: [2012, 2013, 2014]
dutchnews.nl uses Disqus is in: [2013, 2014, 2015, 2016, 2017, 2019]
hpdetijd.nl uses Disqus is in: [2013, 2014, 2015, 2016]
quotenet.nl uses Disqus is in: [2013, 2014, 2015, 2016, 2017]
welingelichtekringen.nl uses Disqus is in: [2013, 2014, 2015, 2016, 2017, 2018, 2019]
nieuws.nl uses Disqus is in: [2014, 2016, 2017, 2018, 2019, 2020, 2021]
oneworld.nl uses Disqus is in: [2014, 2015, 2016, 2017]
sargasso.nl uses Disqus is in: [2017]
ninefornews.nl uses Disqus is in: [2019]


Display number of uniqe sites per year that use Disqus

In [17]:
sites_per_year = pd.DataFrame(results[["year", "domain"]].drop_duplicates()["year"].value_counts().sort_index()).reset_index()
sites_per_year.columns = ["year", "sites"]
sites_per_year

Unnamed: 0,year,sites
0,2011,3
1,2012,6
2,2013,10
3,2014,12
4,2015,9
5,2016,8
6,2017,7
7,2018,2
8,2019,5
9,2020,1


Save the results in a seperate CSV (only needs to be done once)

In [18]:
#disqus_df.to_csv("disqus.csv")

Convert **content** column to single line.

In [19]:
for line in disqus_df.content:
    disqus_df.content = disqus_df.content.replace(line, " ".join(line.splitlines()))

Save as single line content in a seperate CSV (only needs to be done once)

In [20]:
#disqus_df.to_csv("disqus_single_line.csv")

# For the found pages extract the comment structure

Extract the disqus comments from the div tag with ```id="disqus_thread"```

In [21]:
comments = []
for line in disqus_df.content:
    soup = BeautifulSoup(line, 'html.parser')
    try:
        comments.append(soup.find('div', attrs={'id': 'disqus_thread'}).prettify()) # .prettify() makes the HTML of the comments more human readable
    except AttributeError:
        comments.append("")
        
disqus_df['comments'] = comments

In [22]:
#pd.set_option('display.max_colwidth', 500) 
disqus_df[['crawl_date','url','comments']]

Unnamed: 0,crawl_date,url,comments
0,20120206,http://www.noordhollandsdagblad.nl/nieuws/econ...,"<div id=""disqus_thread"">\n</div>\n"
1,20120204,http://www.noordhollandsdagblad.nl/nieuws/econ...,"<div id=""disqus_thread"">\n</div>\n"
2,20120205,http://www.noordhollandsdagblad.nl/nieuws/spor...,"<div id=""disqus_thread"">\n</div>\n"
3,20120202,http://www.noordhollandsdagblad.nl/nieuws/spor...,"<div id=""disqus_thread"">\n</div>\n"
4,20120205,http://www.noordhollandsdagblad.nl/nieuws/spor...,"<div id=""disqus_thread"">\n</div>\n"
...,...,...,...
4738,20120225,http://www.vn.nl/boeken/schrijver/de-giftige-w...,"<div id=""disqus_thread"">\n</div>\n"
4739,20130311,http://www.vn.nl/boeken/schrijver/ramsey-nasr-...,"<div id=""disqus_thread"">\n <div id=""dsq-conten..."
4740,20120512,http://www.vn.nl/boeken/theater-2/actrice-anni...,"<div id=""disqus_thread"">\n</div>\n"
4741,20120512,http://www.vn.nl/boeken/theater-2/wij-zijn-all...,"<div id=""disqus_thread"">\n</div>\n"


# Add an internet archive URL to the dataframe

In [23]:
ah.add_ia_url(disqus_df)

Filter out comment fields that are so short that they do net contain comments (based on the length).  
32 characters is currently the shortest 

In [24]:
#pd.set_option('display.max_colwidth', 50)
disqus_df[disqus_df['comments'].apply(lambda x: len(x)>32)][['comments','IA_url']]

Unnamed: 0,comments,IA_url
983,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20150110/http://ww...
984,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20150111/http://ww...
985,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20160202/http://ww...
986,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20150112/http://ww...
987,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20160914/http://ww...
...,...,...
4646,"<div id=""disqus_thread"">\n <noscript>\n <p>\n...",https://web.archive.org/web/20151011/http://ww...
4714,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20120512/http://ww...
4716,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20120420/http://ww...
4723,"<div id=""disqus_thread"">\n <div id=""dsq-conten...",https://web.archive.org/web/20120420/http://ww...


Display a specific comment

In [25]:
ah.display_comment(disqus_df, 0)

<div id="disqus_thread">
</div>



# Comparing dataframes

Creates a second dataframe with a different regex query 

In [26]:
#disqus_df_2 = ah.filter_csv_content_regex("data/World/html-file-information.csv", "(?i)disqus\.js", 10000)

Show the overlap of two dataframes

In [27]:
#ah.dataframe_intersection(disqus_df[['crawl_date','url']],disqus_df_2[['crawl_date','url']])

Show the difference between the two datasets. So only shows results that are in one dataset but not in both.

In [28]:
#ah.dataframe_difference(disqus_df[['crawl_date','url']],disqus_df_2[['crawl_date','url']])

# Testing scratchbook

Reloads the ARCH Helper module after update.  
Normally this would be done with autoreload Ipython magic, but for some reason it does not work. 

In [29]:
import importlib
importlib.reload(ah)
import ARCH_Helper as ah