# Imports

The required packages to run this notebook.  
If any are missing use ``!{sys.executable} -m pip install <package>`` to install the package in the notebook environment

In [1]:
import os.path
import gc 
import pandas as pd
import csv
import sys
from ctypes import cdll, CDLL
from urllib import parse
#!{sys.executable} -m pip install beautifulsoup4
from bs4 import BeautifulSoup
#!{sys.executable} -m pip install nbimporter
import nbimporter
import ARCH_Helper as ah

# Download an archive
The script below makes a data folder with a subfolder for the dataset. Dataset option are "World", "German" or "Dutch", select these by altering the ``dataset`` variable. The script wil skip files that are already present on the system. If you want to redownload the files first remove the old files manually.  
If the download fails due to connection errors, rerun the script and will will notice that he file did not decompress and try again.  
Note: The html-file-information.csv can be very large and take several hours and fail. Consider putting the code below code in a loop if you want to run it over night, if the download fails it will try again and if successfull it will say it already has the desired file.

In [2]:
# Dataset option are "World", "German" or "Dutch"
Dataset = "World"

!mkdir data
!mkdir data/{Dataset}

if Dataset == "German":
    if not os.path.exists("data/German/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainFrequencyExtraction/domain-frequency.csv.gz?access=VRQ4COI5RFEB6XTJZNQTBRLEZTTHJERL" --output data/German/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/German/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainGraphExtraction/domain-graph.csv.gz?access=JKEPGQ6MUC72JQB23IXOC4KOLGJYDSMN" --output data/German/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/German/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/css-file-information.csv.gz?access=I2WP4REJA3NOBU3TCAAL3OIGJKNXM46R" --output data/German/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/German/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/js-file-information.csv.gz?access=M3QSMFPLEHPZPWZIFSMZ6CT2OO7WYQ4M" --output data/German/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
        
    if not os.path.exists("data/German/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-germanNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/html-file-information.csv.gz?access=E3BUHKL5P4Q3TD4LGXZV2AOMERYQ3GWL" --output data/German/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")
        
if Dataset == "World":
    if not os.path.exists("data/World/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainFrequencyExtraction/domain-frequency.csv.gz?access=SMSQY3G6IGKGRWVLGCWMA7DMCHBCKQ4K" --output data/World/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/World/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/DomainGraphExtraction/domain-graph.csv.gz?sample=true&access=BZTA7LW5LUNMPGKWQMLKAUAXBV2E2AEC" --output data/World/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/World/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/css-file-information.csv.gz?access=UZ4PWVRXXWPF53BHM7TTYNK24P7YAIXQ" --output data/World/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/World/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/js-file-information.csv.gz?access=54X7IV7QOOAJWGHKPRGDI7HIR6W6GKPI" --output data/World/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
    
    if not os.path.exists("data/World/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-worldNews_1st-in-month_1-hop_unique-EXTRACTION-20210818232425/TextFilesInformationExtraction/html-file-information.csv.gz?access=JQEDT3PRXOS6OXZ7LA5EC55HQJVUASDX" --output data/World/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")

if Dataset == "Dutch":
    if not os.path.exists("data/Dutch/domain-frequency.csv"):
        print("domain-frequency.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/DomainFrequencyExtraction/domain-frequency.csv.gz?access=QUQTRHVNIDKXN62S4D62XFHCFI7VDSH7" --output data/Dutch/domain-frequency.csv.gz
        print("domain-frequency.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/Dutch/domain-graph.csv"):
        print("domain-graph.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/DomainGraphExtraction/domain-graph.csv.gz?access=JUNESOMDTBYDNCCGVJI5MJHW45KHAIJZ" --output data/Dutch/domain-graph.csv.gz
        print("domain-graph.csv has been downloaded")
    else:
        print("domain-frequency.csv already exists")

    if not os.path.exists("data/Dutch/css-file-information.csv"):
        print("css-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/css-file-information.csv.gz?access=IGBKSGXJBL3L4IJ2LXS5NPJTOY77JCT5" --output data/Dutch/css-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("css-file-information.csv already exists")

    if not os.path.exists("data/Dutch/js-file-information.csv"):
        print("js-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/js-file-information.csv.gz?access=RPZKGWUTWJBIRKOKKHGECB7W4OHUATMO" --output data/Dutch/js-file-information.csv.gz
        print("css-file-information.csv has been downloaded")
    else:
        print("js-file-information.csv already exists")
    
    if not os.path.exists("data/Dutch/html-file-information.csv"):
        print("html-file-information.csv is not present")
        !curl "https://webdata.archive-it.org/ait/arch:cohort.helmond/research_services/download/SPECIAL-dutchNews_1st-in-month_1-hop_unique-EXTRACTION-20210916172606/TextFilesInformationExtraction/html-file-information.csv.gz?access=52ZOSNQQMFWKW42WMQPNMGKYLEVUQHUC" --output data/Dutch/html-file-information.csv.gz
        print("html-file-information.csv has been downloaded")
    else:
        print("html-file-information.csv already exists")

unzip = "find data/" +Dataset +" -name '*.gz' -exec gunzip {} \;"
!{unzip}

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/World’: File exists
domain-frequency.csv already exists
domain-frequency.csv already exists
css-file-information.csv already exists
js-file-information.csv already exists
html-file-information.csv already exists


# Extract wepages holding signs for the Disqus commenting system in their HTML

The code bellow looks through a dataset for any indicators for the presence of the disqus commenting system. The input for ```filter_csv_content_regex``` is the desired dataset file, the regular expression and the chunksize.

In [3]:
# Options are "data/German/html-file-information.csv" "data/World/html-file-information.csv" and "data/Dutch/html-file-information.csv"
# or any other dataset on the system.
dataset = "data/World/html-file-information.csv"
chunksize = 10000

In [4]:
disqus_embed_df = ah.filter_csv_content_regex(dataset, "(?i)disqus\.com\/embed\.js", chunksize)
disqus_embed_df['disqus_embed']=True
print("Embed hits: ", len(disqus_embed_df))

Embed hits:  6840


In [5]:
disqus_identifier_df = ah.filter_csv_content_regex(dataset, "(?i)data-disqus-identifier", chunksize)
disqus_identifier_df['disqus_identifier']=True
print("Identifier hits: ", len(disqus_identifier_df))

Identifier hits:  10156


In [6]:
disqus_thread = ah.filter_csv_content_regex(dataset, "(?i)disqus_thread", chunksize)
disqus_thread['disqus_thread']=True
print("thread hits: ", len(disqus_thread))

thread hits:  21143


In [7]:
disqus_df = disqus_embed_df.merge(disqus_identifier_df, on=["crawl_date","url","filename","extension","mime_type_web_server","mime_type_tika","md5", "sha1","content"], how='outer')
disqus_df = disqus_df.merge(disqus_thread, on=["crawl_date","url","filename","extension","mime_type_web_server","mime_type_tika","md5", "sha1","content"], how='outer')
disqus_df

Unnamed: 0,crawl_date,url,filename,extension,mime_type_web_server,mime_type_tika,md5,sha1,content,disqus_embed,disqus_identifier,disqus_thread
0,20140502,http://www.aljazeera.com/news/middleeast/2014/...,deadly-airstrike-hits-aleppo-school-2014430154...,xhtml,text/html,application/xhtml+xml,97f75c11640780e79baa7cbb8607ef51,12388fe00bc982f7a6e31214ed2c571cbd638af3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",True,,True
1,20140501,http://www.aljazeera.com/news/middleeast/2014/...,invasion-kassab-were-evicted-20144271355531702...,xhtml,text/html,application/xhtml+xml,0e2b97df26e0f9f031a26a98ebb8d92d,8a64ff502bd8f2bbe40d557ebffb89f8cc700fcb,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",True,,True
2,20140502,http://www.aljazeera.com/news/middleeast/2014/...,iraq-security-forces-ambush-syria-fuel-convoy-...,xhtml,text/html,application/xhtml+xml,faee2d99cf89e66433c825116183a493,eb61533d434ae9a654d0937ebdcb6ee2cb231e2a,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",True,,True
3,20140531,http://www.aljazeera.com/news/middleeast/2014/...,israel-suspends-peace-talks-with-palestinians-...,xhtml,text/html,application/xhtml+xml,a8d7c5f0e3770c4adc2b6f50c235376b,97580ae0e5cb309c05f07c8895285739b944db2c,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",True,,True
4,20150102,http://www.aljazeera.com/news/middleeast/2014/...,child-soldiers-recruited-syria-war-20141231204...,xhtml,text/html,application/xhtml+xml,035019bdf7658ac317902ccaf66af542,eee01070ccab6143b60e72b69fd246750b99763e,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",True,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
22736,20120104,http://www.independent.co.uk/news/media/online...,article6259894.ece,xhtml,text/html,application/xhtml+xml,7ff5bb0105edc9839419e1f0ecd916eb,dc7363e1eb19844a145895755b3efe46fdba06a2,"<!DOCTYPE html >\n<html xmlns=""http://www.w3.o...",,,True
22737,20120201,http://www.independent.co.uk/news/media/online...,free-trial-for-the-independents-new-ipad-app-6...,xhtml,text/html,application/xhtml+xml,94dddf8eeb231bb2422b781769817e52,0a23a4972a6692258f35ecdef9732822f9c76c52,"<!DOCTYPE html >\n<html xmlns=""http://www.w3.o...",,,True
22738,20110911,http://www.independent.co.uk/arts-entertainmen...,read-em-and-weep-the-literary-masters-of-miser...,xhtml,text/html,application/xhtml+xml,356648f2119ed7d0187b4bd2688d8734,b9ea600c69d7cf3b97329254afa255b5bbb9af9b,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,,True
22739,20110409,http://www.independent.co.uk/environment/clima...,solution-to-the-carbon-problem-could-be-under-...,xhtml,text/html,application/xhtml+xml,bc3276e25be1ea7154626afce7216fb7,a5445dfe83e845d430af9720e9d6f50bce6c9e1d,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,,True


Fill in the NaN hits caused by the merging of the results.

In [8]:
disqus_df = disqus_df.fillna(False)

Count the pages with Disqus in their HTML

In [9]:
len(disqus_df)

22741

Note that this finds more hits than grep. It seems the csv file is too large for grep. the same query in awk does give the expected number of results

```grep -i -n 'disqus\.js' C/Users/rjans/Desktop/ARCH/Data/disqus.csv```  
vs  
```awk -e '/disqus\.js/ {print $0}' C/Users/rjans/Desktop/ARCH/Data/html-file-information-German.csv```

Show the unique domains

In [10]:
print(disqus_df['url'].apply(ah.to_domain).unique())

['aljazeera.com' 'blogs.aljazeera.com' 'men.24.com' 'blogs.abcnews.com'
 'bloomberg.com' 'popwatch.ew.com' 'cnn.com' 'abcnews.go.com'
 'liveshots.blogs.foxnews.com' 'magazine.foxnews.com'
 'gretawire.foxnewsinsider.com' 'csmonitor.com' 'photos.essence.com'
 'myfoxchicago.com' 'myfoxla.com' 'myfoxmemphis.com' 'myfoxorlando.com'
 'mercurynews.com' 'spokesman.com' 'sfist.com' 'blogs.sfweekly.com'
 'tpmmuckraker.talkingpointsmemo.com' 'topgear.com' 'mirror.co.uk'
 'fashion.telegraph.co.uk' 'mweb.co.za' 'sagoodnews.co.za'
 'independent.co.uk' 'telegraph.co.uk' 'rawstory.com' 'pcmag.com'
 'politico.com' 'voices.news24.com' 'newser.com' 'newsmax.com'
 'english.aljazeera.net' 'lazygamer.net' 'washingtontimes.com'
 'communities.washingtontimes.com' 'propublica.org' 'scpr.org'
 'allthemoms.com' 'globalnews.ca' 'cnbc.com' 'blogs.kidspot.com.au'
 'video.cnbc.com' 'crooksandliars.com' 'politics.blogs.foxnews.com'
 'heatst.com' 'ew.com' 'jackandjillpolitics.com' 'thehill.com' 'tmz.com'
 'sports.usat

# Display results

Add year and domain collumns

In [11]:
disqus_df["year"] = disqus_df["crawl_date"].div(10000).round().astype(int)
disqus_df["domain"] = disqus_df['url'].apply(ah.to_domain)

Display value_counts as dataframe

In [12]:
#pd.set_option('display.max_rows', 500)
results = pd.DataFrame(disqus_df[['year','domain','disqus_embed','disqus_identifier','disqus_thread']].value_counts().sort_index()).reset_index()
results.rename(columns={0:'count'}, inplace=True)
results

Unnamed: 0,year,domain,disqus_embed,disqus_identifier,disqus_thread,count
0,2010,cnn.com,True,False,True,60
1,2010,csmonitor.com,True,False,True,1
2,2010,foxnews.com,False,False,True,3
3,2010,independent.co.uk,False,False,True,3
4,2010,oneclick.indiatimes.com,False,False,True,3
...,...,...,...,...,...,...
297,2020,tmz.com,False,False,True,3
298,2020,topgear.com,True,True,False,1
299,2021,abcnews.go.com,False,True,False,282
300,2021,mweb.co.za,True,True,True,1


In [13]:
disqus_embed_hits = results[results.disqus_embed == True]
disqus_identifier_hits = results[results.disqus_identifier == True]
disqus_thread_hits = results[results.disqus_thread == True]

disqus_embed_and_identifier_hits = ah.dataframe_intersection(disqus_embed_hits,disqus_identifier_hits)
disqus_embed_and_thread_hits = ah.dataframe_intersection(disqus_embed_hits,disqus_thread_hits)
disqus_identifier_and_thread_hits = ah.dataframe_intersection(disqus_identifier_hits,disqus_identifier_hits)

print("disqus_embed_hits: ", len(disqus_embed_hits))
print("disqus_identifier_hits: ", len(disqus_identifier_hits))
print("disqus_thread_hits: ", len(disqus_thread_hits))

print("disqus_embed_and_identifier_hits: ", len(disqus_embed_and_identifier_hits))
print("disqus_embed_and_thread_hits: ", len(disqus_embed_and_thread_hits))
print("disqus_identifier_and_thread_hits: ", len(disqus_identifier_and_thread_hits))

disqus_embed_hits:  114
disqus_identifier_hits:  114
disqus_thread_hits:  290
disqus_embed_and_identifier_hits:  40
disqus_embed_and_thread_hits:  113
disqus_identifier_and_thread_hits:  114


Save the results in a seperate CSV (only needs to be done once)

In [14]:
#disqus_df.to_csv("disqus.csv")

Convert **content** column to single line.

In [15]:
for line in disqus_df.content:
    disqus_df.content = disqus_df.content.replace(line, " ".join(line.splitlines()))

Save as single line content in a seperate CSV (only needs to be done once)

In [16]:
#disqus_df.to_csv("disqus_single_line.csv")

# For the found pages extract the comment structure

Extract the disqus comments from the div tag with ```id="disqus_thread"```

In [None]:
comments = []
for line in disqus_df.content:
    soup = BeautifulSoup(line, 'html.parser')
    try:
        comments.append(soup.find('div', attrs={'id': 'disqus_thread'}).prettify()) # .prettify() makes the HTML of the comments more human readable
    except AttributeError:
        comments.append("")
        
disqus_df['comments'] = comments

In [None]:
#pd.set_option('display.max_colwidth', 500) 
disqus_df[['crawl_date','url','comments']]

# Add an internet archive URL to the dataframe

In [None]:
ah.add_ia_url(disqus_df)

Filter out comment fields that are so short that they do net contain comments (based on the length).  
32 characters is currently the shortest 

In [None]:
#pd.set_option('display.max_colwidth', 50)
disqus_df[disqus_df['comments'].apply(lambda x: len(x)>32)][['comments','IA_url']]

Display a specific comment

In [None]:
ah.display_comment(disqus_df, 0)

# Comparing dataframes

Creates a second dataframe with a different regex query 

In [None]:
#disqus_df_2 = ah.filter_csv_content_regex("data/World/html-file-information.csv", "(?i)disqus\.js", 10000)

Show the overlap of two dataframes

In [None]:
#ah.dataframe_intersection(disqus_df[['crawl_date','url']],disqus_df_2[['crawl_date','url']])

Show the difference between the two datasets. So only shows results that are in one dataset but not in both.

In [None]:
#ah.dataframe_difference(disqus_df[['crawl_date','url']],disqus_df_2[['crawl_date','url']])

# Testing scratchbook

Reloads the ARCH Helper module after update.  
Normally this would be done with autoreload Ipython magic, but for some reason it does not work. 

In [None]:
import importlib
importlib.reload(ah)
import ARCH_Helper as ah

In [None]:
print(len(disqus_embed_df), len(disqus_identifier_df), len(disqus_thread))

In [None]:
ah.dataframe_intersection(disqus_embed_df[['crawl_date','url']],disqus_identifier_df[['crawl_date','url']])