In [106]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import re
from random import sample
import logging
from ftlangdetect import detect  
# fasttext is the fastest & most accurate library for language detection, but requires manual downloading of a pre-trained model; 
# this library is a wrapper of fasttext and gets rid of the need of it

In [24]:
def set_chrome_driver():
    chrome_options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    return driver

In [None]:
pragmaticblog_url = 'https://edit.tosdr.org//services/1452/annotate'
pragmaticblog_html = scraper.get_html_source(pragmaticblog_url)
scraper.scrape_documents_per_service('pragmaticblog', documents_html=pragmaticblog_html, url=pragmaticblog_url, out_file_name="./tosdr_pragmaticblog.jsonl")

2022-11-21 03:58:15,661 - [scrape_documents_per_service] - No document for (service: pragmaticblog)! https://edit.tosdr.org//services/1452/annotate


In [111]:
class ToSDRScraper():
    def __init__(self, out_file="./tosdr.jsonl", log_file='./test.log'):
        self.driver = set_chrome_driver()
        self.out_file = out_file
        self.write_f = open(self.out_file, 'w')
        self.base_url = 'https://edit.tosdr.org'
        
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - [%(funcName)s] - %(message)s',
            handlers=[logging.FileHandler(log_file), logging.StreamHandler()]
        )
        self.logger = logging.getLogger('tosdr_logger')
        
        
    def login(self, email='shparksue@gmail.com', password='2022-2NLPproject'):
        """Activates authenticated session"""
        self.driver.get('https://edit.tosdr.org/users/sign_in')
        
        self.driver.find_element(By.ID, 'user_email').send_keys(email)
        self.driver.find_element(By.ID, 'user_password').send_keys(password)
        self.driver.find_element(By.XPATH, '//*[@id="new_user"]/div[2]/input').click()
    
    
    def get_html_source(self, url: str, timeout=0):
        """Get the HTML source to directly instantiate a new BeautifulSoup object (possibly for debugging purposes)"""
        self.driver.get(url)
        if timeout:
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, 'table'))
            )
        return self.driver.page_source
    
    
    def scrape_services(self):
        """Get urls for the annotated documents of each service and process each document"""
        services_html = self.get_html_source(url='https://edit.tosdr.org/services', timeout=10)  # takes some time to load the full page
        services_soup = BeautifulSoup(services_html, 'html.parser')
        table = services_soup.select_one('table.table.table-striped')
        all_services = table.find_all('tr', {'data-classification': ['A', 'B', 'C', 'D', 'E']})
        print("Total number of services:", len(all_services))
        
        for row in tqdm(all_services):
            columns = row.find_all('td')
            service = columns[1].text.strip()
            url = self.base_url + columns[4].find('a', href=True)['href']
            documents_html = self.get_html_source(url)
            self.scrape_documents_per_service(service, url, documents_html)
            
            
    def scrape_documents_per_service(self, service_name, url, documents_html, out_file_name=None):
        """Scrape each ToS document of each service and write the data into .jsonl format"""
        if out_file_name:  # for DEBUG
            self.out_file = out_file_name
            self.write_f = open(self.out_file, 'w')
            
        documents_soup = BeautifulSoup(documents_html, 'html.parser')
        documents = documents_soup.select('div.panel.panel-default')
        if not documents:
            self.logger.info(f"No document for (service: {service_name})! {url}")
            return
        
        for document_elements in documents:
            document_data = {'service': service_name, 'url': url}
            document_data = self._parse_document(service_name, url, document_data, document_elements)
            if not document_data:
                continue
            self.write_f.write(json.dumps(document_data) + '\n')
            
    
    def _parse_document(self, service_name, url, document_data, document_elements):
        """Parse each document return a dictionary of the structured information"""
        title = document_elements.select_one('h3').text
        original_text, summary = self._generate_document_data(document_elements)
        if not original_text:
            self.logger.info(f"No content inside (service: {service_name}, document: {title})! {url}")
            return None
        if not summary:
            self.logger.info(f"No annotation for (service: {service_name}, document: {title})! {url}")
            return None
        if not self._detect_english(original_text):
            self.logger.info(f"Not English (service: {service_name}, document: {title})! {url}")
            return None
        
        return document_data | {'document_title': title,
                'original_text_length': len(original_text),
                'summary_length': len(summary),
                'original_text': original_text, 
                'summary': summary}  # merge dictionary
        
        
    def _generate_document_data(self, document_elements):
        """Iterate through the sections and divide the parsed sections of the documents into the original text and summary"""
        # NOTE: the first or last sentence in the annotated section, and the sentences before and after it can contain incomplete phrases
        full_doc = []
        summary = []
        ptr = document_elements.select_one('.panel-body.documentContent > p')
        for section in ptr.next_siblings:  # iterate through the contents
            if section.text.strip():  # tag content is not empty
                sentences = self._parse_section(section.text)
                if section.select('a'):  # is hyperlinked, i.e., annotated by users
                    summary.extend(sentences)
                full_doc.extend(sentences)
        return full_doc, summary
    
    
    def _parse_section(self, text):
        """Break down each section into sentences"""
        sentences = []
        for segments in re.split('<.+?>', text):
            sentences.extend(segment.strip() for segment in segments.split('\n') if segment.strip())
        return sentences
    
    def _detect_english(self, original_text):
        """Returns True if the text is in English, otherwise False"""
        # NOTE: At first try, it takes about 30s to download the pre-trained FastText model
        result = set()
        sentences = sample(original_text, 3)
        for sentence in sentences:
            result.add(detect(text=sentence)['lang'])
        return True if 'en' in result else False

In [112]:
scraper = ToSDRScraper()

2022-11-21 04:23:20,817 - [log] - Get LATEST chromedriver version for google-chrome 107.0.5304
2022-11-21 04:23:21,168 - [log] - Driver [/Users/suepark/.wdm/drivers/chromedriver/mac_arm64/107.0.5304/chromedriver] found in cache


In [113]:
scraper.login()

Multiple documents & multiple annotations

In [83]:
spotify_url = 'https://edit.tosdr.org//services/225/annotate'
spotify_html = scraper.get_html_source(spotify_url)
scraper.scrape_documents_per_service('Spotify', documents_html=spotify_html, url=spotify_url, out_file_name="./tosdr_spotify.jsonl")

In [84]:
instagram_url = 'https://edit.tosdr.org//services/219/annotate'
instagram_html = scraper.get_html_source(instagram_url)
scraper.scrape_documents_per_service('Instagram', documents_html=instagram_html, url=instagram_url, out_file_name="./tosdr_instagram.jsonl")

Non-English

In [85]:
seenthis_url = 'https://edit.tosdr.org//services/330/annotate'
seenthis_html = scraper.get_html_source(seenthis_url)
scraper.scrape_documents_per_service('SeenThis', documents_html=seenthis_html, url=seenthis_url, out_file_name="./tosdr_seenthis.jsonl")

2022-11-21 03:58:11,448 - [_parse_document] - Not English (service: SeenThis, document: Intellectual Property)! https://edit.tosdr.org//services/330/annotate


0 document

In [86]:
pragmaticblog_url = 'https://edit.tosdr.org//services/1452/annotate'
pragmaticblog_html = scraper.get_html_source(pragmaticblog_url)
scraper.scrape_documents_per_service('pragmaticblog', documents_html=pragmaticblog_html, url=pragmaticblog_url, out_file_name="./tosdr_pragmaticblog.jsonl")

2022-11-21 03:58:15,661 - [scrape_documents_per_service] - No document for (service: pragmaticblog)! https://edit.tosdr.org//services/1452/annotate


1 Document but 0 content

In [87]:
gnome_url = 'https://edit.tosdr.org//services/2781/annotate'
gnome_html = scraper.get_html_source(gnome_url)
scraper.scrape_documents_per_service('gnome', documents_html=gnome_html, url=gnome_url, out_file_name="./tosdr_gnome.jsonl")

2022-11-21 03:58:18,597 - [_parse_document] - No content inside (service: gnome, document: Code of Conduct)! https://edit.tosdr.org//services/2781/annotate


Document with 0 annotation

In [88]:
musicbrainz_url = 'https://edit.tosdr.org//services/736/annotate'
musicbrainz_html = scraper.get_html_source(musicbrainz_url)
scraper.scrape_documents_per_service('musicbrainz', documents_html=musicbrainz_html, url=musicbrainz_url, out_file_name="./tosdr_musicbrainz.jsonl")

2022-11-21 03:58:22,607 - [_parse_document] - No annotation for (service: musicbrainz, document: Code of Conduct)! https://edit.tosdr.org//services/736/annotate


In [114]:
scraper.scrape_services()

Total number of services: 1255


  0%|          | 3/1255 [00:08<58:17,  2.79s/it]2022-11-21 04:23:58,102 - [_parse_document] - No annotation for (service: Flickr, document: Flickr Privacy Policy)! https://edit.tosdr.org/services/186/annotate
2022-11-21 04:23:58,104 - [_parse_document] - No annotation for (service: Flickr, document: Community Guidelines)! https://edit.tosdr.org/services/186/annotate
2022-11-21 04:23:58,110 - [_parse_document] - No annotation for (service: Flickr, document: Data Processing Policy - Date Uncertain, Sometime during or after 2017)! https://edit.tosdr.org/services/186/annotate
2022-11-21 04:23:58,131 - [_parse_document] - No annotation for (service: Flickr, document: Community Guidelines - April 30th 2020)! https://edit.tosdr.org/services/186/annotate
  0%|          | 5/1255 [00:13<56:24,  2.71s/it]  2022-11-21 04:24:03,487 - [_parse_document] - No annotation for (service: Goguardian, document: Product Terms)! https://edit.tosdr.org/services/1625/annotate
2022-11-21 04:24:03,491 - [_parse_d

KeyboardInterrupt: 