In [23]:
import requests
import schedule
import smtplib
import pickle
import time
import tqdm
import glob
import ssl
import os
import re

import pandas as pd
import numpy  as np

from fake_useragent import UserAgent
from collections    import defaultdict
from statistics     import mean
from torrequest     import TorRequest
from random         import randint
from bs4            import BeautifulSoup as bs

# Introduction

We have a few goals with this notebook:
1. Scrape a website using BeautifulSoup
2. Use Tor to avoid IP detection
3. Randomly switch user agents to avoid detection
4. Schedule scraping to happen automatically every day
5. Email ourselves with information about scraping

The website that we'll be scraping can be found [here](http://www.vdc-sy.info/index.php/en/martyrs). They present their information in a dataframe-like table, but unfortunately it doesn't display all the information they have per entry. If we click on a name, we are brought to another page displaying more detailed information on that entry. Luckily, the data we want to scrape from this site can be accessed directly by changing the number at the end of the URL, which ranges up to around 250,000.

# Libraries

Before anything, we'll need to install Tor and configure it.

In [None]:
!pip install tor

After installing Tor, we'll want to set the password for Tor's control port. To do this, we run `tor --hash-password <password>` in a terminal, which prints the hashed form of the password.

Make sure to copy the hashed password!

Now we need to make some changes to our Tor configuration file. In Mac OS, it can be found in usr/etc/tor/torrc

We have to uncomment lines 57, 60, and 61.

In the HashedControlPassword field, paste the hashed password you just got earlier.

Torrequests is the library that we'll use in order to send requests over the Tor Network. It is a very simple wrapper for the regular requests library.

In [None]:
!pip install torrequest

Finally we can install fake-useragent, which will let us cycle through new useragents.

In [None]:
!pip install fake_useragent

# Scraping

First we'll write a couple helper functions that make it easier to save files locally. This will let us resume progress if any errors kill our script. These files will be saved to the local directory as pickle files.

In [24]:
def save(obj, name):
    """Pickle ``obj`` to the file ``<name>.pickle``.

    Args:
        obj: Any picklable object.
        name: Destination path without the ``.pickle`` extension.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; the bare open() used before leaked the file handle.
    with open(name + '.pickle', 'wb') as handle:
        pickle.dump(obj, handle)

def load(name):
    """Return the object stored in the pickle file ``<name>.pickle``.

    Args:
        name: Source path without the ``.pickle`` extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If ``<name>.pickle`` does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or a trusted collaborator) wrote.
    # The context manager closes the handle, which the bare open() never did.
    with open(name + '.pickle', 'rb') as handle:
        return pickle.load(handle)

Now for the scraping. This will be relatively easy. We will get the page using a TorRequests to mask our IP, navigate to the data we need using BeautifulSoup, create a dataframe from that data, and save it to a file.

The function scrape_recent() will handle the scraping for the most recent ids. It does this by finding all links that match a regex pattern and then checking these against a list of all previous ids. We'll use the next function to handle the rest.

The function scrape_details() will handle scraping the pages on the website that contain the detailed information of an entry. This represents **one** person. Each person is saved and returned as a dataframe, since a person will have varying amounts and types of labels/columns and will have to be combined into one large dataset later. 

In [25]:
def scrape_recent():
    """Collect the uids linked from the first page of the martyrs listing.

    The listing is fetched through a new TorRequest using a random User-Agent.
    Every ``<a href>`` that starts with ``/index.php/en/details/martyrs/`` and
    has at least one more character is kept.

    Returns:
        set[str]: The part of each matching href that follows the prefix.
        Empty if the request or the parsing fails (the error is printed).
    """
    first_page = 'http://www.vdc-sy.info/index.php/en/martyrs/1/c29ydGJ5PWEua2lsbGVkX2RhdGV8c29ydGRpcj1ERVNDfGFwcHJvdmVkPXZpc2libGV8ZXh0cmFkaXNwbGF5PTB8'
    prefix     = '/index.php/en/details/martyrs/'
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ('\/', '\.'), which newer Python versions warn about.
    pattern    = re.compile(r'/index\.php/en/details/martyrs/.')

    ua         = UserAgent()
    headers    = {'User-Agent': ua.random}
    # NOTE(review): hard-coded Tor control password; consider loading it from
    # configuration instead of keeping it in the notebook.
    tor        = TorRequest(password = 'commonhorse')

    # Defined up front so that a failed request returns an empty set instead
    # of raising UnboundLocalError at the return statement.
    links = set()

    try:
        response = tor.get(first_page, headers=headers)
        content  = bs(response.text, 'html.parser')

        # Strip the fixed prefix so only the trailing uid part remains.
        links    = {
            link['href'][len(prefix):]
            for link in content.find_all('a', href = True)
            if pattern.match(link['href'])
        }

    except Exception as e:
        print(e)

    return links

In [27]:
def scrape_details(uid, tor, headers):
    """Scrape the detail page of one entry and store it as a one-row dataframe.

    Args:
        uid: String appended to the details URL to identify the entry.
        tor: Object whose ``get(url, headers=...)`` performs the request
            (a TorRequest in this notebook).
        headers: Request headers; carries the User-Agent to use.

    Returns:
        pandas.DataFrame: One row whose columns are the labels found in the
        page's ``peopleListing`` table plus a trailing ``uid`` column. The same
        frame is pickled to ``leftovers/<uid>.pickle``.

    Raises:
        ValueError: If the page contains no ``peopleListing`` table.
    """
    cols = []
    vals = []

    url  = 'http://www.vdc-sy.info/index.php/en/details/martyrs/' + uid

    # Headers provide the User-Agent to use when getting the response.
    # The request is made with the TorRequest object passed in.
    page = tor.get(url, headers = headers).text
    page = bs(page, 'html.parser')

    table = page.find('table', attrs = {'class':'peopleListing'})
    if table is None:
        # Fail with a clear message instead of the opaque AttributeError that
        # calling find_all on None would produce.
        raise ValueError('No peopleListing table found for uid {}'.format(uid))

    for row in table.find_all('tr'):
        data = row.find_all('td')

        # Rows that do not have exactly 2 cells are not label/value pairs.
        if len(data) != 2:
            continue

        # data[0] holds the row label, which becomes the column name.
        cols.append(data[0].text)

        # Image rows store the image source instead of the cell text.
        image = data[1].find('img')
        if image is not None:
            vals.append(image['src'])
        else:
            vals.append(data[1].text)

    # Adds the uid to the dataframe.
    cols.append('uid')
    vals.append(uid)

    # Creates and saves the dataframe; make sure the target folder exists so
    # the first run on a clean checkout does not fail.
    person = pd.DataFrame([vals], columns = cols, dtype=str)
    os.makedirs('leftovers', exist_ok=True)
    save(person, os.path.join('leftovers', uid))

    return person

Now we can prepare the function to send an email. We'll have to change our Google Account settings first. We'll then have to turn on the 'Less secure app access' in the Security tab. This will allow us to log in and send emails through an account using python.

Both the ssl and smtplib libraries are part of the Python standard library.

In [28]:
"""
Adapted from https://realpython.com/python-send-email/
"""
import ssl
import smtplib

def send_email():
    """Send the fixed "scrape succeeded" notification through Gmail over SSL.

    The account password comes from ``get_password()``. Sender and receiver
    addresses are fixed inside the function.
    """
    # We must use this port for SSL.
    port     = 465
    password = get_password()

    sender_email   = "romanlosul@gmail.com"
    receiver_email = "rsul@ucsc.edu"

    # The Subject header has to start in the first column and be separated
    # from the body by a blank line. The previous indented triple-quoted
    # string put leading spaces before "Subject:", so it was not a valid
    # header line.
    message = (
        "Subject: VDC Scrape Log\n"
        "\n"
        "Scraping was a success.\n"
    )

    context = ssl.create_default_context()
    with smtplib.SMTP_SSL("smtp.gmail.com", port, context=context) as server:
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, message)

In [None]:
# # init_uids()
# uids_to_scrape = load('uids_to_scrape')
# uids_scraped   = load('uids_scraped')

Using Tor without also changing our User Agent is useless, since we can still be identified by our User Agent.

We will use the [fake-useragent](https://github.com/hellysmile/fake-useragent) library to cycle through random user agents. Other approaches often rely on User Agents that are out of date or uncommon, but this library's User Agent selection is based on usage statistics from [http://useragentstring.com/](http://useragentstring.com/).

Our process flow can be described as:
    1. Get a random UserAgent
    2. Create TorRequest instance
    3. Remove uid from Queue
    4. Scrape details with uid
    5. Save progress
    6. Repeat 3-5 until Queue is empty

In [6]:
def _new_scrape_session(password = 'commonhorse'):
    """Return ``(tor, headers)``: a new TorRequest and a random User-Agent header."""
    # Slightly modified from the UserAgent docs: create a UserAgent object
    # and put a random User-Agent string into a header dict.
    ua      = UserAgent()
    headers = {'User-Agent': ua.random}

    # Setup follows
    # https://www.scrapehero.com/make-anonymous-requests-using-tor-python/
    # The password is the one chosen earlier when configuring Tor.
    # NOTE(review): hard-coded credential; consider loading it from config.
    tor = TorRequest(password = password)

    # reset_identity() should reset our IP address, but per the original
    # author's note it currently has a bug; it is kept for when TorRequest is
    # updated. Until then a new TorRequest object is what gives a new IP.
    tor.reset_identity()

    return tor, headers


def job():
    """Scrape every recently listed uid that has not been scraped yet, then email.

    Loads the already-scraped uids from ``scraped_uids.pickle``, collects the
    uids on the first listing page, and scrapes the detail page of each unseen
    one. After every success the updated scraped-uid collection is saved so
    that progress survives a crash. When a scrape fails, the error is printed,
    the uid goes back on the queue, and a new Tor session and User-Agent are
    created before retrying. An email is sent once the queue is empty.
    """
    tor, headers = _new_scrape_session()

    # Load the uids that were already scraped and scrape the first page to
    # get the recent ids.
    # NOTE(review): scraped_uids is presumably a set, since .add() is called
    # on it below; confirm against the file that was pickled.
    scraped_uids   = load('scraped_uids')
    recent_uids    = scrape_recent()

    # Only ids that have not been seen before need their detail page scraped.
    uids_to_scrape = [uid for uid in recent_uids if uid not in scraped_uids]

    while uids_to_scrape:
        uid = uids_to_scrape.pop()

        try:
            scrape_details(uid, tor, headers)

        except Exception as e:
            print(e)
            # The queue is a list, so the uid is re-queued with append();
            # the previous .add() call raised AttributeError here and
            # aborted the whole job on the first failed scrape.
            uids_to_scrape.append(uid)

            # Retry with a new User-Agent and Tor identity.
            tor, headers = _new_scrape_session()

            continue

        print('Left to scrape: ', len(uids_to_scrape))
        scraped_uids.add(uid)
        save(scraped_uids, 'scraped_uids')

    send_email()
        

We can now schedule our scraping to run automatically once every day, as long as this notebook is up and running. The cell below uses `17:53` as the run time; change it to `"00:00"` to scrape at 12 AM.

In [15]:
# Register the scraping job to run once a day at the given local time.
schedule.every().day.at("17:53").do(job)

# Poll the scheduler until the cell is interrupted. Sleeping between polls
# keeps the loop from spinning a CPU core at 100% while nothing is pending.
while True:
    schedule.run_pending()
    time.sleep(1)

KeyboardInterrupt: 

In [13]:
def get_password():
    """Return the password used to log in to the sender's email account.

    The ``VDC_EMAIL_PASSWORD`` environment variable takes precedence when it
    is set and non-empty.

    Returns:
        str: The password.
    """
    # NOTE(review): the literal below is a credential committed to the
    # notebook. It is kept only so existing runs keep working; set
    # VDC_EMAIL_PASSWORD, then delete the fallback and rotate the password.
    return os.environ.get('VDC_EMAIL_PASSWORD') or 'dipping1_mitzvoth'

In [29]:
# Load each collaborator's pending work queue from its pickle file.
# NOTE(review): the per-person cells call .pop()/.append() on these, so they
# are presumably lists of uid strings; confirm against the pickled files.
helen___uids_to_scrape = load('helen___uids_to_scrape')
katelyn_uids_to_scrape = load('katelyn_uids_to_scrape')
matthew_uids_to_scrape = load('matthew_uids_to_scrape')
roman___uids_to_scrape = load('roman___uids_to_scrape')

# Load each collaborator's record of uids that are already done.
# NOTE(review): the per-person cells call .add() on these, so they are
# presumably sets; confirm against the pickled files.
helen___uids_scraped   = load('helen___uids_scraped')
katelyn_uids_scraped   = load('katelyn_uids_scraped')
matthew_uids_scraped   = load('matthew_uids_scraped')
roman___uids_scraped   = load('roman___uids_scraped')

# Helen

In [34]:
# Start a Tor session with a random User-Agent for Helen's share of the uids.
ua = UserAgent()
headers = {'User-Agent': ua.random}
tor = TorRequest(password = 'commonhorse')
tor.reset_identity()

# Resume from whatever progress was last written to disk.
helen___uids_scraped = load('helen___uids_scraped')
helen___uids_to_scrape = load('helen___uids_to_scrape')

# Work through the queue until nothing is left.
while helen___uids_to_scrape:
    uid = helen___uids_to_scrape.pop()

    try:
        scrape_details(uid, tor, headers)
    except Exception as err:
        # Report the failure, put the uid back, and retry with a new
        # User-Agent and Tor session.
        print(err)
        helen___uids_to_scrape.append(uid)

        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        tor = TorRequest(password = 'commonhorse')
        tor.reset_identity()
    else:
        print('Left to scrape: ', len(helen___uids_to_scrape))
        helen___uids_scraped.add(uid)

        # Persist both collections after every success so a crash loses
        # at most the entry in flight.
        save(helen___uids_to_scrape, 'helen___uids_to_scrape')
        save(helen___uids_scraped, 'helen___uids_scraped')

Left to scrape:  8047
Left to scrape:  8046
Left to scrape:  8045
Left to scrape:  8044
Left to scrape:  8043
Left to scrape:  8042
Left to scrape:  8041
Left to scrape:  8040
Left to scrape:  8039
Left to scrape:  8038
Left to scrape:  8037
Left to scrape:  8036
Left to scrape:  8035
Left to scrape:  8034
Left to scrape:  8033
Left to scrape:  8032
Left to scrape:  8031
Left to scrape:  8030
Left to scrape:  8029
Left to scrape:  8028
Left to scrape:  8027
Left to scrape:  8026
Left to scrape:  8025
Left to scrape:  8024
Left to scrape:  8023
Left to scrape:  8022
Left to scrape:  8021
Left to scrape:  8020
Left to scrape:  8019
Left to scrape:  8018
Left to scrape:  8017
Left to scrape:  8016
Left to scrape:  8015
Left to scrape:  8014
Left to scrape:  8013
Left to scrape:  8012
Left to scrape:  8011
Left to scrape:  8010
Left to scrape:  8009
Left to scrape:  8008
Left to scrape:  8007
Left to scrape:  8006
Left to scrape:  8005
Left to scrape:  8004
Left to scrape:  8003
Left to sc

KeyboardInterrupt: 

# Katelyn

list

# Matthew

In [None]:
# Start a Tor session with a random User-Agent for Matthew's share of the uids.
ua = UserAgent()
headers = {'User-Agent': ua.random}
tor = TorRequest(password = '')
tor.reset_identity()

# Resume from whatever progress was last written to disk.
matthew_uids_scraped = load('matthew_uids_scraped')
matthew_uids_to_scrape = load('matthew_uids_to_scrape')

# Work through the queue until nothing is left.
while matthew_uids_to_scrape:
    uid = matthew_uids_to_scrape.pop()

    try:
        scrape_details(uid, tor, headers)
    except Exception as err:
        # Report the failure, put the uid back, and retry with a new
        # User-Agent and Tor session.
        print(err)
        matthew_uids_to_scrape.append(uid)

        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        tor = TorRequest(password = '')
        tor.reset_identity()
    else:
        print('Left to scrape: ', len(matthew_uids_to_scrape))
        matthew_uids_scraped.add(uid)

        # Persist both collections after every success so a crash loses
        # at most the entry in flight.
        save(matthew_uids_to_scrape, 'matthew_uids_to_scrape')
        save(matthew_uids_scraped, 'matthew_uids_scraped')