In [90]:
import os
import pycurl
from io import BytesIO 
import certifi 
import difflib
import sys, traceback
import stem.process
from stem.util import term

In [56]:
def query(url, istor, SOCKS_PORT = 7000):
  """
  Fetch ``url`` with pycurl and return the response body.

  Redirects are followed and TLS certificates are checked against the CA
  bundle shipped with certifi.

  Args:
    url: Address to fetch.
    istor: When truthy, route the request through the SOCKS5 proxy listening
      on localhost:SOCKS_PORT (hostname resolution is delegated to the proxy).
    SOCKS_PORT: Port of the local SOCKS proxy; only used when ``istor`` is
      truthy.

  Returns:
    The response body as ``bytes`` on success, or a ``str`` of the form
    "Unable to reach <url> (<error>)" when pycurl reports a transfer error.
    Callers can tell the two apart with ``isinstance(result, bytes)``.
  """
  output = BytesIO()

  # Named `curl` rather than `query` so the handle does not shadow this function.
  curl = pycurl.Curl()
  try:
    curl.setopt(pycurl.CAINFO, certifi.where())
    curl.setopt(pycurl.FOLLOWLOCATION, 1)

    if istor:
      # Send the request through the local SOCKS proxy.
      curl.setopt(pycurl.PROXY, 'localhost')
      curl.setopt(pycurl.PROXYPORT, SOCKS_PORT)
      curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME)

    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, output.write)

    try:
      curl.perform()
    except pycurl.error as exc:
      return "Unable to reach %s (%s)" % (url, exc)
    return output.getvalue()
  finally:
    # Always release the libcurl handle (and its sockets); this function is
    # called once per URL per route, so leaked handles add up quickly.
    curl.close()


In [57]:
def print_bootstrap_lines(line):
  """
  Print ``line`` only if it contains the text "Bootstrapped ".

  Passed as the ``init_msg_handler`` when launching Tor so that only the
  bootstrap-progress messages are echoed; every other line is dropped.
  """
  if "Bootstrapped " not in line:
    return
  print(line)

In [108]:
# Fetch every URL listed in alexa100.csv twice -- once through a Tor circuit
# whose exit is restricted to country code {se}, once directly -- and save
# both copies plus a line-by-line diff under se-<exit ip>/files/.
SOCKS_PORT = 7000
print(term.format("Starting Tor:\n", term.Attr.BOLD))

tor_process = stem.process.launch_tor_with_config(
  config = {
    'SocksPort': str(SOCKS_PORT),
    'ExitNodes': '{se}',
  },
  init_msg_handler = print_bootstrap_lines,
)


def _save_text(path, text):
    """Write ``text`` to ``path``, closing the file even if the write fails.

    Pages are decoded as ISO-8859-1 below, so writing with the same codec
    reproduces the fetched bytes regardless of the locale's default encoding;
    anything outside that codec (e.g. in an error message) is replaced.
    """
    with open(path, 'w', encoding='ISO-8859-1', errors='replace') as out:
        out.write(text)


# Everything after the Tor launch lives inside the try block so that the
# finally clause always stops the Tor process, including when the URL list
# cannot be opened.
try:
    # Ask an echo service which address our Tor traffic appears to come from;
    # it is used to name the output directory.
    ip = query("https://www.atagar.com/echo.php", True, SOCKS_PORT)
    if not isinstance(ip, bytes):
        # query() hands back a str only when the request failed.
        raise RuntimeError(ip)
    ip = ip.decode('utf-8')
    ip_fields = ip.split(':')
    if len(ip_fields) < 2:
        raise RuntimeError("Unexpected echo response: %r" % ip)
    # Keep the text between the first ':' and the following '(' and drop the
    # surrounding whitespace so it does not end up in the directory name.
    ip = ip_fields[1].split('(')[0].strip()

    savefile = 'se-' + ip + '/files/'
    os.makedirs(savefile, exist_ok=True)

    with open('alexa100.csv', 'r') as allurls:
        for aurl in allurls:
            # strip() also removes the '\r' of CRLF files and stray spaces.
            aurl = aurl.strip()
            if not aurl:
                continue
            print(aurl)

            # Output files are named after the second dot-separated piece of
            # the URL (e.g. "google" for http://www.google.com).
            url_parts = aurl.split('.')
            if len(url_parts) < 2:
                print("Skipping %s: cannot derive a file name" % aurl)
                continue
            site = url_parts[1]

            # Pass the port explicitly so the proxy used by query() always
            # matches the port Tor was started on.
            page_tor = query(aurl, True, SOCKS_PORT)
            page_regular = query(aurl, False)

            # Successful fetches are bytes; failures are already str messages
            # and are saved as-is.
            if isinstance(page_tor, bytes):
                page_tor = page_tor.decode('ISO-8859-1')
            if isinstance(page_regular, bytes):
                page_regular = page_regular.decode('ISO-8859-1')

            _save_text(savefile + site + '_tor.html', page_tor)
            _save_text(savefile + site + '.html', page_regular)

            diff = difflib.ndiff(page_tor.splitlines(keepends=True),
                                 page_regular.splitlines(keepends=True))
            _save_text(savefile + site + '_diff.html', ''.join(diff))
except Exception:
    # Report the failure but still fall through to the Tor shutdown below.
    traceback.print_exc()
finally:
    tor_process.kill()  # stops tor

[1mStarting Tor:
[0m
Jun 18 01:50:03.000 [notice] Bootstrapped 0%: Starting
Jun 18 01:50:03.000 [notice] Bootstrapped 80%: Connecting to the Tor network
Jun 18 01:50:04.000 [notice] Bootstrapped 85%: Finishing handshake with first hop
Jun 18 01:50:05.000 [notice] Bootstrapped 90%: Establishing a Tor circuit
Jun 18 01:50:06.000 [notice] Bootstrapped 100%: Done
Uses pycurl to fetch a site
http://www.google.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.facebook.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.youtube.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.yahoo.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.baidu.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.wikipedia.org
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.qq.com
Uses pycurl to fetch a site
Uses pycurl to fetch a site
http://www.linkedin.com
Uses pycurl to fetch a site
Uses pycur

KeyboardInterrupt: 