In [1]:
from spark_uid import *

### Choose crawl to load

In [2]:
import os

# Numeric id of the crawl to analyse; it selects both the OpenWPM log
# sub-directory and the output directory used below.
crawl = 3

# Directory containing the OpenWPM logs. Set the OPENWPM_LOG_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original author's location.
openwpm_log_dir = os.environ.get('OPENWPM_LOG_DIR', '/Users/sammacbeth/work/OpenWPM/logs/')

# Directory where the analysis results for this crawl are stored and reloaded from.
crawl_output_dir = './data/crawls/{}'.format(crawl)

### Load previously analysed crawl data, or analyse the raw OpenWPM logs if this crawl has not been analysed yet

In [4]:
import os
import json
from ast import literal_eval

# The analysis counts as already done only when both result directories exist
# under crawl_output_dir.
cached_outputs = ['requests', 'uid_reach']
have_cached_analysis = all(
    os.path.exists(crawl_output_dir + '/' + name) for name in cached_outputs
)

if not have_cached_analysis:
    # No saved results yet: analyse the logs for this crawl and save the output.
    requests, uid_reach = run_analysis(sc, input_dir='{}/{}/'.format(openwpm_log_dir, crawl))
    save_analysis_rdds(requests, uid_reach, crawl_output_dir)
else:
    # Saved results found: requests are read back as JSON lines, uid_reach as
    # Python literals (and cached).
    requests = sc.textFile('{}/requests/'.format(crawl_output_dir)).map(json.loads)
    uid_reach = sc.textFile('{}/uid_reach/'.format(crawl_output_dir)).map(literal_eval).cache()


In [5]:
# get urls from crawl
import sqlite3
import pandas as pd

# Open the crawl-data.sqlite database that sits in the OpenWPM log directory.
conn = sqlite3.connect('{}/crawl-data.sqlite'.format(openwpm_log_dir))

# Bind the crawl id as a query parameter (sqlite3 "?" placeholder) instead of
# formatting it into the SQL text, so the value is never interpreted as SQL.
crawl_urls = pd.read_sql(
    "SELECT site_url FROM site_visits WHERE crawl_id = ?",
    conn,
    params=(crawl,),
)
crawl_urls

Unnamed: 0,site_url
0,http://ebay.de
1,http://ebay-kleinanzeigen.de
2,http://google.de
3,http://youtube.com
4,http://amazon.de
5,http://bild.de
6,http://t-online.de
7,http://suchen.mobile.de
8,http://web.de
9,http://jappy.de


## Make UID Tables

In [6]:
# Turn the uid_reach results into DataFrames (built with sqlContext) and hand
# them to register_tables.
# NOTE(review): presumably register_tables exposes each DataFrame as a named
# SparkSQL table for the query helpers used in later cells — confirm in spark_uid.
uid_tables = uid_reach_as_dataframes(uid_reach, sqlContext)
register_tables(uid_tables)

In [7]:
# Show the first 20 rows of the top-UID query as a pandas DataFrame.
top_uids = query_top_uids(sqlContext).limit(20)
top_uids.toPandas()

Unnamed: 0,uid_id,domain,duration,non_fp_uniques,tp_domains,uids,cookies,qs,ps
0,0,doubleclick.net,2.383779,847,238,2,2,0,0
1,1,doubleclick.net,2.251869,769,219,3,3,0,0
2,2,doubleclick.net,1.953368,638,185,2,2,0,0
3,3,doubleclick.net,1.944625,627,184,3,3,0,0
4,4,google.com,2.35039,670,138,1,1,0,0
5,7,adnxs.com,2.370054,295,112,1,1,0,0
6,9,adition.com,2.341146,285,109,1,0,0,0
7,10,adnxs.com,2.348995,290,109,1,0,0,0
8,15,facebook.com,2.206429,356,108,1,1,0,0
9,16,facebook.com,2.204574,356,108,2,2,0,0


In [13]:
from IPython.display import display
# Inspect one UID group: first display the rows returned for this uid_id, then
# show up to ten rows for the same uid_id from the 'non_fp_uniques' table.
check_id = 0

uid_rows = query_uid_id(uid_tables, check_id)
display(uid_rows.toPandas())

uid_non_fp_uniques = query_uid_id(uid_tables, check_id, from_table='non_fp_uniques')
uid_non_fp_uniques.limit(10).toPandas()

Unnamed: 0,classification,domain,key,source,uid_id,value
0,uid,doubleclick.net,IDE,cookie,0,AHWqTUmtibDqZeS7vRUk7PMwSQcOMA9huBOxBNc1KwtCk1...
1,uid,doubleclick.net,id,cookie,0,2280ee95690e0078||t=1484152725|et=730|cs=00221...


Unnamed: 0,source,uid_id,url
0,referer_url,0,http://www.autobild.de/messen/auto-salon-paris/
1,referer,0,http://www.giga.de/apps/google-play-store/spec...
2,url,0,http://www.tvtoday.de/tv-programm
3,referer_url,0,http://rtb-csync.smartadserver.com/redir/?part...
4,url,0,http://www.ariva.de/
5,referer,0,https://aka-cdn.adtech.de/apps/397/Ad15458189S...
6,url,0,http://www.aboutyou.de/frauen/bekleidung/jeans
7,url,0,http://www.o2online.de/e-shop/apple/apple-ipho...
8,referer,0,http://tap2-cdn.rubiconproject.com/partner/scr...
9,referer,0,https://www.zalando.de/


## Make requests accessible to SparkSQL

In [9]:
# Turn the requests data into DataFrames (built with sqlContext) and hand them
# to register_tables.
# NOTE(review): the `request` table queried with sqlContext.sql in the next cell
# is presumably registered here — confirm in spark_uid.
request_tables = requests_as_dataframes(requests, sqlContext)
register_tables(request_tables)

In [12]:
# Show a sample of ten POST requests from the `request` table.
post_requests_sql = "SELECT * FROM request WHERE method = 'POST' LIMIT 10"
sqlContext.sql(post_requests_sql).toPandas()

Unnamed: 0,host,method,path,port,res_status,rid,scheme,text,tld,ts,url
0,www.youtube.com,POST,/ad_data_204,443,204,872,https,dt=1484152843588&flash=24.0.0&frm=0&u_tz=60&u_...,youtube.com,1484153000.0,https://www.youtube.com/ad_data_204
1,www.youtube.com,POST,/ad_data_204,443,204,875,https,dt=1484152844754&flash=24.0.0&frm=1&u_tz=60&u_...,youtube.com,1484153000.0,https://www.youtube.com/ad_data_204
2,content.googleapis.com,POST,/youtubei/v1/log_interaction?alt=json&key=AIza...,443,200,895,https,"{""context"":{""client"":{""hl"":""en"",""gl"":""DE"",""cli...",googleapis.com,1484153000.0,https://content.googleapis.com/youtubei/v1/log...
3,www.youtube.com,POST,/annotations_invideo?cap_hist=1&video_id=xc08E...,443,200,905,https,,youtube.com,1484153000.0,https://www.youtube.com/annotations_invideo?ca...
4,www.youtube.com,POST,/get_endscreen?v=xc08ESKCg1k&client=1&ei=HWB2W...,443,200,907,https,session_token=QUFFLUhqa3c3S0xWZTBLREtwSkQ1VHJx...,youtube.com,1484153000.0,https://www.youtube.com/get_endscreen?v=xc08ES...
5,www.youtube.com,POST,/ad_data_204,443,204,945,https,dt=1484152843588&flash=24.0.0&frm=0&u_tz=60&u_...,youtube.com,1484153000.0,https://www.youtube.com/ad_data_204
6,www.youtube.com,POST,/ad_data_204,443,204,984,https,dt=1484152843588&flash=24.0.0&frm=0&u_tz=60&u_...,youtube.com,1484153000.0,https://www.youtube.com/ad_data_204
7,fls-eu.amazon.com,POST,/1/action-impressions/1/OP/csm/action/csm-feat...,443,200,1028,https,,amazon.com,1484153000.0,https://fls-eu.amazon.com/1/action-impressions...
8,www.amazon.de,POST,/ah/ajax/counter?ctr=desktop_ajax_atf&exp=1484...,443,202,1029,https,,amazon.de,1484153000.0,https://www.amazon.de/ah/ajax/counter?ctr=desk...
9,www.amazon.de,POST,/gp/overlay/display.html,443,200,1036,https,origSessionId=253-6706493-2004064&subPageType=...,amazon.de,1484153000.0,https://www.amazon.de/gp/overlay/display.html
