In [29]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from pprint import pprint
from moztelemetry.spark import get_pings
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from datetime import date, timedelta, datetime
from time import gmtime, strftime
from __future__ import division

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [30]:
sc.defaultParallelism

2015-05-14 22:37:31.257020


In [33]:
print "Start time = " + str(datetime.datetime.now())
yesterday = (date.today() - timedelta(1)).strftime("%Y%m%d")
pings = get_pings(sc, app="Firefox", channel="release", submission_date=yesterday, fraction=1)
print "End time = " + str(datetime.datetime.now())
print "Total pings = " + str(pings.count())

Start time = 2015-05-14 22:43:11.789990
End time = 2015-05-14 22:43:32.354984
Total pings = 5224188


What keys are in a ping?

In [34]:
ping = json.loads(pings.first())
ping.keys()

[u'keyedHistograms',
 u'info',
 u'slowSQL',
 u'ver',
 u'log',
 u'fileIOReports',
 u'histograms',
 u'lateWrites',
 u'clientID',
 u'addonDetails',
 u'addonHistograms',
 u'UIMeasurements',
 u'threadHangStats',
 u'simpleMeasurements',
 u'chromeHangs']

What's included in `addonDetails`?

In [35]:
ping['addonDetails']

{u'GMP': {u'gmp-gmpopenh264': {u'applyBackgroundUpdates': 1,
   u'userDisabled': False,
   u'version': u'1.3'}},
 u'XPI': {u'abs@avira.com': {u'location': u'app-profile', u'scan_MS': 50},
  u'anti_banner@kaspersky.com': {u'location': u'winreg-app-global',
   u'scan_MS': 28,
   u'scan_items': 22},
  u'avg@toolbar': {u'location': u'winreg-app-global',
   u'scan_MS': 56,
   u'scan_items': 151},
  u'content_blocker@kaspersky.com': {u'location': u'winreg-app-global',
   u'scan_MS': 0},
  u'online_banking@kaspersky.com': {u'location': u'winreg-app-global',
   u'scan_MS': 0},
  u'url_advisor@kaspersky.com': {u'location': u'winreg-app-global',
   u'scan_MS': 63},
  u'virtual_keyboard@kaspersky.com': {u'location': u'winreg-app-global',
   u'scan_MS': 0},
  u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}': {u'location': u'app-global',
   u'scan_MS': 0,
   u'scan_items': 1,
   u'shutdown_MS': 16,
   u'startup_MS': 69},
  u'{972ce4c6-7e08-4474-a285-3208198ce6fd}': {u'location': u'app-global',
   u'scan_M

We can get the list of enabled addons like so:

In [36]:
ping['addonDetails']['XPI']
ping['addonDetails']['XPI'].keys()

[u'{972ce4c6-7e08-4474-a285-3208198ce6fd}',
 u'content_blocker@kaspersky.com',
 u'abs@avira.com',
 u'avg@toolbar',
 u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}',
 u'virtual_keyboard@kaspersky.com',
 u'url_advisor@kaspersky.com',
 u'online_banking@kaspersky.com',
 u'anti_banner@kaspersky.com']

Let's filter down to those pings that actually have XPI addons.  What does one look like?

In [38]:
print "Start time = " + str(datetime.datetime.now())
subset = get_pings_properties(pings, ["clientID", "info/OS", "addonDetails/XPI"])
#print subset.count() #e.g. 5373921 total pings
subset = get_one_ping_per_client(subset)
#print subset.count() #e.g. 1072825 total pings
print "End time = " + str(datetime.datetime.now())

Start time = 2015-05-14 23:03:56.100689
End time = 2015-05-14 23:04:00.206966


In [39]:
cached = subset.cache()

Let's work with the following:

- Only check those pings with addon details
- Get a list of the addons per ping and tally them
- Combine tallies of all addons by key to get counts by addon


In [40]:
def hasAddons(ping):
    return "addonDetails/XPI" in ping

def getAddons(ping):
    return ping["addonDetails/XPI"].keys()

print "Start time = " + str(datetime.datetime.now())
addon_list = cached.filter(hasAddons).flatMap(getAddons).map(lambda x: (x,1)).reduceByKey(lambda x, y: x + y)
addon_counts = addon_list.map(lambda x:(x[1],x[0])).sortByKey(ascending=False)
addon_counts.take(5)
print "End time = " + str(datetime.datetime.now())

Start time = 2015-05-14 23:04:09.564036
End time = 2015-05-14 23:13:49.274058


In [None]:
#grouped = cached.map(lambda p: (p["info/OS"], p["simpleMeasurements/firstPaint"])).groupByKey().collectAsMap()

## What are the top 50 addons by count?

In [41]:
addon_counts.take(50)

[(1058502, u'{972ce4c6-7e08-4474-a285-3208198ce6fd}'),
 (265358, u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}'),
 (181120, u'{20a82645-c095-46ed-80e3-08825760534b}'),
 (163619, u'wrc@avast.com'),
 (143756, u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}'),
 (137588, u'{e4f94d1e-2f53-401e-8885-681602c0ddd8}'),
 (77745, u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}'),
 (60330, u'web2pdfextension@web2pdf.adobedotcom'),
 (52324, u'{2D3F3651-74B9-4795-BDEC-6DA2F431CB62}'),
 (49249, u'avg@toolbar'),
 (48761, u'mozilla_cc@internetdownloadmanager.com'),
 (43031, u'smartwebprinting@hp.com'),
 (38024, u'firebug@software.joehewitt.com'),
 (36273, u'abs@avira.com'),
 (36144, u'virtual_keyboard@kaspersky.com'),
 (36143, u'url_advisor@kaspersky.com'),
 (35521, u'content_blocker@kaspersky.com'),
 (32036, u'{73a6fe31-595d-460b-a920-fcc0f8843232}'),
 (31415, u'jqs@sun.com'),
 (31005, u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}'),
 (26951, u'anti_banner@kaspersky.com'),
 (26923, u'online_banking@kaspersky.com'),
 (26218, 

## What are the top addons as a proportion of all submitted pings 

In [44]:
total = pings.count()
addon_props = addon_counts.map(lambda x: (x[0]/total, x[1]))
addon_props.take(50)

[(0.20261560265442208, u'{972ce4c6-7e08-4474-a285-3208198ce6fd}'),
 (0.05079411384123236, u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}'),
 (0.034669502705492225, u'{20a82645-c095-46ed-80e3-08825760534b}'),
 (0.031319508409727984, u'wrc@avast.com'),
 (0.027517386434025726, u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}'),
 (0.026336724482350176, u'{e4f94d1e-2f53-401e-8885-681602c0ddd8}'),
 (0.014881738559178957, u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}'),
 (0.01154820615184599, u'web2pdfextension@web2pdf.adobedotcom'),
 (0.010015719189278793, u'{2D3F3651-74B9-4795-BDEC-6DA2F431CB62}'),
 (0.00942711096920708, u'avg@toolbar'),
 (0.009333699323224968, u'mozilla_cc@internetdownloadmanager.com'),
 (0.008236878152164508, u'smartwebprinting@hp.com'),
 (0.007278451694311155, u'firebug@software.joehewitt.com'),
 (0.006943279989158124, u'abs@avira.com'),
 (0.006918587156511213, u'virtual_keyboard@kaspersky.com'),
 (0.0069183957392038725, u'url_advisor@kaspersky.com'),
 (0.006799334174038147, u'content_bloc