In [None]:
# default_exp experiments

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
from functools import partial
import os
import pandas as pd

from accio import db
from accio.prototool import ProtoTool

In [None]:
from htools import eprint

# Experiments

A few utilities are provided for interacting with data collected from Experiments V1. They let us easily acquire data from one or more snapshots.

In [None]:
# export
def snapshot_protobufs(snapshot, limit=None, verbose=True):
    """Find samples obtained via the experiments UI from a specified snapshot.
    
    Parameters
    -----------
    snapshot: int
        Number of snapshot in experiments portal.
    limit: int or None
        If int, the max number of rows to retrieve from the db (0 yields an
        empty result). If None, retrieve all rows for the snapshot.
    verbose: bool
        If True, print the number of samples retrieved.
    
    Returns
    --------
    dict
        Keys are s3 paths, values are dictionaries containing parsed protobuf
        attributes as well as data from experiment_snapshot_sites table.
    
    Raises
    -------
    ValueError
        If `snapshot` or `limit` cannot be interpreted as an integer, or if
        `limit` is negative.
    """
    # Both values are interpolated directly into the SQL text below, so force
    # them to plain ints first: anything that is not a number fails here
    # instead of being pasted into the query.
    snapshot = int(snapshot)
    if limit is not None:
        limit = int(limit)
        if limit < 0:
            raise ValueError(f'limit must be non-negative, got {limit}.')

    q = f"""select s3_bucket_location, url, id, experiment_snapshot_id, 
        account_id, sub_account_id, org_id, ou, seq_id, created_at
        from experiment_snapshot_sites 
        where experiment_snapshot_id = {snapshot}
        """
    # Compare against None (not truthiness) so that limit=0 really means
    # "no rows" rather than silently becoming "all rows".
    if limit is not None: q += f'limit {limit}'

    # Get df w/ 1 row for each page in the sample.
    rows = db.query(q, 'experiments-prod', 'reader')
    if verbose:
        print(f'{rows.shape[0]} samples retrieved.')
    
    # Map s3 path to dict with URL and ID, then retrieve protobuf files.
    # The first selected column is the bucket location; the remaining columns
    # become the per-site metadata. `.iloc` keeps the access explicitly
    # positional (plain `row[0]` on a labelled Series is deprecated in pandas).
    path2data = {f's3://{row.iloc[0]}/scraped_dom.proto': dict(row.iloc[1:])
                 for _, row in rows.iterrows()}
    if not path2data:
        # Nothing matched, so there are no protobuf files to fetch.
        return {}
    output = ProtoTool().buf_to_dict(path2data.keys())
    
    # Add URLs and IDs to feature dicts.
    return {path: {**buf, **path2data[path]} for path, buf in output.items()}

`snapshot_protobufs` primarily serves as a helper for `fetch_experiments_data`, but at times we may want to use it alone. For a given experiment snapshot, it returns a dictionary mapping s3 file paths to dictionaries containing data from the corresponding protobuf. 

In [None]:
# Pull at most 5 rows from snapshot 100 and show the s3 paths that key the
# result (the recorded output shows eprint listing one key per line).
path2data = snapshot_protobufs(100, limit=5)
eprint(path2data.keys())

  0%|          | 0/5 [00:00<?, ?it/s]

5 samples retrieved.


  0%|          | 0/5 [00:00<?, ?it/s]

 0: s3://goguardian-experiments-prod/9e826bbc-35f6-45b8-b60e-47b32bd6f6ae/scraped_dom.proto
 1: s3://goguardian-experiments-prod/b2b7c356-e089-4fd2-bbde-273568d37c1b/scraped_dom.proto
 2: s3://goguardian-experiments-prod/fa3340c1-3e85-46c6-839d-d59fea93d0f0/scraped_dom.proto
 3: s3://goguardian-experiments-prod/f36b2a87-aa27-4623-8a74-a7980b83dd89/scraped_dom.proto
 4: s3://goguardian-experiments-prod/cd676e8b-b7a9-4529-8bea-7af294b09f0c/scraped_dom.proto





The usual protobuf fields are available in each dictionary.

In [None]:
# Inspect the field names available for the first retrieved sample.
list(path2data.values())[0].keys()

dict_keys(['fullText', 'a', 'aCount', 'p', 'pCount', 'imgSrc', 'imgCount', 'h1Count', 'h2', 'h2Count', 'div', 'divCount', 'span', 'spanCount', 'title', 'titleCount', 'liCount', 'iframeCount', 'metaContent', 'metaCount', 'scriptCount', 'formCount', 'selectCount', 'option', 'optionCount', 'inputValue', 'inputCount', 'labelCount', 'mainCount', 'headerCount', 'footerCount', 'articleCount', 'sectionCount', 'navCount', 'url', 'id', 'experiment_snapshot_id', 'account_id', 'sub_account_id', 'org_id', 'ou', 'seq_id', 'created_at'])

In [None]:
# export
def fetch_experiments_data(*snapshot_ids, df=True, s3_paths=False, limit=None):
    """Retrieve protobuf data from one or more experiment snapshots and
    combine it into a single DataFrame or dictionary.
    
    Parameters
    -----------
    snapshot_ids: int (1 or more)
        Integer IDs corresponding to the relevant experiment snapshots.
        These can be found in the experiments V1 portal on the `Snapshots` 
        page. If no IDs are passed, the result is empty.
    df: bool (default True)
        If True, return the data as a single Pandas DataFrame.
    s3_paths: bool (default False)
        If True, include the urls for the protobufs in S3 in the output
        dataframe or dict. When returning a dictionary (i.e. df=False),
        s3_paths will always be present as keys.
    limit: int or None
        If int, sets the max combined number of rows to retrieve from db.
        If None, retrieve all rows. Note: if using n snapshot 
        id's, each snapshot will have at most limit//n rows, so a limit
        smaller than n retrieves nothing.
        
    Returns
    --------
    pd.DataFrame
        1 row for each example collected from snapshot.
    
    or
    
    dict[str, dict]
        Maps S3 file location to dict containing file attributes and site
        metadata.
    
    Raises
    -------
    ValueError
        If `limit` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError(f'limit must be non-negative, got {limit}.')

    data = {}
    if snapshot_ids:
        # Split the combined budget evenly across snapshots. None means
        # "no limit" and is passed through unchanged.
        per_snapshot = None if limit is None else limit // len(snapshot_ids)
        # A per-snapshot budget of 0 must not reach the query layer as a
        # falsy "no limit" value, so skip the queries entirely in that case.
        if per_snapshot != 0:
            for snapshot_id in snapshot_ids:
                # Later snapshots overwrite earlier ones on duplicate paths.
                data.update(snapshot_protobufs(snapshot_id, per_snapshot))

    if not df:
        return data
    if s3_paths:
        # Outer keys (s3 paths) become columns, so transpose to get one row
        # per sample and expose the path as an `s3` column.
        return pd.DataFrame(data).T.reset_index()\
                 .rename({'index': 's3'}, axis=1)
    return pd.DataFrame(list(data.values()))

`fetch_experiments_data` retrieves protobuf files from one or more snapshots and combines them into a single dictionary or Pandas DataFrame.

In [None]:
# Single snapshot with the default df=True: returns a DataFrame of at most 5 rows.
fetch_experiments_data(100, limit=5)

  0%|          | 0/5 [00:00<?, ?it/s]

5 samples retrieved.


  0%|          | 0/5 [00:00<?, ?it/s]


Unnamed: 0,fullText,a,aCount,p,pCount,imgSrc,imgCount,h1Count,h2,h2Count,...,h3Count,li,textareaCount,imgAlt,h1,tableCount,tdCount,buttonCount,header,footer
0,about archives contact contributedonatetopics ...,"[about, archives, contact, contribute, donate,...",222,[being diagnosed with schizoaffective disorder...,36.0,[http://brainblogger.com/wp-content/themes/bra...,19.0,2,"[share this article, further reading, about us...",14,...,,,,,,,,,,
1,crash search chapters bookmarks history readin...,[the perks of being a wallflower],18,[you never know who’s going to get home first....,158.0,[https://ofs-e782489dbb0ef78eef561c6028030e9a....,3.0,6,,1,...,3.0,"[search, chapters, bookmarks, history, reading...",1.0,,,,,,,
2,..google search page header search modes allim...,"[images, shopping, videos, news, more, setting...",114,,1.0,[https://www.google.com/images/branding/google...,2.0,5,[web results],1,...,10.0,,1.0,[google],"[google search page header, search modes, sear...",1.0,12.0,2.0,,
3,ultimate guitar com tabs articles forums wiki ...,"[pro, soccer mommy, e a d g b e, predatorywasp...",67,,,,1.0,1,"[strumming, 1 comment]",2,...,,,3.0,[celiacastaldo],[death by chocolate chords],,,32.0,"[welcome home, stranger please register or sig...",
4,ultimate guitar com tabs articles forums wiki ...,"[pro, soccer mommy, e a d g b e, michaeldelude...",64,,,,,1,"[strumming, no comments]",2,...,,,1.0,,[death by chocolate chords],,,33.0,"[welcome home, stranger please register or sig...",[don't have an account yet? sign up]


In [None]:
# Two snapshots sharing limit=2 -> at most 1 row from each (limit // 2);
# s3_paths=True adds the protobuf location as an `s3` column.
fetch_experiments_data(100, 200, s3_paths=True, limit=2)

  0%|          | 0/1 [00:00<?, ?it/s]

1 samples retrieved.


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

1 samples retrieved.





Unnamed: 0,s3,fullText,a,aCount,p,pCount,imgSrc,imgCount,h1Count,h2,...,ou,seq_id,created_at,imgAlt,h1,h3,h3Count,tableCount,tdCount,buttonCount
0,s3://goguardian-experiments-prod/9e826bbc-35f6...,about archives contact contributedonatetopics ...,"[about, archives, contact, contribute, donate,...",222,[being diagnosed with schizoaffective disorder...,36,[http://brainblogger.com/wp-content/themes/bra...,19,2,"[share this article, further reading, about us...",...,/HS/HS Staff,0,2018-08-14 00:34:35,,,,,,,
1,s3://goguardian-experiments-prod/c7cb1b02-cb00...,accessibility links skip to main content acces...,"[skip to main content, accessibility help, acc...",124,,9,[https://www.google.com/images/branding/google...,16,5,"[web results, people also ask, web results]",...,/RHCSD_Users/Students/Sperry,0,2019-05-14 04:17:01,"[google, my lai massacre, massacre at huế, pho...","[accessibility links, search modes, search res...","[my lai massacre - history, my lai massacre - ...",11.0,1.0,6.0,2.0


In [None]:
# With df=False the combined result stays a dict keyed by s3 path.
data = fetch_experiments_data(100, 200, df=False, limit=2)
print('type:', type(data))
print('length:', len(data))
print('keys:', data.keys())

  0%|          | 0/1 [00:00<?, ?it/s]

1 samples retrieved.


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

1 samples retrieved.
type: <class 'dict'>
length: 2
keys: dict_keys(['s3://goguardian-experiments-prod/9e826bbc-35f6-45b8-b60e-47b32bd6f6ae/scraped_dom.proto', 's3://goguardian-experiments-prod/c7cb1b02-cb00-4620-990e-4420bdfeaeb1/scraped_dom.proto'])



