In [1]:
import utilities
import os
import json
import statistics
from prettytable import PrettyTable
from prettytable import ALL as ALL
# from scipy.stats import ttest_ind as ttest

In [2]:
# !pip3 install prettytable

In [30]:
# Root directories for the crawl data and for the generated result tables.
# The defaults are the original hard-coded local paths; set the
# VOICE_ASSISTANT_DATA_DIR / VOICE_ASSISTANT_RESULTS_DIR environment variables
# to run the notebook on another machine without editing this cell.
# os.path.join(path, '') guarantees a trailing separator, which later cells rely
# on when they build file names with `RESULTS_DIR + '<name>.csv'`.
DATA_DIR = os.path.join(
    os.environ.get(
        'VOICE_ASSISTANT_DATA_DIR',
        '/media/umar/Data/work/voice-assistant-central/voice-assistants-openwpm/',
    ),
    '',
)
RESULTS_DIR = os.path.join(
    os.environ.get(
        'VOICE_ASSISTANT_RESULTS_DIR',
        '/media/umar/Data/work/voice-assistant-central/data/results/',
    ),
    '',
)

### Personas
```
Connected-Car
Dating
Fashion-Style
Pets-Animals
Religion-Spirituality
SmartHome
Wine-Beverages
Health-Fitness
Navigation-TripPlanners
```

In [5]:
# Control personas.
control_list = [
    'Plain',
    'Amazon-Only',
    'No-Skill',
]

# Test personas (the nine names listed in the "Personas" section above).
test_list = [
    'Connected-Car',
    'Dating',
    'Fashion-Style',
    'Pets-Animals',
    'Religion-Spirituality',
    'SmartHome',
    'Wine-Beverages',
    'Health-Fitness',
    'Navigation-TripPlanners',
]

In [164]:
def get_forced_normal_websites_with_bids(bid_files):
    """Count bid files by whether their path contains the marker 'FORCED'.

    :param bid_files: iterable of bid-file paths (strings).
    :return: str of the form '<normal>/<forced>', where <forced> is the number
        of paths containing 'FORCED' and <normal> is the number of all others.
    """
    # Indexed by the result of the membership test: True -> forced, False -> normal.
    tally = {True: 0, False: 0}
    for path in bid_files:
        tally['FORCED' in path] += 1

    return '{}/{}'.format(tally[False], tally[True])


def get_websites_with_bids(bid_files):
    """Extract the website name from each bid-file path.

    The website is the text after the last '_' of the final path component
    (the part after the last '/').

    :param bid_files: iterable of bid-file paths; each must contain a '/' and
        a '_' in its last component, otherwise IndexError is raised.
    :return: list of website names, one per path, in input order.
    """
    return [path.rsplit('/', 1)[1].rsplit('_', 1)[1] for path in bid_files]


def return_bid_cpms(bid_files):
    """Collect the 'cpm' of every bid in every file into one flat list.

    NOTE: two later definitions of ``return_bid_cpms`` in this same cell rebind
    the name, so this one-argument form is not the one left in scope after the
    cell has run.

    :param bid_files: iterable of bid-file paths; each file is read with
        ``utilities.read_file`` and every item it yields is parsed as JSON.
    :return: list of CPM values in file order, then bid order.
    """
    return [
        json.loads(line)['cpm']
        for path in bid_files
        for line in utilities.read_file(path)
    ]


def return_bid_cpms(bid_files, websites = []):
    """Collect the 'cpm' of every bid, optionally restricted to some websites.

    NOTE: a later definition of ``return_bid_cpms`` in this same cell (taking an
    ``advertisers`` argument) rebinds the name, so this form is not the one left
    in scope after the cell has run.

    :param bid_files: iterable of bid-file paths; the website is the text after
        the last '_' of the file name.
    :param websites: collection of website names to keep; when empty, every
        file is used.
    :return: flat list of CPM values from the selected files.
    """
    restrict = len(websites) > 0
    collected = []

    for path in bid_files:
        site = path.rsplit('/', 1)[1].rsplit('_', 1)[1]

        # Files for websites outside the requested set are never read.
        if not restrict or site in websites:
            for line in utilities.read_file(path):
                collected.append(json.loads(line)['cpm'])

    return collected


def return_bid_cpms(bid_files, advertisers, websites = []):
    """Group bid CPMs by bidder, for the given advertisers and websites.

    This is the last definition of ``return_bid_cpms`` in the cell and therefore
    the one that remains bound to the name.

    :param bid_files: iterable of bid-file paths; the website is the text after
        the last '_' of the file name.
    :param advertisers: collection of bidder names to keep; bids from any other
        bidder are ignored.
    :param websites: collection of website names to keep; when empty, every
        file is used.
    :return: dict mapping bidder name -> list of that bidder's CPM values.
        Bidders without any matching bid do not appear as keys.
    """
    cpms_by_bidder = {}

    for path in bid_files:
        site = path.rsplit('/', 1)[1].rsplit('_', 1)[1]

        if len(websites) > 0 and site not in websites:
            continue

        for line in utilities.read_file(path):
            record = json.loads(line)
            bidder = record['bidder']

            if bidder in advertisers:
                cpms_by_bidder.setdefault(bidder, []).append(record['cpm'])

    return cpms_by_bidder


def return_median_cpms(bid_files):
    """Return one median CPM per bid file.

    NOTE: two later definitions of ``return_median_cpms`` in this same cell
    rebind the name, so this one-argument form is not the one left in scope
    after the cell has run.

    :param bid_files: iterable of bid-file paths read with
        ``utilities.read_file``.
    :return: list with the median 'cpm' of each file, in input order.
    :raises statistics.StatisticsError: if a file yields no bids.
    """
    medians = []
    for path in bid_files:
        file_cpms = [json.loads(line)['cpm'] for line in utilities.read_file(path)]
        medians.append(statistics.median(file_cpms))

    return medians


def return_median_cpms(bid_files, websites = []):
    """Return one median CPM per bid file, optionally restricted to websites.

    NOTE: a later definition of ``return_median_cpms`` in this same cell (taking
    an ``advertisers`` argument) rebinds the name, so this form is not the one
    left in scope after the cell has run.

    :param bid_files: iterable of bid-file paths; the website is the text after
        the last '_' of the file name.
    :param websites: collection of website names to keep; when empty, every
        file is used.
    :return: list with the median 'cpm' of each selected file, in input order.
    :raises statistics.StatisticsError: if a selected file yields no bids.
    """
    medians = []
    for path in bid_files:
        # Each file is read before the website filter is applied.
        lines = utilities.read_file(path)

        site = path.rsplit('/', 1)[1].rsplit('_', 1)[1]
        if len(websites) == 0 or site in websites:
            file_cpms = [json.loads(line)['cpm'] for line in lines]
            medians.append(statistics.median(file_cpms))

    return medians


def return_median_cpms(bid_files, advertisers, websites = []):
    """Per bidder, collect the median CPM that bidder offered on each website.

    This is the last definition of ``return_median_cpms`` in the cell and
    therefore the one that remains bound to the name.

    :param bid_files: iterable of bid-file paths; the website is the text after
        the last '_' of the file name.
    :param advertisers: collection of bidder names to keep; bids from any other
        bidder are ignored.
    :param websites: collection of website names to keep; when empty, every
        file is used.  (The default list is never mutated.)
    :return: dict mapping bidder name -> list of per-file median CPMs, one
        entry for each selected file in which that bidder placed a bid.
        Bidders without any matching bid do not appear as keys.
    """
    bid_cpms = {}

    for bid_file in bid_files:
        bid_website = bid_file.rsplit('/', 1)[1].rsplit('_', 1)[1]

        # Apply the website filter before touching the file, as
        # return_bid_cpms does, so that excluded files are never read.
        if len(websites) > 0 and bid_website not in websites:
            continue

        # CPMs of the current file only, grouped by bidder.
        current_bid_cpms = {}
        for bid in utilities.read_file(bid_file):
            bid_json = json.loads(bid)
            bidder = bid_json['bidder']

            if bidder not in advertisers:
                continue

            current_bid_cpms.setdefault(bidder, []).append(bid_json['cpm'])

        # One median per (bidder, file) pair.
        for bidder, cpms in current_bid_cpms.items():
            bid_cpms.setdefault(bidder, []).append(statistics.median(cpms))

    return bid_cpms


def return_common_websites(persona_list, data_dir):
    """Return the websites that have a bid file for every persona.

    :param persona_list: non-empty sequence of persona names; each persona's
        bid files are looked up in ``<data_dir>/<persona>/website_bids/``.
    :param data_dir: root directory holding one sub-directory per persona.
    :return: set of website names present for all personas.
    :raises IndexError: if ``persona_list`` is empty.
    """
    def websites_of(persona):
        # Websites for which this persona's directory contains a bid file.
        persona_dir = os.path.join(data_dir, persona + '/website_bids/')
        return get_websites_with_bids(utilities.get_files_in_a_directory(persona_dir))

    shared = set(websites_of(persona_list[0]))

    for persona in persona_list[1:]:
        shared = shared.intersection(websites_of(persona))

    return shared


def ptable_to_csv(table, filename, headers=True):
    """Save PrettyTable results to a CSV file.

    Adapted from @AdamSmith https://stackoverflow.com/questions/32128226

    The table is rendered with ``table.get_string()`` and parsed back line by
    line, so it is expected to be drawn with '|' as the vertical border
    character.  Empty cells are kept as empty CSV fields (so columns stay
    aligned), and cells containing commas or quotes are quoted by the ``csv``
    module.

    :param PrettyTable table: Table object to get data from.
    :param str filename: Filepath for the output CSV.
    :param bool headers: Whether to include the header row in the CSV.
    :return: None
    """
    import csv  # local import: csv is not among the notebook's top-level imports

    rows = []
    for line in table.get_string().splitlines():
        line = line.strip()
        if '|' not in line:
            # Horizontal rule ('+-----+-----+') or blank line: no cells.
            continue

        cells = line.split('|')
        # Drop the empty fragments produced by the outer borders only;
        # empty fragments between two borders are genuine empty cells.
        if line.startswith('|'):
            cells = cells[1:]
        if line.endswith('|'):
            cells = cells[:-1]
        rows.append([cell.strip() for cell in cells])

    if table.title is not None:
        rows = rows[1:]  # the first rendered row is the table title
    if not headers:
        rows = rows[1:]  # the next one is the header row

    with open(filename, 'w', newline='') as f:
        # '\n' keeps the line endings the notebook has always written.
        csv.writer(f, lineterminator='\n').writerows(rows)

### Reading bids for personas

In [16]:
def _collect_website_cpms(bid_files, websites, per_file_median):
    """Return a flat list of CPMs read from ``bid_files``.

    Files whose website (text after the last '_' of the file name) is not in
    ``websites`` are skipped without being read; an empty ``websites`` keeps
    every file.  With ``per_file_median`` each file contributes the median of
    its CPMs (a file without bids contributes nothing); otherwise the CPM of
    every single bid is included.
    """
    cpms = []
    for bid_file in bid_files:
        bid_website = bid_file.rsplit('/', 1)[1].rsplit('_', 1)[1]
        if len(websites) > 0 and bid_website not in websites:
            continue

        file_cpms = [json.loads(bid)['cpm'] for bid in utilities.read_file(bid_file)]
        if not per_file_median:
            cpms.extend(file_cpms)
        elif file_cpms:
            cpms.append(statistics.median(file_cpms))

    return cpms


def _format_cpm_stat(stat_function, cpms, min_points=1):
    """Format ``stat_function(cpms)`` with three decimals, or '-' when there are fewer than ``min_points`` values."""
    if len(cpms) < min_points:
        return '-'
    return "{0:0.3f}".format(stat_function(cpms))


def compute_bid_statistics(persona_list, data_dir, common_websites=[], get_median=False):
    """Compute summary statistics of bid CPMs for each persona.

    The CPMs are gathered by a private helper rather than through
    ``return_bid_cpms`` / ``return_median_cpms``: the definitions of those names
    that remain bound after the helper cell has run expect an ``advertisers``
    argument in second position and return a dict, so calling them with
    ``(bid_files, common_websites)`` does not produce the flat list of CPMs
    needed here.

    :param persona_list: persona names; bid files are looked up in
        ``<data_dir>/<persona>/website_bids/``.
    :param data_dir: root directory holding one sub-directory per persona.
    :param common_websites: collection of websites to restrict the bids to;
        when empty (the default, never mutated) all websites are used.
    :param get_median: if True, statistics are computed over one median CPM
        per website instead of over every individual bid.
    :return: dict persona -> dict with keys 'normal-forced', 'median', 'mean',
        'min', 'max', 'sd' (strings with three decimals, or '-' when there are
        not enough values: none at all, or fewer than two for 'sd'),
        'total_bids', 'total_websites' and 'common_websites'.
    """
    persona_stat_map = {}

    for persona in persona_list:
        persona_dir = os.path.join(data_dir, persona + '/website_bids/')
        bid_files = utilities.get_files_in_a_directory(persona_dir)

        normal_forced = get_forced_normal_websites_with_bids(bid_files)
        normal, forced = normal_forced.split('/')

        cpms = _collect_website_cpms(bid_files, common_websites, get_median)

        persona_stat_map[persona] = {
            'normal-forced': normal_forced,
            'median': _format_cpm_stat(statistics.median, cpms),
            'mean': _format_cpm_stat(statistics.mean, cpms),
            'min': _format_cpm_stat(min, cpms),
            'max': _format_cpm_stat(max, cpms),
            # statistics.stdev needs at least two data points.
            'sd': _format_cpm_stat(statistics.stdev, cpms, min_points=2),
            'total_bids': len(cpms),
            'total_websites': int(normal) + int(forced),
            'common_websites': len(common_websites) if len(common_websites) > 0 else '-',
        }

    return persona_stat_map

In [22]:
def print_bid_statistics(persona_stat_map, persona_list):
    """Print the per-persona bid statistics as a table, one column per persona.

    :param persona_stat_map: dict persona -> statistics dict holding at least
        the keys 'total_websites', 'common_websites', 'normal-forced', 'median',
        'mean', 'sd', 'min' and 'max'.
    :param persona_list: list of personas, in the column order to display.
    :return: the PrettyTable that was printed.
    """
    # (row label, key in the per-persona statistics dict), in display order.
    measures = [
        ('# of websites', 'total_websites'),
        ('Common websites', 'common_websites'),
        ('Normal/Forced', 'normal-forced'),
        ('Median', 'median'),
        ('Mean', 'mean'),
        ('SD', 'sd'),
        ('Min', 'min'),
        ('Max', 'max'),
    ]

    table = PrettyTable(['Measure'] + persona_list, hrules=ALL)
    for label, key in measures:
        values = [persona_stat_map[persona][key] for persona in persona_list]
        table.add_row([label] + values)

    print(table)

    return table

## Consider all websites and all bids

In [29]:
# Statistics over every individual bid on every website (no website filter,
# no per-website median) for all control and test personas; the printed table
# is also saved as 'all-bids.csv' in RESULTS_DIR.
persona_stat_map = compute_bid_statistics(control_list + test_list, DATA_DIR)
table = print_bid_statistics(persona_stat_map, control_list + test_list)
ptable_to_csv(table, RESULTS_DIR + 'all-bids.csv')

+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain  | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28   |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+--------------

### Consider bids on websites where bids were returned for all personas

In [38]:
# Websites that have a bid file for every control and test persona.
common_websites = return_common_websites(control_list + test_list, DATA_DIR)

# Same statistics as for all bids, but restricted to those common websites.
persona_stat_map = compute_bid_statistics(control_list + test_list, DATA_DIR, common_websites)
table = print_bid_statistics(persona_stat_map, control_list + test_list)

ptable_to_csv(table, RESULTS_DIR + 'common-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

### Consider median bids on websites

In [32]:
# get_median=True: statistics are computed over one median CPM per website
# rather than over every individual bid; all websites are included.
persona_stat_map = compute_bid_statistics(control_list + test_list, DATA_DIR, get_median=True)
table = print_bid_statistics(persona_stat_map, control_list + test_list)
ptable_to_csv(table, RESULTS_DIR + 'all-median-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

### Consider median bids on websites where bids were returned for all personas

In [37]:
# Recomputed with the same arguments as in the earlier common-websites cell,
# so that this cell can be run on its own.
common_websites = return_common_websites(control_list + test_list, DATA_DIR)

# Per-website median CPMs, restricted to the websites common to all personas.
persona_stat_map = compute_bid_statistics(control_list + test_list, DATA_DIR, common_websites, get_median=True)
table = print_bid_statistics(persona_stat_map, control_list + test_list)
ptable_to_csv(table, RESULTS_DIR + 'common-median-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

In [39]:
# Display the set of websites that returned bids for every persona.
common_websites

{'cnet.com',
 'cnn.com',
 'howstuffworks.com',
 'indiatimes.com',
 'investopedia.com',
 'nationalgeographic.com',
 'softonic.com',
 'telegraph.co.uk',
 'theatlantic.com'}

### Get most prevalent bidders by persona

In [190]:
def get_prevalent_advertisers(persona_list, data_dir):
    """Count, per persona, in how many bid files each bidder appears.

    A bidder is counted at most once per bid file, however many bids it placed
    in that file.

    :param persona_list: persona names; bid files are looked up in
        ``<data_dir>/<persona>/website_bids/``.
    :param data_dir: root directory holding one sub-directory per persona.
    :return: dict persona -> dict bidder -> number of bid files containing at
        least one bid from that bidder.
    """
    advertisers = {}

    for persona in persona_list:
        file_counts = advertisers[persona] = {}

        persona_dir = os.path.join(data_dir, persona + '/website_bids/')

        for bid_file in utilities.get_files_in_a_directory(persona_dir):
            # dict.fromkeys de-duplicates while keeping first-seen order.
            bidders_in_file = dict.fromkeys(
                json.loads(bid)['bidder'] for bid in utilities.read_file(bid_file)
            )

            for bidder in bidders_in_file:
                file_counts[bidder] = file_counts.get(bidder, 0) + 1

    return advertisers


def get_common_advertisers(advertisers):
    """Return the bidders that appear for every persona.

    :param advertisers: dict persona -> dict bidder -> count, as returned by
        ``get_prevalent_advertisers``.
    :return: set of bidder names present in every persona's dict; an empty set
        when ``advertisers`` is empty or the personas share no bidder.
    """
    bidder_sets = [set(bidders.keys()) for bidders in advertisers.values()]
    if not bidder_sets:
        return set()

    # Intersect all personas in one step.  Re-seeding the running result
    # whenever it is empty would restart the intersection from a later persona
    # and report bidders that are not actually common to all of them.
    return set.intersection(*bidder_sets)


def get_all_advertisers(advertisers):
    """Return every bidder that appears for at least one persona.

    :param advertisers: dict persona -> dict bidder -> count.
    :return: set with the union of all personas' bidder names.
    """
    return set().union(*(bidders.keys() for bidders in advertisers.values()))


def print_bidders(advertisers, common_advertisers):
    """Print a table of bidder counts: one row per bidder, one column per persona.

    :param advertisers: dict persona -> dict bidder -> count; the personas
        become the columns, in the dict's iteration order.
    :param common_advertisers: iterable of the bidders to print (one row each);
        a bidder missing for a persona is shown with a count of 0.
    :return: None -- the table is only printed, not returned.
    """
    personas = list(advertisers.keys())
    table = PrettyTable(['Advertiser'] + personas, hrules=ALL)

    for advertiser in common_advertisers:
        counts = [advertisers[persona].get(advertiser, 0) for persona in personas]
        table.add_row([advertiser] + counts)

    print(table)

In [191]:
# Per-persona count of bid files (websites) in which each bidder appears.
advertisers = get_prevalent_advertisers(control_list + test_list, DATA_DIR)
# Bidders seen for every persona vs. bidders seen for at least one persona.
common_advertisers = get_common_advertisers(advertisers)
all_advertisers = get_all_advertisers(advertisers)
# The table lists every bidder (all_advertisers), not only the common ones.
print_bidders(advertisers, all_advertisers)

+---------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|   Advertiser  | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+---------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|    pubmatic   |   4   |      7      |    8     |       4       |   7    |       4       |      5       |           4           |     4     |       3        |       7        |            3            |
+---------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+------------

In [197]:
def compute_bid_statistics(persona_list, data_dir, common_advertisers, common_websites=[], get_median=False):
    """Compute CPM statistics per persona and per advertiser.

    This definition rebinds ``compute_bid_statistics``: once this cell has run,
    the earlier per-persona variant without the ``common_advertisers`` parameter
    is no longer reachable under this name.

    Standard deviation, normal/forced counts and website totals are not
    computed in this per-advertiser variant.

    :param persona_list: persona names; bid files are looked up in
        ``<data_dir>/<persona>/website_bids/``.
    :param data_dir: root directory holding one sub-directory per persona.
    :param common_advertisers: iterable of bidder names to report on.
    :param common_websites: collection of websites to restrict the bids to;
        passed through to the CPM collector.
    :param get_median: if True, use ``return_median_cpms`` (one median per
        website and bidder); otherwise ``return_bid_cpms`` (every bid).
    :return: dict persona -> dict advertiser -> dict with 'median', 'mean',
        'min', 'max' (strings with three decimals) and 'total_bids'.  An
        advertiser without bids for a persona gets '0.000' and 0.
    """
    collect_cpms = return_median_cpms if get_median else return_bid_cpms
    three_decimals = "{0:0.3f}".format

    persona_stat_map = {}
    for persona in persona_list:
        persona_dir = os.path.join(data_dir, persona + '/website_bids/')
        bid_files = utilities.get_files_in_a_directory(persona_dir)

        cpms = collect_cpms(bid_files, common_advertisers, common_websites)

        advertiser_stats = {}
        for advertiser in common_advertisers:
            has_bids = advertiser in cpms
            values = cpms[advertiser] if has_bids else None

            advertiser_stats[advertiser] = {
                'median': three_decimals(statistics.median(values) if has_bids else 0),
                'mean': three_decimals(statistics.mean(values) if has_bids else 0),
                'min': three_decimals(min(values) if has_bids else 0),
                'max': three_decimals(max(values) if has_bids else 0),
                'total_bids': len(values) if has_bids else 0,
            }

        persona_stat_map[persona] = advertiser_stats

    return persona_stat_map



def print_common_bidders(persona_stat_map, common_advertisers):
    """Print the mean CPM per persona (rows) and advertiser (columns).

    :param persona_stat_map: dict persona -> dict advertiser -> statistics
        dict with at least the key 'mean'.
    :param common_advertisers: collection of advertisers to show as columns.
    :return: the PrettyTable that was printed.
    """
    columns = list(common_advertisers)
    table = PrettyTable(['Persona'] + columns, hrules=ALL)

    for persona, per_advertiser in persona_stat_map.items():
        means = [per_advertiser[advertiser]['mean'] for advertiser in columns]
        table.add_row([persona] + means)

    print(table)
    return table

In [200]:
# Per-advertiser statistics over per-website median CPMs, for every bidder seen
# for any persona (all_advertisers) and with no website filter ([]).
persona_stat_map = compute_bid_statistics(control_list + test_list, DATA_DIR, all_advertisers, [], get_median = True)
# The table shows the 'mean' entry of each persona/advertiser pair.
table = print_common_bidders(persona_stat_map, all_advertisers)
# NOTE(review): the file is named 'common_advertisers_bids.csv' although the
# table is built from all_advertisers, not common_advertisers -- confirm intent.
ptable_to_csv(table, RESULTS_DIR + 'common_advertisers_bids.csv')

+-------------------------+----------+-------+-------+---------------+----------+--------+------------+-------+--------------+------------+-------------+--------+---------+-------+-------+-------+--------+----------+--------+--------------+----------+--------+-------+---------------+----------+--------+-----------+-------+
|         Persona         | pubmatic | ozone |  r2b2 | indexExchange | appnexus | sonobi | consumable |   ix  | verizonmedia | triplelift | emx_digital | criteo | rubicon |  ias  | openx | teads | getapp | 33across | onetag | sharethrough | criteorn | innity | nobid | smartadserver | medianet | appier | onemobile |  aol  |
+-------------------------+----------+-------+-------+---------------+----------+--------+------------+-------+--------------+------------+-------------+--------+---------+-------+-------+-------+--------+----------+--------+--------------+----------+--------+-------+---------------+----------+--------+-----------+-------+
|          Plain         

In [156]:
# Display the bidders computed by get_common_advertisers in an earlier cell.
common_advertisers

{'criteo', 'ias', 'ix', 'onemobile', 'ozone', 'pubmatic', 'rubicon', 'teads'}

In [None]:
# NOTE(review): this cell cannot run as the notebook stands -- the only import
# of `ttest` (scipy.stats.ttest_ind) is commented out in the first cell, and
# `common_control_cpms` / `common_test_cpms` are not assigned in any cell shown
# here. TODO: restore the import and define both inputs before enabling it.
ttest(common_control_cpms, common_test_cpms)