In [1]:
import csv
import json
import os
import statistics

from prettytable import ALL as ALL
from prettytable import PrettyTable

import utilities
# from scipy.stats import ttest_ind as ttest

In [2]:
# !pip3 install prettytable

In [30]:
# Root directories for the crawl data (input) and the generated CSV tables (output).
# The defaults are the original local paths; set VA_DATA_DIR / VA_RESULTS_DIR in the
# environment to run the notebook on another machine without editing this cell.
# os.path.join(path, '') guarantees a trailing separator, which the cells below rely
# on when they build file names by plain concatenation (RESULTS_DIR + 'all-bids.csv').
DATA_DIR = os.path.join(
    os.environ.get('VA_DATA_DIR')
    or '/media/umar/Data/work/voice-assistant-central/voice-assistants-openwpm/',
    '',
)
RESULTS_DIR = os.path.join(
    os.environ.get('VA_RESULTS_DIR')
    or '/media/umar/Data/work/voice-assistant-central/data/results/',
    '',
)

### Personas
```
Connected-Car
Dating
Fashion-Style
Pets-Animals
Religion-Spirituality
SmartHome
Wine-Beverages
Health-Fitness
Navigation-TripPlanners
```

In [5]:
# Personas used as controls. Each name is also the persona's directory name
# under the data directory.
control_list = ['Plain', 'Amazon-Only', 'No-Skill']

# Personas under test, in the order their columns appear in the result tables.
test_list = [
    'Connected-Car',
    'Dating',
    'Fashion-Style',
    'Pets-Animals',
    'Religion-Spirituality',
    'SmartHome',
    'Wine-Beverages',
    'Health-Fitness',
    'Navigation-TripPlanners',
]

In [26]:
def get_forced_normal_websites_with_bids(bid_files):
    """Tally bid files with and without the ``FORCED`` marker.

    A file is counted as forced when the substring ``FORCED`` occurs anywhere in
    its path; every other file is counted as normal.

    :param bid_files: Iterable of bid file paths (strings).
    :return: The two counts as the string ``'<normal>/<forced>'``.
    """
    forced_flags = ['FORCED' in path for path in bid_files]
    forced_total = sum(forced_flags)
    normal_total = len(forced_flags) - forced_total
    return '{}/{}'.format(normal_total, forced_total)


def get_websites_with_bids(bid_files):
    """Extract the website name from every bid file path.

    The website is taken to be the text after the last ``_`` of the file name,
    where the file name is the text after the last ``/`` of the path.

    :param bid_files: Iterable of bid file paths (strings).
    :return: List of website names, one per path, in input order.
    :raises IndexError: If a path contains no ``/`` or its file name no ``_``.
    """
    return [path.rsplit('/', 1)[1].rsplit('_', 1)[1] for path in bid_files]


def return_bid_cpms(bid_files, websites=None):
    """Collect the CPM of every bid stored in the given bid files.

    This is the single definition of the function; an earlier one-argument
    variant in the same cell was silently shadowed and has been folded in here
    (calling with only ``bid_files`` behaves exactly as that variant did).

    Each file is read with ``utilities.read_file`` and every item it yields is
    parsed with ``json.loads``; the ``'cpm'`` entry of each parsed bid is kept.
    (Presumably one JSON document per line - confirm against ``utilities.read_file``.)

    :param bid_files: Iterable of bid file paths. The website a file belongs to is
        the text after the last ``_`` of its file name.
    :param websites: Optional collection of website names. When non-empty, files
        of other websites are skipped without being read. ``None`` or an empty
        collection disables filtering.
    :return: Flat list of CPM values across all retained files, in file order.
    :raises IndexError: If a path contains no ``/`` or its file name no ``_``.
    """
    # None instead of a mutable [] default; normalised so len() below still works.
    if websites is None:
        websites = []

    bid_cpms = []
    for bid_file in bid_files:
        bid_website = bid_file.rsplit('/', 1)[1].rsplit('_', 1)[1]

        if len(websites) > 0 and bid_website not in websites:
            continue

        bid_cpms.extend(json.loads(bid)['cpm'] for bid in utilities.read_file(bid_file))

    return bid_cpms


def return_median_cpms(bid_files, websites=None):
    """Compute one median CPM per bid file.

    This is the single definition of the function; an earlier one-argument
    variant in the same cell was silently shadowed and has been folded in here.

    Each retained file is read with ``utilities.read_file``, every item is parsed
    with ``json.loads`` and the median of the ``'cpm'`` values is recorded.

    :param bid_files: Iterable of bid file paths. The website a file belongs to is
        the text after the last ``_`` of its file name.
    :param websites: Optional collection of website names. When non-empty, files
        of other websites are skipped. ``None`` or an empty collection disables
        filtering.
    :return: List with the median CPM of every retained, non-empty file, in file
        order.
    :raises IndexError: If a path contains no ``/`` or its file name no ``_``.
    """
    # None instead of a mutable [] default; normalised so len() below still works.
    if websites is None:
        websites = []

    median_cpms = []
    for bid_file in bid_files:
        bid_website = bid_file.rsplit('/', 1)[1].rsplit('_', 1)[1]

        # Decide before touching the disk, so excluded files are never read.
        if len(websites) > 0 and bid_website not in websites:
            continue

        file_cpms = [json.loads(bid)['cpm'] for bid in utilities.read_file(bid_file)]

        # statistics.median raises StatisticsError on empty data. A file without
        # bids has no median, so it contributes nothing - the same way it
        # contributes nothing to return_bid_cpms.
        if file_cpms:
            median_cpms.append(statistics.median(file_cpms))

    return median_cpms


def return_common_websites(persona_list, data_dir):
    """Return the websites that have a bid file for every persona.

    For each persona the files under ``<data_dir>/<persona>/website_bids/`` are
    listed with ``utilities.get_files_in_a_directory`` and mapped to website names
    with ``get_websites_with_bids``; the per-persona sets are then intersected.

    :param persona_list: Iterable of persona directory names.
    :param str data_dir: Directory that contains one sub-directory per persona.
    :return: Set of website names present for all personas; an empty set when
        ``persona_list`` is empty (this used to raise ``IndexError``).
    """
    common_websites = None

    for persona in persona_list:
        persona_dir = os.path.join(data_dir, persona + '/website_bids/')
        persona_websites = set(
            get_websites_with_bids(utilities.get_files_in_a_directory(persona_dir))
        )
        if common_websites is None:
            # First persona seeds the intersection.
            common_websites = persona_websites
        else:
            common_websites = common_websites.intersection(persona_websites)

    return set() if common_websites is None else common_websites


def ptable_to_csv(table, filename, headers=True):
    """Save PrettyTable results to a CSV file.

    Adapted from @AdamSmith https://stackoverflow.com/questions/32128226

    The table is rendered with ``table.get_string()`` and every rendered line that
    contains a ``|`` separator is split into stripped cells. Only the empty
    strings produced by the table's outer borders are discarded, so a genuinely
    empty cell stays in its column instead of shifting the rest of the row left.
    Rows are written with ``csv.writer``, which quotes cells containing commas or
    quotes. A cell that itself contains ``|`` is still split - the text rendering
    cannot distinguish it from a column separator.

    :param PrettyTable table: Table object to get data from.
    :param str filename: Filepath for the output CSV.
    :param bool headers: Whether to include the header row in the CSV.
    :return: None
    """
    rows = []
    for line in table.get_string().splitlines():
        cells = [cell.strip() for cell in line.split('|')]
        if len(cells) <= 1:
            # Horizontal rule (or any other line without cell separators).
            continue
        # A line framed as "| a | b |" splits into ['', 'a', 'b', '']: drop just
        # those two border artefacts and keep every interior cell, empty or not.
        if cells[0] == '':
            cells = cells[1:]
        if cells and cells[-1] == '':
            cells = cells[:-1]
        rows.append(cells)

    # With a title, the first rendered row is the title line, not data.
    if table.title is not None:
        rows = rows[1:]
    if not headers:
        rows = rows[1:]

    # newline='' is what the csv module expects; '\n' keeps the line endings the
    # previous hand-written output used.
    with open(filename, 'w', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(rows)

### Reading bids for personas

In [16]:
def compute_bid_statistics(persona_list, data_dir, common_websites=None, get_median=False):
    """Compute summary CPM statistics for every persona.

    For each persona the bid files under ``<data_dir>/<persona>/website_bids/`` are
    listed with ``utilities.get_files_in_a_directory``. CPM values are gathered
    with ``return_median_cpms`` (one median per file) when ``get_median`` is true,
    otherwise with ``return_bid_cpms`` (every bid).

    A statistic that cannot be computed is reported as ``'-'`` instead of raising
    ``statistics.StatisticsError`` / ``ValueError``: median, mean, min and max
    need at least one value, the sample standard deviation at least two. This
    matters once ``common_websites`` filters a persona down to very few bids.

    :param persona_list: Iterable of persona directory names.
    :param str data_dir: Directory that contains one sub-directory per persona.
    :param common_websites: Optional collection of website names; when non-empty,
        only bid files of these websites contribute CPM values. ``None`` or an
        empty collection means "no filtering" - an empty intersection is therefore
        indistinguishable from not filtering at all.
    :param bool get_median: Use per-file medians instead of individual bids.
    :return: Dict mapping persona name to a dict with the keys ``'normal-forced'``,
        ``'median'``, ``'mean'``, ``'min'``, ``'max'``, ``'sd'`` (strings with three
        decimals, or ``'-'``), ``'total_bids'`` (number of CPM values used),
        ``'total_websites'`` (number of bid files, unaffected by the filter) and
        ``'common_websites'`` (size of the filter, or ``'-'`` when not filtering).
    """
    # None instead of a mutable [] default; normalised because the value is passed
    # on to the CPM helpers and measured with len() below.
    if common_websites is None:
        common_websites = []

    missing = '-'

    def _fmt(value):
        return "{0:0.3f}".format(value)

    persona_stat_map = {}

    for persona in persona_list:
        persona_dir = os.path.join(data_dir, persona + '/website_bids/')
        bid_files = utilities.get_files_in_a_directory(persona_dir)

        stats = {'normal-forced': get_forced_normal_websites_with_bids(bid_files)}

        if get_median:
            cpms = return_median_cpms(bid_files, common_websites)
        else:
            cpms = return_bid_cpms(bid_files, common_websites)

        has_values = len(cpms) > 0
        stats['median'] = _fmt(statistics.median(cpms)) if has_values else missing
        stats['mean'] = _fmt(statistics.mean(cpms)) if has_values else missing
        stats['min'] = _fmt(min(cpms)) if has_values else missing
        stats['max'] = _fmt(max(cpms)) if has_values else missing
        # statistics.stdev is the sample standard deviation: needs two data points.
        stats['sd'] = _fmt(statistics.stdev(cpms)) if len(cpms) > 1 else missing

        stats['total_bids'] = len(cpms)
        normal, forced = stats['normal-forced'].split('/')
        stats['total_websites'] = int(normal) + int(forced)

        stats['common_websites'] = len(common_websites) if len(common_websites) > 0 else '-'

        persona_stat_map[persona] = stats

    return persona_stat_map

In [22]:
def print_bid_statistics(persona_stat_map, persona_list):
    """Render the per-persona statistics as a PrettyTable, print it and return it.

    The table has a ``Measure`` column followed by one column per persona, and
    one row per reported statistic.

    :param dict persona_stat_map: Mapping of persona name to its statistics dict,
        as produced by ``compute_bid_statistics``.
    :param list persona_list: Persona names, in the column order to display.
    :return: The populated PrettyTable.
    """
    # (row label, key in the per-persona statistics dict), in display order.
    row_layout = (
        ('# of websites', 'total_websites'),
        ('Common websites', 'common_websites'),
        ('Normal/Forced', 'normal-forced'),
        ('Median', 'median'),
        ('Mean', 'mean'),
        ('SD', 'sd'),
        ('Min', 'min'),
        ('Max', 'max'),
    )

    table = PrettyTable(['Measure'] + persona_list, hrules=ALL)
    for label, stat_key in row_layout:
        values = [persona_stat_map[name][stat_key] for name in persona_list]
        table.add_row([label] + values)

    print(table)

    return table

## Consider all websites and all bids

In [29]:
# Every individual bid on every website, for control and test personas together.
persona_stat_map = compute_bid_statistics(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
)
table = print_bid_statistics(
    persona_stat_map=persona_stat_map,
    persona_list=control_list + test_list,
)
ptable_to_csv(table=table, filename=RESULTS_DIR + 'all-bids.csv')

+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain  | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28   |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+--------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+--------------

### Consider bids on websites where bids were returned for all personas

In [38]:
# Restrict the statistics to websites that have a bid file for every persona.
common_websites = return_common_websites(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
)

persona_stat_map = compute_bid_statistics(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
    common_websites=common_websites,
)
table = print_bid_statistics(
    persona_stat_map=persona_stat_map,
    persona_list=control_list + test_list,
)

ptable_to_csv(table=table, filename=RESULTS_DIR + 'common-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

### Consider median bids on websites

In [32]:
# One median CPM per bid file (instead of every bid), across all websites.
persona_stat_map = compute_bid_statistics(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
    get_median=True,
)
table = print_bid_statistics(
    persona_stat_map=persona_stat_map,
    persona_list=control_list + test_list,
)
ptable_to_csv(table=table, filename=RESULTS_DIR + 'all-median-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

### Consider median bids on websites where bids were returned for all personas

In [37]:
# Per-file median CPMs, restricted to websites that have a bid file for every persona.
common_websites = return_common_websites(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
)

persona_stat_map = compute_bid_statistics(
    persona_list=control_list + test_list,
    data_dir=DATA_DIR,
    common_websites=common_websites,
    get_median=True,
)
table = print_bid_statistics(
    persona_stat_map=persona_stat_map,
    persona_list=control_list + test_list,
)
ptable_to_csv(table=table, filename=RESULTS_DIR + 'common-median-bids.csv')

+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|     Measure     | Plain | Amazon-Only | No-Skill | Connected-Car | Dating | Fashion-Style | Pets-Animals | Religion-Spirituality | SmartHome | Wine-Beverages | Health-Fitness | Navigation-TripPlanners |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+-------------------------+
|  # of websites  |   28  |      31     |    32    |       30      |   28   |       28      |      27      |           32          |     25    |       24       |       28       |            25           |
+-----------------+-------+-------------+----------+---------------+--------+---------------+--------------+-----------------------+-----------+----------------+----------------+--

In [None]:
# Display the websites shared by all personas, as computed by return_common_websites above.
common_websites

In [None]:
# Debug helper: parse and print the first bid of the first file in test_bid_files,
# then stop (both loops break after one iteration).
# NOTE(review): test_bid_files is not defined in any visible cell, so this cell raises
# NameError on a fresh kernel - define it here or drop the cell before sharing.
for bid_file in test_bid_files:
    all_bids = utilities.read_file(bid_file)

    for bid in all_bids:
        bid_json = json.loads(bid)
        print(bid_json)
        break
    break

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel as written - ttest is only
# available if the commented-out scipy import in the first cell is restored, and
# common_control_cpms / common_test_cpms are not defined in any visible cell.
ttest(common_control_cpms, common_test_cpms)