Main entrypoint for the PoliAnalyzer. Three main steps are supported and demonstrated in this notebook:

1. Use NLP pipeline to create structural representation of privacy practices in privacy policies, and construct a knowledge graph
  
2. Construct actionable formal policies based on the information, such as *app policy* for [perennial semantic Data Terms of Use](https://dl.acm.org/doi/10.1145/3589334.3645631)
  
3. Perform reasoning to check compliance of the constructed *app policy* against a user profile indicating user's preferences

## 0. Set up

Basic models and set-up steps

In [None]:
%reload_ext autoreload
%autoreload 2

# import logging
# logging.basicConfig(level=logging.INFO)

from copy import deepcopy
import os
from pathlib import Path
from pprint import pprint
from tqdm.auto import tqdm
import pp_analyze
from pp_analyze import analyze_pp, bulk_analyze_pp
from pp_analyze import kg, dtou, utils
from pp_analyze import statistics as stats
from pp_analyze import hierarchy_helper as hh
from pp_analyze import user_preference_analyze as upa
from pp_analyze import website_compliance_evaluation as wce
from pp_analyze.data_model import DataEntity, PurposeEntity

from dotenv import load_dotenv
load_dotenv()

# If you want to run Step 3, set this variable. This is the directory where the user personas are stored.
# Each user persona is a directory containing all relevant RDF files encoding information about the *data policy* in psDToU.
# The empty default is passed unchanged to wce.get_pesonas_under_dir() in the multi-website cell of Step 3.
user_persona_collection_dir = ''

## Test

Run this cell to verify that you have a working set-up for Step 1 and Step 2.

In [None]:
# s = '''
# These companies may collect information about how you use our services over time, and combine it with similar information from other services and companies.
# This information may be used to analyze and track data, determine the popularity of certain content, and under your online activity, among other things.
# We are constantly collecting and updating information about the things you like or dislike, so we can provide you with more relevant data, more relevant ads, and a better user experience.
# '''

# Make sure data entities are recognized for this segment -- #79 in evaluation
s = '''
Location information \u2013 we collect information about your general location (such as city and country). For example, we may use the IP address to identify your general location. This information does not tell us where your device is precisely located. This information is sent as a normal part of internet traffic. In addition, we also collect implicit location information, which allows us to infer that you are either interested in a place or that you might be at the place \u2013 this information does not actually tell us where your device is precisely located.
'''

# NOTE(review): the alternative call below references QueryCategory, which is not imported in this notebook.
# result = analyze_pp(s, override_cache={QueryCategory.DATA_CLASSIFICATION,QueryCategory.DATA_ENTITY,QueryCategory.DATA_PRACTICE})
# analyze_pp is awaited directly at cell level, so this cell must run in a kernel that supports top-level `await`.
result = await analyze_pp(s, batch=True)
# Last expression of the cell: display the analysis result.
result

## 1. Analyze PP practices, and construct KG

### Analyze (query) PP practices

In [None]:
# Fetch the websites to analyze.
# NOTE(review): `varient` looks like a misspelling of `variant`; it has to match the parameter name of utils.get_website_list — confirm before renaming.
website_list = utils.get_website_list(varient='tranco', max_num=300)

# website_list = ['msn.com']


# Analyze the privacy policies of all listed websites; the awaited result is unpacked below.
ret = await bulk_analyze_pp(website_list, override_cache=False, batch=True, max_num=100, non_breaking=True, discard_return=False)

# The result unpacks into three parts: identified practices, failed tasks and errors.
practices_ori, failed_tasks, errors = ret

# Work on a deep copy so that `practices_ori` keeps the unmodified analysis output.
practices = deepcopy(practices_ori)

# Last expression of the cell: display everything that was returned.
practices_ori, failed_tasks, errors


### Post-processing of identified practices

Currently only one type of action is used: lifting category hierarchy.
This is useful if you do not want to use low-level concepts in the category tree, and only use concepts in upper-levels.

For example, you want `Advertising` (parent class) instead of `PersonalisedAdvertising` (a subclass of `Advertising`).

In [None]:
# Start again from the untouched analysis output so that this cell can be re-run safely.
practices = deepcopy(practices_ori)
# practices = {k: v for i, (k, v) in enumerate(practices.items()) if i > 50 and i < 55}

# Category names passed to `lift()` as the lifting targets.
lift_to = ['Contacts', 'SocialCommunication', 'MedicalHealth', 'Location', 'Picture']

for ws, site_practices in practices.items():
    # Materialise the list first: lift() has to be called on every practice,
    # so the truthiness check must not short-circuit the calls.
    lift_results = [practice.lift(lift_to) for practice in site_practices]
    if any(lift_results):
        print(f'Changed: {ws}')

# practices['facebook.com']
practices.keys()

### Select target for analysis below

If you want to use the targeted analysis shown later in the notebook, run this cell. If you only want the bulk general analysis, there is no need.

In [None]:
# `website_choice` is a (domain, display name) pair: the single-website cells further down
# read it as website_choice[0] (key into `practices`) and website_choice[1] (name).
# Binding a bare string here would make those cells index single characters instead.
# website_choice = ('facebook.com', 'Facebook')

# Default to the first analyzed website; the domain doubles as the display name,
# matching what the bulk conversion cells pass for every website.
_first_website = next(iter(practices))
website_choice = (_first_website, _first_website)
website_choice

### View statistics of identified practices (after post-process)

In [None]:
# Statistics are computed over every analyzed website by default.
target_practices = practices
# target_practices = {website_choice: practices[website_choice]}

# field_count = stats.calc_practice_field_count(practices[website_choice[0]])

# res = stats.calc_count_stats(field_count)
# res

# Flatten the per-website practice lists into one list.
all_practices = []
for site_practices in target_practices.values():
    all_practices.extend(site_practices)

# stats.calc_practice_entity_count(practices[website_choice[0]])
# for entity_type, v in stats.calc_practice_entity_count([i for ws in practices.values() for i in ws]).items():
#     print(entity_type)
#     for entity, count in sorted(v.items(), key=lambda x: x[1], reverse=True):
#         print(f'  {entity}: {count}')


def print_for_hierarchy(hierarchy, typed_node_with_count, depth=0):
    """Print the counted nodes following the nesting of ``hierarchy``.

    Args:
        hierarchy: Nested mapping ``{node: {child: {...}}}``; every value must
            itself be a mapping (it is recursed into via ``.items()``).
        typed_node_with_count: Mapping from node to the count to print.
        depth: Current nesting level; one space of indentation per level.

    Returns:
        The set of nodes that were printed. At ``depth == 0`` this also
        includes the counted nodes that do not occur in ``hierarchy``, which
        are printed unindented after the tree.
    """
    seen = set()
    indent = ' ' * depth
    for node, children in hierarchy.items():
        if node in typed_node_with_count:
            print(indent + f'{node}: {typed_node_with_count[node]}')
            seen.add(node)
        # Always descend: a counted node may sit below an uncounted one.
        seen |= print_for_hierarchy(children, typed_node_with_count, depth + 1)

    if depth != 0:
        return seen

    # Top-level call only: list the counted nodes the hierarchy does not know.
    for node, count in typed_node_with_count.items():
        if node in seen:
            continue
        print(f'{node}: {count}')
        seen.add(node)
    return seen

# Count data and purpose entities over all practices; the result is indexed by entity class below.
node_with_count = stats.calc_data_and_purpose_entity_count_with_hierarchy(all_practices, accumulate_to_parent=False)

# Print the counts along each hierarchy; the returned set is bound to `_` so it is not shown as cell output.
_ = print_for_hierarchy(hh.get_data_category_hierarchy(), node_with_count[DataEntity], 0)
_ = print_for_hierarchy(hh.get_purpose_hierarchy(), node_with_count[PurposeEntity], 0)

### Convert to KG

#### Convert single

In [None]:
# `website_choice` may be a bare domain string or a (domain, display name) pair.
# Normalise it first: indexing a string with [0] / [1] would yield single
# characters and fail the lookup in `practices`.
if isinstance(website_choice, str):
    ws_domain, ws_name = website_choice, website_choice
else:
    ws_domain, ws_name = website_choice

# Build the knowledge graph for the selected website and show its serialization.
g = kg.convert_to_kg(practices[ws_domain], ws_domain, ws_name)
pprint(g.serialize())

#### Convert all and store

In [None]:
# The dump location is derived from QUERY_CACHE_DIR; fail with a clear message
# instead of a TypeError on `None / 'kg'` when the variable is missing or empty.
_cache_dir_env = os.getenv("QUERY_CACHE_DIR")
if not _cache_dir_env:
    raise RuntimeError("QUERY_CACHE_DIR is not set; cannot decide where to store the knowledge graphs")
_CACHE_DIR = Path(_cache_dir_env)

dump_target_dir = _CACHE_DIR / 'kg'
# Make sure the target directory exists before writing into it.
dump_target_dir.mkdir(parents=True, exist_ok=True)

# One Turtle file per website; the domain is used as both website and name.
for website in tqdm(list(practices.keys())):
    g = kg.convert_to_kg(practices[website], website, website)
    # turtle_str = g.serialize()
    g.serialize(dump_target_dir / f"{website}.ttl")

## 2. Convert to App Policy

Note that the `dtou.convert_to_app_policy()` function constructs the KG internally for each PP. So there is no need to pass KG as a variable into it.

#### Convert single

In [None]:
# `website_choice` may be a bare domain string or a (domain, display name) pair.
# Normalise it first: indexing a string with [0] / [1] would yield single
# characters and fail the lookup in `practices`.
if isinstance(website_choice, str):
    ws_domain, ws_name = website_choice, website_choice
else:
    ws_domain, ws_name = website_choice

# Build the app policy for the selected website and show its RDF serialization.
app_policy = dtou.convert_to_app_policy(practices[ws_domain], ws_domain, ws_name)
pprint(app_policy.to_rdf().serialize())

#### Convert all and store

In [None]:
# The dump location is derived from QUERY_CACHE_DIR; fail with a clear message
# instead of a TypeError on `None / 'dtou'` when the variable is missing or empty.
_cache_dir_env = os.getenv("QUERY_CACHE_DIR")
if not _cache_dir_env:
    raise RuntimeError("QUERY_CACHE_DIR is not set; cannot decide where to store the app policies")
_CACHE_DIR = Path(_cache_dir_env)

dump_target_dir = _CACHE_DIR / 'dtou'
# Make sure the target directory exists before writing into it.
dump_target_dir.mkdir(parents=True, exist_ok=True)

# One Turtle file per website; the domain is used as both website and name.
for website in tqdm(list(practices.keys())):
    app_policy = dtou.convert_to_app_policy(practices[website], website, website)
    app_policy.to_rdf().serialize(dump_target_dir / f"{website}.ttl")

## 3. Analyze based on user profile

#### Single website

In [None]:
# NOTE(review): this rebinds the notebook-level `website_choice` to a plain domain string, while the
# single-website conversion cells above index it as website_choice[0] / website_choice[1];
# re-select the target before re-running those cells.
website_choice = 'microsoftonline.com'
# The domain is passed for both of the first two arguments and is also the key into `practices`,
# so this website must be part of the analyzed set.
# '/path-to/single/user-persona' is a placeholder: point it at a real persona directory before running.
res, errs = await upa.analyze_pp_with_user_persona(website_choice, website_choice, data_practices=practices[website_choice],
                                            user_persona_dir='/path-to/single/user-persona',
                                            override_cache=True)
# Show the serialized result first, then whatever errors were returned alongside it.
pprint(res.serialize())
pprint(errs)

#### Multi-websites

In [None]:
from collections import defaultdict
from pathlib import Path
import asyncio
from asyncio import Semaphore
from tqdm.auto import tqdm
from pp_analyze import website_compliance_evaluation as wce

# Collect the personas under the collection directory configured in the set-up cell.
# `get_pesonas_under_dir` is spelled exactly as the helper is called here; do not "fix" it locally.
personas = wce.get_pesonas_under_dir(user_persona_collection_dir)

# Alternatively, list the personas to analyze by hand:
# personas = [
#     'allow-common-not-critical',
#     'location-no-advertising',
# ]


def print_results_all(conflicts, errors):
    """Pretty-print the outcome of a multi-persona compliance analysis.

    Args:
        conflicts: Mapping ``persona -> {website -> result}``; every result
            must offer ``serialize()``.
        errors: Errors collected during the analysis; reported only when truthy.

    Reads the notebook-level ``practices`` to obtain the full set of websites.
    """
    if errors:
        pprint(f"There are some unexpected errors during analysis {errors}")

    by_conflict_count = wce.to_websites_by_num_conflicts(conflicts, practices.keys())
    pprint(("Websites with #conflicts:", by_conflict_count))

    for persona, results_by_website in conflicts.items():
        print(f"Personas: {persona}")
        for ws, result in results_by_website.items():
            pprint((ws, result.serialize()), indent=2)

# Analyze every persona against the identified practices; the awaited call returns the conflicts and the errors.
conflicts, all_errors = await wce.analyze_personas(personas, practices, max_concurrency=6)
# Print the errors (if any), the websites grouped by conflicts, and the per-persona results.
print_results_all(conflicts, all_errors)

#### Check details of a website

In [None]:
# Show the serialized result of the first persona for one hard-coded website;
# the lookup fails with a KeyError if that website has no result for this persona.
pprint(conflicts[personas[0]]['microsoftonline.com'].serialize())

#### Select websites for statistics

In [None]:
# Groups of websites reported as having the same privacy policy.
ws_equal = wce.websites_with_same_pp(practices.keys())
pprint(ws_equal)

# Keep only the first website of every group. Collecting the others in a set
# and filtering avoids the ValueError that list.remove() raises when a
# duplicate is missing from the list or is listed in more than one group.
duplicate_websites = {ws for ws_e in ws_equal for ws in ws_e[1:]}

# Preserve the order of `website_list` and cap the selection at 100 websites.
websites_of_interest = [
    ws for ws in website_list
    if ws in practices and ws not in duplicate_websites
][:100]

len(websites_of_interest)

#### Calculate statistics

In [None]:
# websites_by_conflicts = wce.to_websites_by_num_conflicts(conflicts, practices.keys())
# Restrict the grouping to the selected websites; later cells iterate both results
# as mappings from a conflict count to the list of websites (or personas) with that count.
websites_by_conflicts = wce.to_websites_by_num_conflicts(conflicts, websites_of_interest)
personas_by_conflicts = wce.to_personas_by_num_conflicts(conflicts, personas, websites_of_interest)
pprint(websites_by_conflicts)
pprint(personas_by_conflicts)

In [None]:
import pandas as pd

sc_info = wce.get_segment_conflict_info(conflicts, websites_of_interest, practices)

# Per-website ratio rows, grouped by number of conflicts. Column order (same as
# the header printed by print_values):
#   0 practice_per_segment             info[4] / info[5]
#   1 practice_per_valid_segment       info[4] / info[3]
#   2 valid_segment_ratio              info[3] / info[5]
#   3 conflict_per_segment             info[0] / info[5]
#   4 conflict_per_valid_segment       info[0] / info[3]
#   5 conflict_per_distinct_segment    info[0] / info[2], or 0 when info[0] is 0
#   6 conflicting_valid_segment_ratio  info[2] / info[3]
# NOTE(review): an earlier labelled variant of this table computed column 6 from
# info[1] instead of info[2] — confirm which index is intended.
out_dict = {}
for num, websites in websites_by_conflicts.items():
    ratios_by_site = {}
    out_dict[num] = ratios_by_site
    for ws in websites:
        info = sc_info[ws]
        ratios_by_site[ws] = [
            info[4] / info[5],
            info[4] / info[3],
            info[3] / info[5],
            info[0] / info[5],
            info[0] / info[3],
            info[0] / info[2] if info[0] else 0,
            info[2] / info[3],
        ]

# Column-wise mean of the ratio rows within every conflict-count group.
out_dict2 = {}
for num, ratios_by_site in out_dict.items():
    rows = list(ratios_by_site.values())
    width = len(rows[0])
    totals = [0] * width
    for row in rows:
        for col in range(width):
            totals[col] += row[col]
    out_dict2[num] = [total / len(rows) for total in totals]
# pprint(out_dict2)

def print_values(out_dict2):
    """Print the averaged ratios as a table, one row per conflict count.

    Args:
        out_dict2: Mapping from conflict count to a list of numbers; rows are
            printed in ascending key order, each value formatted as ``20.3f``
            and followed by ``', '``.
    """
    print('     practice_per_segment, practice_per_valid_segment, valid_segment_ratio, conflict_per_segment, conflict_per_valid_segment, conflict_per_distinct_segment, conflicting_valid_segment_ratio')
    for num, values in sorted(out_dict2.items()):
        cells = ''.join(f'{value:20.3f}, ' for value in values)
        print(f'{num:2}: ' + cells)

# Print the averaged ratio table computed above.
print_values(out_dict2)

# Pull individual counters out of sc_info for every selected website.
# The index meanings below are taken from the variable names — TODO confirm against
# wce.get_segment_conflict_info: [5] segments, [3] valid segments, [2] conflicting segments, [6] conflicts.
number_segments = [sc_info[ws][5] for ws in websites_of_interest]
number_of_valid_segments = [sc_info[ws][3] for ws in websites_of_interest]
number_of_conflicting_segments = [sc_info[ws][2] for ws in websites_of_interest]
number_of_conflict_conflicts = [sc_info[ws][6] for ws in websites_of_interest]

# Totals over all selected websites.
print(f'Number of segments: {sum(number_segments)},  Number of valid segments: {sum(number_of_valid_segments)},  Number of conflicting segments: {sum(number_of_conflicting_segments)}, Number of conflict conflicts: {sum(number_of_conflict_conflicts)}')

In [None]:
# Per-website conflict rate; the result is indexed by website below.
conflict_rate = wce.calc_average_conflict_rate_by_segment_of_websites(conflicts, websites_of_interest, 5, practices)

# Regroup the rates by the number of conflicts of each website.
organized_conflict_rate = {
    num: {ws: conflict_rate[ws] for ws in websites}
    for num, websites in websites_by_conflicts.items()
}
pprint(organized_conflict_rate)

# Mean rate within every conflict-count group.
average_organized_conflict_rate = {
    num: sum(rates.values()) / len(rates)
    for num, rates in organized_conflict_rate.items()
}
pprint(average_organized_conflict_rate)

In [None]:
def _count_per_persona(count_fn):
    """Return ``{persona: {website: count_fn(website, result)}}`` for the websites of interest."""
    return {
        persona: {
            ws: count_fn(ws, res)
            for ws, res in results.items()
            if ws in websites_of_interest
        }
        for persona, results in conflicts.items()
    }


def _pivot_by_website(counts_by_persona):
    """Turn ``{persona: {website: n}}`` into ``{website: {persona: n}}`` over all keys of ``practices``."""
    pivoted = {}
    for persona, results in counts_by_persona.items():
        for ws in practices.keys():
            per_persona = pivoted.setdefault(ws, {})
            if ws in results:
                per_persona[persona] = results[ws]
    return pivoted


def _regroup(groups, lookup):
    """Return ``{count: {member: lookup(member)}}`` for a ``{count: [members]}`` mapping."""
    return {
        count: {member: lookup(member) for member in members}
        for count, members in groups.items()
    }


# Calculate the number of conflicts of every website
num_conflicts = _count_per_persona(lambda ws, res: wce.get_number_of_conflicts(res))
# pprint(num_conflicts)

num_conflict_segments = _count_per_persona(lambda ws, res: wce.get_number_of_conflicting_segments(res))
# pprint(num_conflict_segments)
num_conflict_practices = _count_per_persona(lambda ws, res: wce.get_number_of_conflicting_practices(res, practices[ws]))

# Same numbers, keyed by website first and persona second.
num_conflict_segments_by_website = _pivot_by_website(num_conflict_segments)
num_conflict_practice_by_website = _pivot_by_website(num_conflict_practices)


# Websites grouped by number of conflicts, with their per-persona segment counts.
websites_and_conflict_segments_by_conflicts = _regroup(
    websites_by_conflicts, lambda ws: num_conflict_segments_by_website[ws])

pprint(websites_and_conflict_segments_by_conflicts)

# Websites grouped by number of conflicts, with their per-persona practice counts.
websites_and_conflict_practices_by_conflicts = _regroup(
    websites_by_conflicts, lambda ws: num_conflict_practice_by_website[ws])

pprint(websites_and_conflict_practices_by_conflicts)


# The per-persona view is the structure computed above, under a second name.
num_conflict_segments_by_persona = num_conflict_segments


# Personas grouped by number of conflicts; personas without any entry map to an empty dict.
persona_and_conflict_segments_by_conflicts = _regroup(
    personas_by_conflicts, lambda persona: num_conflict_segments_by_persona.get(persona, {}))

pprint(persona_and_conflict_segments_by_conflicts)
# num_conflict_segments_by_persona['contact-ad-allow']
# personas_by_conflicts

In [None]:
# Leftover post-mortem debugging helper, disabled so that "Restart & Run All" never
# stops at an interactive debugger prompt. Uncomment and run it right after a failing cell.
# %debug