## Generating binary categories for training
**Steps**

- Setup categoricals for each segment based on author consensus
- Assign a binary value to each category of segment, in corresponding category columns (one-hot encoding)

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# from sqlalchemy import create_engine
# from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
#Connect to db
dbname = 'beforeiagree_db'
username = 'peterostendorp'

#Create engine
con = psycopg2.connect(database = dbname, user = username)

In [5]:
#Get annotations df
sql = """
SELECT * FROM annotations
WHERE "Policy UID" IN
(SELECT "Policy UID" FROM sites
WHERE "In 115 Set?" = TRUE);
"""
annotations = pd.read_sql_query(sql,con)

In [6]:
#Get sites df
sql = """
SELECT * FROM sites
WHERE "In 115 Set?" = TRUE;
"""
sites = pd.read_sql_query(sql,con)

In [7]:
#Get segments df
sql = """
SELECT * FROM segments
WHERE "Policy UID" IN
(SELECT "Policy UID" FROM sites
WHERE "In 115 Set?" = TRUE)
"""

segments = pd.read_sql_query(sql,con)

In [159]:
#df.groupby("date").agg({"duration": np.sum, "user_id": lambda x: x.nunique()})
#            duration  user_id
print(annotations['Policy UID'].nunique())
annotated_segments = pd.DataFrame(annotations.groupby('Policy UID').agg(lambda x: x.nunique())['segment_id'])
print(annotated_segments)
annotations.head()

115
            segment_id
Policy UID            
20                  36
21                  36
26                  68
32                  29
33                  65
58                  47
59                  13
70                  71
82                  46
93                  40
98                  41
105                 37
133                 43
135                 39
144                 31
164                 17
175                 34
186                 47
200                 43
202                 26
207                 34
228                 25
303                 54
320                 44
325                 20
331                 17
348                 62
359                 41
394                 42
414                 23
...                ...
1206                17
1221                 6
1224                19
1252                25
1259                56
1261                13
1264                14
1300                82
1306                26
1360                19
1361   

Unnamed: 0,Policy UID,annotation_id,batch_id,annotator_id,segment_id,category_name,attributes_value_pairs,date,policy_url
0,1017,20137,test_category_labeling_highlight_fordham_aaaaa,121,0,Other,"{""Other Type"": {""selectedText"": ""Sci-News.com ...",NaT,http://www.sci-news.com/privacy-policy.html
1,1017,20324,test_category_labeling_highlight_fordham_aaaaa,121,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",NaT,http://www.sci-news.com/privacy-policy.html
2,1017,20325,test_category_labeling_highlight_fordham_aaaaa,121,1,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""nformati...",NaT,http://www.sci-news.com/privacy-policy.html
3,1017,20326,test_category_labeling_highlight_fordham_aaaaa,121,2,Data Retention,"{""Personal Information Type"": {""selectedText"":...",NaT,http://www.sci-news.com/privacy-policy.html
4,1017,20327,test_category_labeling_highlight_fordham_aaaaa,121,3,First Party Collection/Use,"{""Collection Mode"": {""selectedText"": ""Not sele...",NaT,http://www.sci-news.com/privacy-policy.html


In [201]:
print(segments['Policy UID'].nunique())
n_segments = pd.DataFrame(segments.groupby('Policy UID').count()['segment_id'])
print(n_segments['segment_id'].sum())
print(n_segments)
segments.head()

115
6469
            segment_id
Policy UID            
20                  36
21                  36
26                  68
32                 348
33                 585
58                  47
59                  13
70                 142
82                  92
93                  40
98                  41
105                 37
133                 43
135                 39
144                186
164                 17
175                 34
186                376
200                 43
202                 26
207                 68
228                150
303                 54
320                 44
325                 20
331                 17
348                 62
359                 41
394                 42
414                 23
...                ...
1206                17
1221                 6
1224                19
1252                25
1259                56
1261                13
1264                14
1300               164
1306                26
1360                19
13

Unnamed: 0,Policy UID,segment_id,segments
0,20,0,<strong> Privacy Policy </strong> <br> <br> <s...
1,20,1,This privacy policy does not apply to Sites ma...
2,20,2,"By visiting our Sites, you are accepting the p..."
3,20,3,<strong> What Information Is Collected? </stro...
4,20,4,<strong> Personally Identifiable Information <...


In [134]:
print(sites['Policy UID'].nunique())
print(sites['Site UID'].nunique())
sites.head()

115
115


Unnamed: 0,Policy UID,Site UID,Site URL,Site Human-Readable Name,Site Check Date,In 115 Set?,Comments,Sector,Policy URL,Policy collection date,Policy last updated date,policy_text
0,20,1,theatlantic.com,The Atlantic,2016-02-08,True,"Alexa Rank: 975 (Global), 289 (US)",Arts,theatlantic.com/privacy-policy/,2015-07-02,2015-01-01,<strong> Privacy Policy </strong> <br> <br> <s...
1,21,2,imdb.com,IMDb,2016-02-08,True,"Alexa Rank: 49 (Global), 27 (US)",Arts,imdb.com/privacy,2015-07-02,2014-12-05,"IMDb Privacy Notice <br> <br>|||Last Updated, ..."
2,26,3,nytimes.com,New York Times,2016-02-08,True,"Alexa Rank: 101 (Global), 22 (US)",Arts,nytimes.com/privacy,2015-07-08,2015-06-10,<strong> Privacy Policy </strong> <br> <br> La...
3,32,4,theverge.com,The Verge,2016-02-08,True,"Alexa Rank: 525 (Global), 230 (US)",Home,voxmedia.com/privacy-policy,2015-07-02,2014-05-01,Vox Media Privacy Policy <br> <br>|||<strong> ...
4,33,5,nbc.com,NBC Universal,2016-02-08,True,"Alexa Rank: 1548 (Global), 426 (US)",Arts,nbcuniversal.com/privacy/full-privacy-policy,2015-07-02,2015-01-14,Full Privacy Policy <br> <br> Last updated: 14...


There are many cases where the number of segments in a doc far exceeds the number of annotated segments.

In [166]:
#How do annotated segments compare to total number of segments?
for i in range(0,115):
    if annotated_segments.iloc[i]['segment_id'] == n_segments.iloc[i]['segment_id']:
        print('Yes')
    else:
        print('No')

Yes
Yes
Yes
No
No
Yes
Yes
No
No
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
No
Yes
Yes
No
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
No
No
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
Yes
Yes
Yes
No
No
Yes


## This is the tricky bit where we join segments and annotations

In [43]:
#Join the annotations with the segments using an outer join
joined = pd.merge(annotations,segments,on=['Policy UID','segment_id'],how='outer')
joined['category_name'] = joined['category_name'].fillna(value='None')
joined = joined.drop(['batch_id','annotator_id','attributes_value_pairs','policy_url','date','annotation_id','segments'],axis=1)
#joined = seg_ind.merge(ann_ind)
print(joined.shape)
joined.head()

(40133, 3)


Unnamed: 0,Policy UID,segment_id,category_name
0,1017,0,Other
1,1017,0,Other
2,1017,0,Other
3,1017,0,Policy Change
4,1017,1,First Party Collection/Use


There are usually instances where a single segment gets multiple categories.

In [33]:
#Are there often cases where there are multiple category values for a single segment?
print(joined.groupby(['Policy UID','segment_id']).agg(lambda x: x.nunique())['category_name'])

Policy UID  segment_id
20          0             1
            1             1
            2             2
            3             1
            4             2
            5             1
            6             2
            7             1
            8             1
            9             2
            10            1
            11            2
            12            1
            13            1
            14            2
            15            1
            16            2
            17            2
            18            1
            19            1
            20            1
            21            2
            22            2
            23            3
            24            1
            25            1
            26            2
            27            2
            28            1
            29            1
                         ..
1713        59            2
            60            3
            61            1
            62           

In [52]:
#Get the mode of each segment
mode_categories = joined.groupby(['Policy UID','segment_id']).agg(lambda x: x.value_counts().index[0])
mode_categories = mode_categories.reset_index()
print(mode_categories.shape)
mode_categories.head()

(3792, 3)


Unnamed: 0,Policy UID,segment_id,category_name
0,20,0,Other
1,20,1,Other
2,20,2,Policy Change
3,20,3,First Party Collection/Use
4,20,4,First Party Collection/Use


In [59]:
#Append list of segments that were never annotated.
mode_categories = segments.merge(mode_categories,on=['Policy UID','segment_id'],how='outer')

In [60]:
categories = list(mode_categories['category_name'].unique())
print(categories)
cols = {'Other': 'other',
        'Policy Change': 'policy_change',
        'First Party Collection/Use': 'first_party_collection_use',
        'Third Party Sharing/Collection': 'third_party_sharing_collection',
        'Do Not Track': 'do_not_track',
        'User Choice/Control': 'user_choice_control',
        'International and Specific Audiences': 'international_specific_audiences',
        'Data Security': 'data_security',
        'Data Retention': 'data_retention',
        'User Access, Edit and Deletion': 'user_access_edit_deletion'}

['Other', 'Policy Change', 'First Party Collection/Use', 'Third Party Sharing/Collection', 'User Choice/Control', 'Do Not Track', 'International and Specific Audiences', 'Data Security', 'Data Retention', 'User Access, Edit and Deletion']


## Segment-level categorization
Loop through categories and generate one-hot encoding of each category and stick them into new columns with the corresponding names in cols

In [62]:
#Loop through the categories and generate a set of new columns with names in cols
binary_categories = pd.DataFrame()

for category in categories:
    one_hot = lambda s: 1 if s.startswith(category) else 0
    binary_categories[cols[category]] = mode_categories['category_name'].apply(one_hot)

In [64]:
print(binary_categories.shape)
binary_categories.head()

(6469, 10)


Unnamed: 0,other,policy_change,first_party_collection_use,third_party_sharing_collection,user_choice_control,do_not_track,international_specific_audiences,data_security,data_retention,user_access_edit_deletion
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0


In [65]:
#Create engine for persisting
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

In [66]:
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://peterostendorp@localhost/beforeiagree_db


In [67]:
binary_categories.to_sql('binary_segment_categories',engine, if_exists='replace')

## Policy-level categorization

In [39]:
binary_categories_policy = binary_categories.reset_index(level=[0,1]).groupby('Policy UID').sum().drop('segment_id',axis=1)
binary_categories_policy = binary_categories_policy.applymap(lambda x: 1 if x > 0 else 0)
binary_categories_policy.head()

Unnamed: 0_level_0,other,policy_change,first_party_collection_use,third_party_sharing_collection,do_not_track,user_choice_control,international_specific_audiences,data_security,data_retention,user_access_edit_deletion
Policy UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20,1,1,1,1,1,1,1,1,1,1
21,1,1,1,1,0,1,1,1,0,1
26,1,1,1,1,0,1,1,1,0,1
32,1,1,1,1,0,1,1,1,1,0
33,1,1,1,1,0,1,1,1,1,1


In [40]:
binary_categories_policy.sum()

other                               112
policy_change                        83
first_party_collection_use          112
third_party_sharing_collection      110
do_not_track                         22
user_choice_control                  98
international_specific_audiences     85
data_security                        94
data_retention                       25
user_access_edit_deletion            73
dtype: int64

Most policies contain info on:
- Other
- First and third party data collection
- user choice control
- data security

Less frequently mentioned:
- data retention
- user access edit deletion
- do not track

In [41]:
binary_categories_policy.to_sql('binary_policy_categories',engine)