**Steps**
- bring in target from binary_categories table for data encryption practices ONLY FOR sites in OPP-115
- bring in features from segment table ONLY FOR sites in OPP-115
- check for length!
- set up text processing functions
- set up BOW >> TFIDF >> Naive Bayes pipeline
- train-test-split data
- fit, predict, check classification
- wash, rinse, repeat
- pickle out trained model

In [14]:
#Basic imports
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
#Database imports and credentials
import psycopg2

In [3]:
#Establish database connection
dbname = 'beforeiagree_db'
username = 'peterostendorp'

#Create engine
con = psycopg2.connect(database = dbname, user = username)

## Target-level model

In [15]:
sql = """
SELECT "Policy UID", segment_id, data_security FROM binary_segment_categories
WHERE binary_segment_categories."Policy UID" IN
(SELECT "Policy UID" FROM sites
WHERE sites."In 115 Set?" IS TRUE);
"""

targets_segments = pd.read_sql_query(sql,con)

In [58]:
targets_segments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3792 entries, 0 to 3791
Data columns (total 3 columns):
Policy UID       3792 non-null int64
segment_id       3792 non-null int64
data_security    3792 non-null int64
dtypes: int64(3)
memory usage: 89.0 KB


In [17]:
targets_segments['data_security'].value_counts()

0    3625
1     167
Name: data_security, dtype: int64

In [18]:
targets_segments['data_security'].value_counts()[1]/targets['data_security'].count()

0.044040084388185657

Note: only about 4.4% of segments in these documents pertain to this topic.

In [19]:
targets_segments.groupby('Policy UID').sum()['data_security'].value_counts()

1    57
0    21
2    20
3     9
6     4
4     3
7     1
Name: data_security, dtype: int64

But most **documents** have at least 1 mention, if not several.

In [52]:
#Get segments associated with policies in the OPP-115 corpus
sql = """
SELECT * FROM segments
WHERE "Policy UID" IN 
(SELECT "Policy UID" FROM sites
WHERE "In 115 Set?" = TRUE)
"""

segments = pd.read_sql_query(sql,con)

In [61]:
segments.head()

Unnamed: 0,Policy UID,segment_id,segments
0,20,0,<strong> Privacy Policy </strong> <br> <br> <s...
1,20,1,This privacy policy does not apply to Sites ma...
2,20,2,"By visiting our Sites, you are accepting the p..."
3,20,3,<strong> What Information Is Collected? </stro...
4,20,4,<strong> Personally Identifiable Information <...


In [55]:
#Initial text-processing function for segments... not much needed here.
#Set up our initial text cleaning function
def text_process_segment(doc):
    """
    1. remove stopwords
    2. remove HTML tags
    """
    lst = [word for word in doc.split() if word.lower() not in stopwords.words('english')]
    return [word for word in lst if re.search(r'\<.*\>',word) is None]

In [56]:
#Set up an sklearn pipeline that processes policies, transforms them into a BOW model, applies TFIDF metric,
#then develops a Naive Bayes classifier.
data_encryption_NB_segment = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_segment)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

In [60]:
print(segments.shape)
print(targets_segments.shape)

(6469, 3)
(3792, 3)


In [57]:
#Split data using 30%/70% split, random seed is my birthday
segments_train, segments_test, targets_train, targets_test = train_test_split(segments['segments'], 
    targets_segments['data_security'], test_size=0.3, random_state=84)

ValueError: Found input variables with inconsistent numbers of samples: [6469, 3792]

## Policy-level model
### Multinomial Naive Bayes

In [63]:
#Select category targets
sql = """
SELECT "Policy UID",data_security FROM binary_policy_categories
WHERE "Policy UID" IN 
(SELECT "Policy UID" FROM sites
WHERE "In 115 Set?" = TRUE)
"""

targets_policies = pd.read_sql_query(sql,con)

In [32]:
targets_policies.head()

Unnamed: 0,Policy UID,data_security
0,20,1
1,21,1
2,26,1
3,32,1
4,33,1


In [72]:
#Select original policies as features
sql = """
SELECT "Policy UID", policy_text FROM sites
WHERE sites."In 115 Set?" = TRUE
"""

policies = pd.read_sql_query(sql,con)

In [50]:
policies.head()

Unnamed: 0,Policy UID,policy_text
0,20,<strong> Privacy Policy </strong> <br> <br> <s...
1,21,"IMDb Privacy Notice <br> <br>|||Last Updated, ..."
2,26,<strong> Privacy Policy </strong> <br> <br> La...
3,32,Vox Media Privacy Policy <br> <br>|||<strong> ...
4,33,Full Privacy Policy <br> <br> Last updated: 14...


In [71]:
#Set up our initial text cleaning function
def text_process_policy(doc):
    """
    1. remove punctuation
    2. remove stopwords
    3. remove HTML tags
    4. remove '|||' inserted into corpus documents only
    """
    lst = [word for word in doc.split() if re.search(r'\<.*\>',word) is None]
    lst = ' '.join(lst)
    lst = [char for char in lst if char not in string.punctuation]
    lst = ''.join(lst)    
    lst = [word for word in lst.split() if word.lower() not in stopwords.words('english')]
    lst = [word for word in lst if word.replace('|||','')]
    return ' '.join(lst)

In [73]:
#Set up an sklearn pipeline that processes policies, transforms them into a BOW model, applies TFIDF metric,
#then develops a Naive Bayes classifier.
data_encryption_NB_policy = Pipeline([
    ('bow',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

In [74]:
#Split data using 30%/70% split, random seed is my birthday
policies_train, policies_test, targets_train, targets_test = train_test_split(policies['policy_text'], 
    targets_policies['data_security'], test_size=0.3, random_state=84)

In [75]:
policies_train = policies_train.map(text_process_policy)
policies_test = policies_test.map(text_process_policy)

In [76]:
policies_train.head()

4      Full Privacy Policy Last updated 14 January 20...
22     reddit privacy policy effective Apr 14 2015 pr...
14     VIPrivacy Policy Type Information Service Coll...
112    Privacy Policy updated August 26 2014 Commitme...
98     Last updated October 1 2013 Games Inca United ...
Name: policy_text, dtype: object

In [77]:
#Now fit/train the model
data_encryption_NB_policy.fit(policies_train,targets_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [78]:
#Predict
preds_policy = data_encryption_NB_policy.predict(policies_test)

In [79]:
#Report
print(confusion_matrix(targets_test,preds_policy))
print(classification_report(targets_test,preds_policy))

[[ 0  4]
 [ 0 31]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.89      1.00      0.94        31

avg / total       0.78      0.89      0.83        35



  'precision', 'predicted', average, warn_for)


### Adaptive boosting

In [80]:
from sklearn.ensemble import AdaBoostClassifier

In [81]:
cms = {}
reports = {}

**TO DO**
- consider different tokenizer
- bigrams?
- pass in legal dictionary

In [82]:
#Set up an sklearn pipeline that processes policies, transforms them into a BOW model, applies TFIDF metric,
#then develops an AdaBoost classifier with 100 weak learners.
data_encryption_ADA_policy = Pipeline([
    ('bow',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('classifier',AdaBoostClassifier(n_estimators=100))
])

In [83]:
#Now fit/train the model
data_encryption_ADA_policy.fit(policies_train,targets_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None))])

In [84]:
#Predict
preds_policy = data_encryption_ADA_policy.predict(policies_test)

In [85]:
#Report
n_estimators = data_encryption_ADA_policy.named_steps['classifier'].n_estimators
cms[n_estimators] = confusion_matrix(targets_test,preds_policy)
reports[n_estimators] = classification_report(targets_test,preds_policy)
print(cms[n_estimators])
print(reports[n_estimators])

[[ 3  1]
 [ 7 24]]
             precision    recall  f1-score   support

          0       0.30      0.75      0.43         4
          1       0.96      0.77      0.86        31

avg / total       0.88      0.77      0.81        35



In [86]:
import pickle

In [87]:
with open('../pickles/data_encryption_ADA_policy.pkl', 'wb') as file:
    pickle.dump(data_encryption_ADA_policy,file)

## Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
#Set up an sklearn pipeline that processes policies, transforms them into a BOW model, applies TFIDF metric,
#then develops an AdaBoost classifier with 100 weak learners.
data_encryption_RF_policy = Pipeline([
    ('bow',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])

In [90]:
#Fit
data_encryption_RF_policy.fit(policies_train,targets_train)

Pipeline(steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [91]:
#Predict
preds_policy = data_encryption_RF_policy.predict(policies_test)

In [92]:
#Report
cm = confusion_matrix(targets_test,preds_policy)
cr = classification_report(targets_test,preds_policy)

In [93]:
#Balanced weighting
print(cm)
print(cr)

[[ 2  2]
 [ 4 27]]
             precision    recall  f1-score   support

          0       0.33      0.50      0.40         4
          1       0.93      0.87      0.90        31

avg / total       0.86      0.83      0.84        35



In [94]:
# with open('filename.pickle', 'wb') as handle:
#     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../pickles/data_encryption_RF_policy.pkl', 'wb') as file:
    pickle.dump(data_encryption_RF_policy,file)