# Merge policies with scores

In [1]:
import os, json
#os.chdir("/Users/michelle/Dropbox/Privacy/PolicyData/Clean")
with open('output_with_manual.json','r') as f:
    output = json.load(f)
    
import pandas as pd
#os.chdir("/Users/michelle/Dropbox/Privacy/PolicyData/ClassificationManual")
scores = pd.read_excel('Policy-level Evaluation.xlsx')

#takes list of strings and returns a string concatenation
def get_policy_str(policy):
    policy_str = ''
    for sentence in policy:
        policy_str += sentence
    return policy_str

# Add a progress bar
from tqdm import tqdm
# Get policy
for i,dom in enumerate(tqdm(output)):
    if (output[dom]['access'] != "None") & (output[dom]['policy_clean'] != []):
        policy = output[dom]['policy_clean']
        policy_str = get_policy_str(policy)
        output[dom]['policy_str'] = policy_str
    else:
        output[dom]['policy_str'] = None

df = pd.DataFrame.from_dict(output, orient = 'index')
df['domain'] = df.index

#Manual typo corrections
scores.loc[scores['Website Name']== "www.reshaprelifesciences.com", 'Website Name'] = 'www.reshapelifesciences.com'
scores.loc[scores['Website Name']== "www.fundus.deutscheam.com", 'Website Name'] = 'fundsus.deutscheam.com'
scores.loc[scores['Website Name']== "www.ishares.com/us/products/280771", 'Website Name'] = 'www.ishares.com/us/products/280771/'
scores.loc[scores['Website Name']== "www.globalfunds.com/GXF", 'Website Name'] = 'www.globalxfunds.com/GXF'
scores.loc[scores['Website Name']== "www.globalxfunds.com/funds/guru", 'Website Name'] = 'www.globalxfunds.com/funds/guru/'
scores.loc[scores['Website Name']== "investorlantheus.com", 'Website Name'] = 'investor.lantheus.com'
scores = scores[scores['Website Name'] != "AVERAGES"]
scores = scores[scores['Website Name'] != "www.google.com"]

#Missing "www" replacement
df2 = pd.merge(df, scores, how='right', left_on='domain', right_on='Website Name')
wrong_domains = df2[df2['domain'].isnull()]['Website Name']
corrected_domains = [new_domain.replace("www.", "") for new_domain in wrong_domains]
df['domain'] = df['domain'].replace(corrected_domains, wrong_domains)

# Merge in policy scores
df3 = pd.merge(df, scores, how='inner', left_on='domain', right_on='Website Name')

keep = ['domain', 'policy_clean', 'policy_str', 'Trash Policy', 'Paragraph Structure', 'Data Collection', 'Consent', 'Responsible Use', 'Third-Parties', 'User-Rights', 'Overall', 'Comment']
df4 = df3[keep]
df4.columns = ['domain', 'policy_clean', 'policy_str', 'trash', 'prgph_struct', 'data_collection', 'consent', 'responsible_use', 'third_parties', 'user_rights', 'overall', 'comment']
df5 = df4[df4['policy_str'].notnull()]
df5['trash'].value_counts()
df5.loc[df5['trash'] < 0, 'trash'] = 0

100%|██████████| 7269/7269 [00:00<00:00, 35732.16it/s]


In [2]:
# Trash prediction:
df5['trash'].value_counts()

0.0    427
1.0     72
Name: trash, dtype: int64

## Part 1: Trash predictions

In [3]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

stopwords = set(STOPWORDS)
stopwords.update(["will", "may"])

df5['policy_str'] = df5['policy_str'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word not in (stopwords)]))

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

def get_tokens(string):
    # remove punctuation
    newstr = re.sub(r'[^\w\s]','',string)
    # tokenize 
    tokens = word_tokenize(newstr)
    # remove stop words
    tokens = [word.lower() for word in tokens]
    return tokens

def get_lemmas(tokens):
    lemmas = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    return lemmas

new_list = []
for policy in df5['policy_str']:
    words = policy.split()
    new_str = ''
    for word in words:
        tokens = get_tokens(word)
        lemma = get_lemmas(tokens)
        new_str += lemma[0]
        new_str += ' '
    new_list.append(new_str)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

df5['policy_lem_str'] = new_list
X = df5['policy_lem_str']
y = df5['trash']
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix

def lr_cv(splits, X, Y, pipeline, average_method):
    # Adapted from: https://towardsdatascience.com/yet-another-twitter-sentiment-analysis-part-1-tackling-class-imbalance-4d7a7f717d44
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, Y):
        lr_fit = pipeline.fit(X[train], Y[train])
        prediction = lr_fit.predict(X[test])
        scores = lr_fit.score(X[test],Y[test])
        
        cnf_matrix = confusion_matrix(Y[test], prediction)
        np.set_printoptions(precision=2)
        print("confusion matrix:")
        print(cnf_matrix)
        
        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average=average_method)*100)
        print('              negative    neutral     positive')
        print('precision:',precision_score(Y[test], prediction, average=None))
        recall.append(recall_score(Y[test], prediction, average=average_method)*100)
        print('recall:   ',recall_score(Y[test], prediction, average=None))
        f1.append(f1_score(Y[test], prediction, average=average_method)*100)
        print('f1 score: ',f1_score(Y[test], prediction, average=None))
        print('-'*50)
        

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))
    print()
    
    return lr_fit, prediction, scores, Y[test]
    
lr = LogisticRegression()

original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', lr)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
X_tfidf = tfidf_vec.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=14)
lr.fit(X_train, y_train)
score = lr.score(X_test, y_test)
print("Accuracy:", score)

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),lr)

testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)
lr_fit, y_pred, scores, y_test = lr_cv(5, X, y, ROS_pipeline, 'macro')



Accuracy: 0.88




confusion matrix:
[[79  7]
 [ 3 12]]
              negative    neutral     positive
precision: [0.96 0.63]
recall:    [0.92 0.8 ]
f1 score:  [0.94 0.71]
--------------------------------------------------
confusion matrix:
[[72 14]
 [ 0 15]]
              negative    neutral     positive
precision: [1.   0.52]
recall:    [0.84 1.  ]
f1 score:  [0.91 0.68]
--------------------------------------------------
confusion matrix:
[[84  1]
 [ 1 13]]
              negative    neutral     positive
precision: [0.99 0.93]
recall:    [0.99 0.93]
f1 score:  [0.99 0.93]
--------------------------------------------------
confusion matrix:
[[79  6]
 [ 2 12]]
              negative    neutral     positive
precision: [0.98 0.67]
recall:    [0.93 0.86]
f1 score:  [0.95 0.75]
--------------------------------------------------
confusion matrix:
[[79  6]
 [ 5  9]]
              negative    neutral     positive
precision: [0.94 0.6 ]
recall:    [0.93 0.64]
f1 score:  [0.93 0.62]
-------------------------------

### Use trash prediction model to score all other policies in the original data

In [5]:
df.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain
2u.com,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com
accel.northshoregasdelivery.com,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com
achievelifesciences.com,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com
acsietf.com/etf-details/,,[],[],[],[],,acsietf.com/etf-details/
advisor.fidelity.com,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com


In [15]:
new_list = []
for policy in df['lemmas']:
    new_str = ''
    if policy != None:
        for para in policy:
            for lemma in para:
                new_str += lemma
                new_str += ' '
    new_list.append(new_str)

df['policy_lem_str'] = new_list

In [15]:
X = df['policy_lem_str']
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
# Use the built model to predict probabilities for new (unscored) policies
probabilities = lr_fit.predict_proba(X)
df['trash_predicted_probability'] = [x[1] for x in probabilities]

In [17]:
import pandas as pd
#os.chdir("/Users/michelle/Dropbox/Privacy/PolicyData/ClassificationManual")
scores = pd.read_excel('Policy-level Evaluation.xlsx')
#Manual typo corrections
scores.loc[scores['Website Name']== "www.reshaprelifesciences.com", 'Website Name'] = 'www.reshapelifesciences.com'
scores.loc[scores['Website Name']== "www.fundus.deutscheam.com", 'Website Name'] = 'fundsus.deutscheam.com'
scores.loc[scores['Website Name']== "www.ishares.com/us/products/280771", 'Website Name'] = 'www.ishares.com/us/products/280771/'
scores.loc[scores['Website Name']== "www.globalfunds.com/GXF", 'Website Name'] = 'www.globalxfunds.com/GXF'
scores.loc[scores['Website Name']== "www.globalxfunds.com/funds/guru", 'Website Name'] = 'www.globalxfunds.com/funds/guru/'
scores.loc[scores['Website Name']== "investorlantheus.com", 'Website Name'] = 'investor.lantheus.com'
scores = scores[scores['Website Name'] != "AVERAGES"]
scores = scores[scores['Website Name'] != "www.google.com"]

In [18]:
df_final = pd.merge(df, scores, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,,,,,,,,,,


In [20]:
#Export draft
import json
with open('policies_scored.json', 'w') as outfile:
    json.dump(df_final.to_json(), outfile)

## Part 2: Overall Score Prediction

In [33]:
# Subset down to non-trash policies
df5 = df5.loc[df5['trash'] == 0]
df5 = df5.loc[df5['overall'] != "/"]
df5['overall'].value_counts()

 0    187
-1    147
 1     92
Name: overall, dtype: int64

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['overall']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []

def lr_cv(splits, X_tfidf, Y, pipeline, average_method):
    for train, test in kfold.split(X_tfidf, Y):

        lr_fit = model.fit(X_tfidf[train], Y[train])
        prediction = lr_fit.predict(X_tfidf[test])
        scores = lr_fit.score(X_tfidf[test],Y[test])

        cnf_matrix = confusion_matrix(Y[test], prediction)
        np.set_printoptions(precision=2)
        print("confusion matrix:")
        print(cnf_matrix)

        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average='macro')*100)
        print('              negative    neutral     positive')
        print('precision:',precision_score(Y[test], prediction, average=None))
        recall.append(recall_score(Y[test], prediction, average='macro')*100)
        print('recall:   ',recall_score(Y[test], prediction, average=None))
        f1.append(f1_score(Y[test], prediction, average='macro')*100)
        print('f1 score: ',f1_score(Y[test], prediction, average=None))
        print('-'*50)


    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))
    print()

    return lr_fit, prediction, scores, Y[test]

In [63]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
#X_tfidf = tfidf_vec.fit_transform(X)
ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

#testing_tfidf = tfidf_vec.fit_transform(X)
#ros = RandomOverSampler(random_state=777)
#X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)
#lr_fit, y_pred, scores, y_test = lr_cv(5, X, y, ROS_pipeline, 'macro')

confusion matrix:
[[16 13  1]
 [ 5 32  1]
 [ 3 11  5]]
              negative    neutral     positive
precision: [0.67 0.57 0.71]
recall:    [0.53 0.84 0.26]
f1 score:  [0.59 0.68 0.38]
--------------------------------------------------
confusion matrix:
[[15 14  1]
 [ 6 30  2]
 [ 0 17  2]]
              negative    neutral     positive
precision: [0.71 0.49 0.4 ]
recall:    [0.5  0.79 0.11]
f1 score:  [0.59 0.61 0.17]
--------------------------------------------------
confusion matrix:
[[13 15  1]
 [ 8 27  2]
 [ 1 14  3]]
              negative    neutral     positive
precision: [0.59 0.48 0.5 ]
recall:    [0.45 0.73 0.17]
f1 score:  [0.51 0.58 0.25]
--------------------------------------------------
confusion matrix:
[[22  5  2]
 [ 3 28  6]
 [ 0 13  5]]
              negative    neutral     positive
precision: [0.88 0.61 0.38]
recall:    [0.76 0.76 0.28]
f1 score:  [0.81 0.67 0.32]
--------------------------------------------------
confusion matrix:
[[15 14  0]
 [ 6 31  0]
 [ 0 14  4

In [64]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))

In [65]:
X_tfidf[0]

<1x86989 sparse matrix of type '<class 'numpy.float64'>'
	with 6580 stored elements in Compressed Sparse Row format>

In [66]:
lr_fit.predict_proba(X_tfidf[:5])

array([[0.34, 0.5 , 0.16],
       [0.41, 0.34, 0.25],
       [0.53, 0.36, 0.11],
       [0.84, 0.15, 0.01],
       [0.37, 0.48, 0.15]])

In [68]:
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_overall_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_overall_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_overall_neutral_predicted_probability'] = [x[1] for x in probabilities]

In [69]:
#Get scores df back:
import pandas as pd
#os.chdir("/Users/michelle/Dropbox/Privacy/PolicyData/ClassificationManual")
scores = pd.read_excel('Policy-level Evaluation.xlsx')
#Manual typo corrections
scores.loc[scores['Website Name']== "www.reshaprelifesciences.com", 'Website Name'] = 'www.reshapelifesciences.com'
scores.loc[scores['Website Name']== "www.fundus.deutscheam.com", 'Website Name'] = 'fundsus.deutscheam.com'
scores.loc[scores['Website Name']== "www.ishares.com/us/products/280771", 'Website Name'] = 'www.ishares.com/us/products/280771/'
scores.loc[scores['Website Name']== "www.globalfunds.com/GXF", 'Website Name'] = 'www.globalxfunds.com/GXF'
scores.loc[scores['Website Name']== "www.globalxfunds.com/funds/guru", 'Website Name'] = 'www.globalxfunds.com/funds/guru/'
scores.loc[scores['Website Name']== "investorlantheus.com", 'Website Name'] = 'investor.lantheus.com'
scores = scores[scores['Website Name'] != "AVERAGES"]
scores = scores[scores['Website Name'] != "www.google.com"]
scores_df = scores

df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


## Part 3: 'prgph_struct', 'data_collection', 'consent', 'responsible_use', 'third_parties', 'user_rights'

#### Start with third parties

In [70]:
# Look at third parties scores
df5['third_parties'].value_counts()

 0    242
 1     94
-1     90
Name: third_parties, dtype: int64

In [71]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['third_parties']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []

In [72]:
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

confusion matrix:
[[ 5 13  0]
 [ 1 46  2]
 [ 0 19  0]]
              negative    neutral     positive
precision: [0.83 0.59 0.  ]
recall:    [0.28 0.94 0.  ]
f1 score:  [0.42 0.72 0.  ]
--------------------------------------------------
confusion matrix:
[[ 3 15  0]
 [ 1 47  1]
 [ 0 19  0]]
              negative    neutral     positive
precision: [0.75 0.58 0.  ]
recall:    [0.17 0.96 0.  ]
f1 score:  [0.27 0.72 0.  ]
--------------------------------------------------
confusion matrix:
[[ 7 11  0]
 [ 4 44  0]
 [ 0 19  0]]
              negative    neutral     positive
precision: [0.64 0.59 0.  ]
recall:    [0.39 0.92 0.  ]
f1 score:  [0.48 0.72 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 6 12  0]
 [ 1 47  0]
 [ 0 19  0]]
              negative    neutral     positive
precision: [0.86 0.6  0.  ]
recall:    [0.33 0.98 0.  ]
f1 score:  [0.48 0.75 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 7 11  0]
 [ 3 45  0]
 [ 0 18  0]]
              negative    neutral     positive
precision: [0.7  0.61 0.  ]
recall:    [0.39 0.94 0.  ]
f1 score:  [0.5  0.74 0.  ]
--------------------------------------------------
accuracy: 60.34% (+/- 1.58%)
precision: 45.01% (+/- 2.74%)
recall: 41.91% (+/- 2.54%)
f1 score: 38.70% (+/- 2.97%)



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [73]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_third_party_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_third_party_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_third_party_neutral_predicted_probability'] = [x[1] for x in probabilities]

In [74]:
df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


In [75]:
# Can't do much with this...
df5['prgph_struct'].value_counts()

1    425
0      1
Name: prgph_struct, dtype: int64

In [76]:
df5['data_collection'].value_counts()

 0    261
-1     91
 1     74
Name: data_collection, dtype: int64

In [77]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['data_collection']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

confusion matrix:
[[ 2 17  0]
 [ 0 52  1]
 [ 0 15  0]]
              negative    neutral     positive
precision: [1.   0.62 0.  ]
recall:    [0.11 0.98 0.  ]
f1 score:  [0.19 0.76 0.  ]
--------------------------------------------------
confusion matrix:
[[ 2 15  1]
 [ 5 47  0]
 [ 2 13  0]]
              negative    neutral     positive
precision: [0.22 0.63 0.  ]
recall:    [0.11 0.9  0.  ]
f1 score:  [0.15 0.74 0.  ]
--------------------------------------------------
confusion matrix:
[[ 2 16  0]
 [ 1 51  0]
 [ 0 15  0]]
              negative    neutral     positive
precision: [0.67 0.62 0.  ]
recall:    [0.11 0.98 0.  ]
f1 score:  [0.19 0.76 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 0 16  2]
 [ 1 50  1]
 [ 0 14  1]]
              negative    neutral     positive
precision: [0.   0.62 0.25]
recall:    [0.   0.96 0.07]
f1 score:  [0.   0.76 0.11]
--------------------------------------------------
confusion matrix:
[[ 0 17  1]
 [ 0 51  1]
 [ 0 14  0]]
              negative    neutral     positive
precision: [0.   0.62 0.  ]
recall:    [0.   0.98 0.  ]
f1 score:  [0.   0.76 0.  ]
--------------------------------------------------
accuracy: 60.56% (+/- 1.69%)
precision: 35.02% (+/- 11.88%)
recall: 34.68% (+/- 1.42%)
f1 score: 29.42% (+/- 2.33%)



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [78]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_data_collection_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_data_collection_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_data_collection_neutral_predicted_probability'] = [x[1] for x in probabilities]
df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


In [79]:
# Consent
df5['consent'].value_counts()

 0    341
-1     59
 1     26
Name: consent, dtype: int64

In [80]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['consent']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

confusion matrix:
[[ 0 12  0]
 [ 0 69  0]
 [ 0  6  0]]
              negative    neutral     positive
precision: [0.   0.79 0.  ]
recall:    [0. 1. 0.]
f1 score:  [0.   0.88 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 0 12  0]
 [ 0 68  0]
 [ 0  5  0]]
              negative    neutral     positive
precision: [0.  0.8 0. ]
recall:    [0. 1. 0.]
f1 score:  [0.   0.89 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 0 12  0]
 [ 0 68  0]
 [ 0  5  0]]
              negative    neutral     positive
precision: [0.  0.8 0. ]
recall:    [0. 1. 0.]
f1 score:  [0.   0.89 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 1 11  0]
 [ 0 68  0]
 [ 0  5  0]]
              negative    neutral     positive
precision: [1.   0.81 0.  ]
recall:    [0.08 1.   0.  ]
f1 score:  [0.15 0.89 0.  ]
--------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


confusion matrix:
[[ 0 11  0]
 [ 0 68  0]
 [ 0  5  0]]
              negative    neutral     positive
precision: [0.   0.81 0.  ]
recall:    [0. 1. 0.]
f1 score:  [0.   0.89 0.  ]
--------------------------------------------------
accuracy: 80.29% (+/- 0.69%)
precision: 33.41% (+/- 13.45%)
recall: 33.89% (+/- 1.11%)
f1 score: 30.70% (+/- 2.13%)



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [81]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_consent_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_consent_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_consent_neutral_predicted_probability'] = [x[1] for x in probabilities]
df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


In [82]:
df5['responsible_use'].value_counts()

 0    236
 1     99
-1     91
Name: responsible_use, dtype: int64

In [83]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['responsible_use']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

confusion matrix:
[[ 3 15  1]
 [ 0 47  1]
 [ 0 19  1]]
              negative    neutral     positive
precision: [1.   0.58 0.33]
recall:    [0.16 0.98 0.05]
f1 score:  [0.27 0.73 0.09]
--------------------------------------------------
confusion matrix:
[[ 2 16  0]
 [ 1 45  1]
 [ 0 20  0]]
              negative    neutral     positive
precision: [0.67 0.56 0.  ]
recall:    [0.11 0.96 0.  ]
f1 score:  [0.19 0.7  0.  ]
--------------------------------------------------
confusion matrix:
[[ 1 17  0]
 [ 1 46  0]
 [ 0 18  2]]
              negative    neutral     positive
precision: [0.5  0.57 1.  ]
recall:    [0.06 0.98 0.1 ]
f1 score:  [0.1  0.72 0.18]
--------------------------------------------------
confusion matrix:
[[ 2 15  1]
 [ 0 44  3]
 [ 0 19  1]]
              negative    neutral     positive
precision: [1.   0.56 0.2 ]
recall:    [0.11 0.94 0.05]
f1 score:  [0.2  0.7  0.08]
--------------------------------------------------
confusion matrix:
[[ 0 18  0]
 [ 0 47  0]
 [ 0 19  0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [84]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_responsible_use_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_responsible_use_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_responsible_use_neutral_predicted_probability'] = [x[1] for x in probabilities]
df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


In [85]:
df5['user_rights'].value_counts()

-1    214
 1    111
 0    101
Name: user_rights, dtype: int64

In [86]:
model = RandomForestClassifier(n_estimators=100)
original_pipeline = Pipeline([
    ('vectorizer', tfidf_vec),
    ('classifier', model)
])

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

X = df5['policy_lem_str'].fillna(' ')
y = df5['user_rights']
y = [int(val) for val in y]
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), analyzer='char')
X_tfidf = tfidf_vec.fit_transform(X.values.astype('str'))

ROS_pipeline = make_pipeline(tfidf_vec, RandomOverSampler(random_state=777),model)
testing_tfidf = tfidf_vec.fit_transform(X)
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X_tfidf, y)

splits = 5 
Y = pd.Series(y)

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []
lr_fit, y_pred, scores, y_test = lr_cv(5, X_tfidf, Y, ROS_pipeline, 'macro')

confusion matrix:
[[39  1  3]
 [11  3  7]
 [ 9  0 14]]
              negative    neutral     positive
precision: [0.66 0.75 0.58]
recall:    [0.91 0.14 0.61]
f1 score:  [0.76 0.24 0.6 ]
--------------------------------------------------
confusion matrix:
[[41  1  1]
 [13  0  7]
 [ 8  0 14]]
              negative    neutral     positive
precision: [0.66 0.   0.64]
recall:    [0.95 0.   0.64]
f1 score:  [0.78 0.   0.64]
--------------------------------------------------
confusion matrix:
[[41  0  2]
 [14  1  5]
 [ 3  2 17]]
              negative    neutral     positive
precision: [0.71 0.33 0.71]
recall:    [0.95 0.05 0.77]
f1 score:  [0.81 0.09 0.74]
--------------------------------------------------
confusion matrix:
[[40  1  2]
 [13  1  6]
 [ 3  2 17]]
              negative    neutral     positive
precision: [0.71 0.25 0.68]
recall:    [0.93 0.05 0.77]
f1 score:  [0.81 0.08 0.72]
--------------------------------------------------
confusion matrix:
[[40  1  1]
 [14  0  6]
 [10  0 12

In [87]:
X = df['policy_lem_str']
X_tfidf = tfidf_vec.transform(X.values.astype('str'))
# Use the built model to predict probabilities for new (unscored) policies
#Here, 0 is -1, 1 is 0, and 2 is 1 (can't figure out how to change the labels)
probabilities = lr_fit.predict_proba(X_tfidf)
df['score_user_rights_good_predicted_probability'] = [x[2] for x in probabilities]
df['score_user_rights_bad_predicted_probability'] = [x[0] for x in probabilities]
df['score_user_rights_neutral_predicted_probability'] = [x[1] for x in probabilities]
df_final = pd.merge(df, scores_df, how='left', left_on='domain', right_on='Website Name')
df_final.head()

Unnamed: 0,access,policy,policy_clean,tokens,lemmas,policy_str,domain,policy_lem_str,trash_predicted_probability,score_overall_good_predicted_probability,...,Website Name,Trash Policy,Paragraph Structure,Data Collection,Consent,Responsible Use,Third-Parties,User-Rights,Overall,Comment
0,Google,[This privacy policy (the “Privacy Policy”) de...,[This privacy policy the Privacy Policy descri...,"[[privacy, policy, privacy, policy, describes,...","[[privacy, policy, privacy, policy, describes,...",This privacy policy the Privacy Policy describ...,2u.com,privacy policy privacy policy describes inc u ...,0.313021,0.16,...,,,,,,,,,,
1,Google,"[Please note: By using this website, you will ...",[Please note By using this website you will be...,"[[please, note, using, website, accepting, ter...","[[please, note, using, website, accepting, ter...",Please note By using this website you will be ...,accel.northshoregasdelivery.com,please note using website accepting term use a...,0.615135,0.25,...,accel.northshoregasdelivery.com,1.0,/,/,/,/,/,/,/,
2,Google,"[At Achieve Life Sciences, Inc Inc. (“Achieve“...",[At Achieve Inc Achieve we are firmly committe...,"[[achieve, inc, achieve, firmly, committed, pr...","[[achieve, inc, achieve, firmly, committed, pr...",At Achieve Inc Achieve we are firmly committed...,achievelifesciences.com,achieve inc achieve firmly committed protectin...,0.42026,0.11,...,,,,,,,,,,
3,,[],[],[],[],,acsietf.com/etf-details/,,0.790588,0.01,...,,,,,,,,,,
4,Google,[Fidelity Investments and the Fidelity Funds a...,[Investments and the Funds are committed to ma...,"[[investments, funds, committed, maintaining, ...","[[investment, fund, committed, maintaining, co...",Investments and the Funds are committed to mai...,advisor.fidelity.com,investment fund committed maintaining confiden...,0.357318,0.15,...,,,,,,,,,,


In [88]:
#Export draft
import json
with open('policies_scored.json', 'w') as outfile:
    json.dump(df_final.to_json(), outfile)