## Terms of Service / Privacy Policy Classification Tool

Instructions...

In [54]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import requests
from readability import Document
import re

In [55]:
# Topic-labelled quotes; the later cells read the 'quoteText' and 'topics'
# columns from this frame.
data = pd.read_csv('topics', header='infer')
# Rating-labelled quotes. This must live under its own name: assigning it to
# `data` as well would overwrite the topics frame and leave `ratingdf`
# (used when building X_class / y_class) undefined.
# NOTE(review): the later cells expect ratingdf to carry a 'lemmatized'
# column - confirm the 'classes' file provides it or lemmatise it as well.
ratingdf = pd.read_csv('classes', header='infer')

In [56]:
# Shared text-normalisation helpers: split on runs of word characters, then
# reduce each token to its WordNet lemma.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()


def lemmatize_block(cell):
    """Lower-case and lemmatise every token in *cell*, joined by single spaces.

    *cell* is an iterable of token strings (one tokenised quote).
    """
    return " ".join(lemmatizer.lemmatize(token.lower()) for token in cell)


data['tokens'] = data['quoteText'].apply(tokenizer.tokenize)
data['lemmatized'] = data['tokens'].apply(lemmatize_block)

In [57]:
# Feature/label pairs: the topic model reads from `data`, the rating model
# from `ratingdf`.
X, y = data[['lemmatized']], data['topics']
X_class, y_class = ratingdf[['lemmatized']], ratingdf['point_other']

# Each task gets its own TF-IDF vocabulary, learned from its own corpus with
# English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X['lemmatized'])

tfidf_class = TfidfVectorizer(stop_words='english')
X_tfidf_class = tfidf_class.fit_transform(X_class['lemmatized'])

In [61]:
# Rating classifier fitted on the TF-IDF features of the rating corpus.
# The obsolete `n_iter=None` keyword is intentionally not passed: it was only
# ever a deprecated alias left at its default, and scikit-learn releases that
# removed it reject the call with a TypeError.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' -
# confirm against the installed version before upgrading.
sgdc_binary = SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
       n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)
sgdc_binary.fit(X_tfidf_class, y_class)

# Multinomial topic classifier fitted on the TF-IDF features of the topic
# corpus; class weights are balanced and the seed is fixed.
lr_tfidf = LogisticRegression(C=100.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial',
                          random_state=7)
lr_tfidf.fit(X_tfidf, y)

LogisticRegression(C=100.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=7, solver='newton-cg', tol=0.0001, verbose=0,
          warm_start=False)

In [62]:
# Maps each predicted topic label to the plain-language caution that
# key_terms prints underneath the topic heading.
dd = {
    'Waiving your right': 'You might be giving up some of your consumer or legal rights',
    'Business Transfers': 'Your data might be a transferable business asset',
    'Changes to Terms': 'You may not be notified in the event of changes, or those changes might be unfavourable',
    'Anonymity and Tracking': 'The service may be tracking you in unexpected ways',
    'Content': 'If the service allows you to upload content, you may wish to check these terms to see how it may be used.',
    'Cookies': 'Cookies are files stored locally in your web browser containing identifiable information. This service may use them in ways you might not like.',
    'Governance': 'Your relationship with the service and the community: this is a broad spectrum but there may be concerning terms.',
    'Guarantee': 'Your guarantees may be limited.',
    'Jurisdiction and governing laws': 'Possible concern over the governing law for this service.',
    'Law and government requests': 'This service may not behave favourably or with transparency towards you, if government requests are received.',
    'Logs': 'The service may keep a significant record of your activity, or be less-than-transparent about it.',
    'Ownership': 'The content and the data you generate on services online is usually subject to copyright law: you may wish to check the service has the same view',
    'Personal Data': 'Your personal data may be used in unexpected ways and you may not have control of it once given.',
    'Right to leave the service': 'It may be difficult to leave this service or delete your data.',
    'Scope of the copyright licence': 'The copyright license may be so broad that your content can be exploited by others without asking you',
    'Suspension and Censorship': 'The service may be suspended or your lawful content removed without warning or recourse.',
    'User information': 'Terms that may contain noteworthy information',
    'Third Parties': 'Third parties may be involved in operating the service or otherwise be in receipt of your data',
    'User choice': 'These terms may give you some choice, or an illusion of choice',
}

In [63]:
# Matches an angle-bracketed tag with at least two characters between the
# brackets, so one-letter tags such as <p> are left in place.
TAG_RE = re.compile(r'<[^>][^>]+>')


def remove_tags(text):
    """Return *text* with every TAG_RE match replaced by a single space."""
    stripped = re.sub(TAG_RE, ' ', text)
    return stripped

def key_terms(url, timeout=30):
    """Fetch a terms/privacy page, classify its paragraphs and print a report.

    The page's main content is extracted with readability and split into
    paragraphs. Each paragraph is scored by the fitted topic model
    (``tfidf`` + ``lr_tfidf``) and the fitted rating model (``tfidf_class`` +
    ``sgdc_binary``). Paragraphs whose ``warning_pp`` probability exceeds 0.5
    are printed grouped by predicted topic; at most ten are shown and any
    remainder is reported as a count.

    Args:
        url: Address of the page to analyse.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. The report is written to standard output.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout an unresponsive server would block the call forever.
    response = requests.get(url, timeout=timeout)
    doc = Document(response.text)
    full_text = doc.summary(html_partial=True)

    # Flatten real newline and tab characters (non-raw escapes: a raw r"\n"
    # would only match a literal backslash followed by "n"). "/" is blanked
    # too, which turns "</p>" into "< p>": TAG_RE then strips it, while the
    # bare "<p>" survives to serve as the paragraph delimiter below.
    for char in ("\n", "\t", "/"):
        full_text = full_text.replace(char, " ")
    full_text = remove_tags(full_text)

    # Fragments of 50 characters or fewer are not classified.
    paragraphs = [chunk for chunk in full_text.split('<p>') if len(chunk) > 50]

    print("Thanks for using the Terms Classifier. The title of the analysed extract is:")
    print('"'+doc.title()+'"')
    print()

    if not paragraphs:
        # Nothing to score; the models cannot predict on an empty batch.
        print('No concerning terms were found in this extract!')
        return

    term_frame = pd.DataFrame(paragraphs, columns=['quoteText'])
    term_frame['tokens'] = term_frame['quoteText'].apply(tokenizer.tokenize)
    term_frame['lemmatized'] = term_frame['tokens'].apply(lemmatize_block)

    # Topic prediction uses the topic vocabulary; the rating probabilities
    # use the separate rating vocabulary.
    X_world_tfidf = tfidf.transform(term_frame['lemmatized'])
    world_topic = pd.DataFrame(lr_tfidf.predict(X_world_tfidf), columns=['pred_topic'])

    X_world_tfidf_class = tfidf_class.transform(term_frame['lemmatized'])
    # NOTE(review): the column names assume sgdc_binary.classes_ is ordered
    # (warning, neutral) - confirm against the training labels.
    Y_world_pp = pd.DataFrame(sgdc_binary.predict_proba(X_world_tfidf_class),
                              columns=['warning_pp', 'neutral_pp'])

    # All three frames share the same default row index, so an index merge
    # lines each paragraph up with its predictions.
    scrape_results = pd.merge(term_frame, world_topic, left_index=True, right_index=True)
    scrape_res_final = pd.merge(scrape_results, Y_world_pp, left_index=True, right_index=True)

    flagged = scrape_res_final[scrape_res_final['warning_pp'] > 0.5].sort_values('neutral_pp')
    noct = len(flagged)
    res = flagged.head(10)

    # Unique topics in order of first appearance among the shown paragraphs.
    topics_in_this_extract = list(dict.fromkeys(res['pred_topic']))
    if not topics_in_this_extract:
        print('No concerning terms were found in this extract!')
        return

    print("The terms have been analysed! You might want to pay attention to the following:")
    print()
    for topic_name in topics_in_this_extract:
        print(topic_name)
        # A predicted label without a description must not abort the report.
        print(dd.get(topic_name, 'No description is available for this topic.'))
        quotes = res.loc[res['pred_topic'] == topic_name, 'quoteText']
        for count, quote in enumerate(quotes, start=1):
            print()
            print(count, '. ', quote)
            print()
        print('---')
        print()
    if noct > 10:
        print('There were an additional', noct-10, 'concerning terms found in addition to those above.')

# Start Here to try the Classifier

Choose a service from the list below, or use your own URL.
Either **run** the last cell and paste the URL at the prompt, or call *key_terms* directly with one of the variables below, e.g. `key_terms(pinterest)`.

In [64]:
# Ready-made example pages that can be passed straight to key_terms.
forghetti = 'https://www.forghetti.com/eng/terms-of-service'
neko_atsume = 'http://nekoatsume.com/en/kiyaku.html'
stardew_valley = 'https://www.stardewvalley.net/terms/'
pinterest = 'https://policy.pinterest.com/en-gb/terms-of-service'
instagram = 'https://help.instagram.com/478745558852511/'

In [68]:
# Ask for the page to analyse, then print the classifier's report for it.
url = input('Enter a URL here: ')
print()  # blank line between the prompt and the report
key_terms(url)

Enter a URL here: https://www.amazon.co.uk/gp/feature.html?docId=1000700003

Thanks for using the Terms Classifier. The title of the analysed extract is:
"Amazon.co.uk: Terms and Conditions"

The terms have been analysed! You might want to pay attention to the following:

Anonymity and Tracking
The service may be tracking you in unexpected ways

1 .  4. The promotional credit cannot to be used in conjunction with any other offer. Promotion credits may not be used on any existing orders.
 


2 .  5. If you cancel your order for a Qualifying Item, Offer ceases to apply and you will not receive your promotional credit. If you return a Qualifying Item, Amazon.co.uk reserves the right to withdraw your promotional credit and or charge you (using the payment method you used for the original order) for the value of the promotional credit or part thereof. 
 


3 .  3. Amazon.co.uk will use all reasonable endeavours to give you your promotional credit by the time stated in the promotion.  Howeve