<a href="https://colab.research.google.com/github/satisdrive/privacy_grade_predictor/blob/main/tosdr_privacy_grade.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install ktrain
!pip3 install ktrain
import ktrain
from ktrain import text

In [20]:
# fetch training and validation datasets
import pandas as pd

# Both splits are stored side by side in the project's raw-GitHub data folder.
DATA_ROOT = 'https://raw.githubusercontent.com/satisdrive/privacy_grade_predictor/main/data'
train_url = DATA_ROOT + '/train.csv'
val_url = DATA_ROOT + '/val.csv'

# Read each CSV directly from its URL into its own DataFrame.
train_df, val_df = (pd.read_csv(csv_url) for csv_url in (train_url, val_url))

In [None]:
# preprocess data, define model, and train classification model
import numpy as np

# Pretrained checkpoint that is fine-tuned as the classifier backbone.
MODEL_NAME = 'distilbert-base-uncased'

# Preprocessor for the checkpoint above; inputs are capped at maxlen=500 and
# the four labels are passed explicitly as class_names.
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=['bad','blocker','good','neutral'])

# Training split: the 'text' column is the input, 'point' is the label.
trn = t.preprocess_train(np.array(train_df['text']), np.array(train_df['point']))
# The validation split must go through preprocess_test (not preprocess_train)
# so it is transformed with the preprocessing set up on the training split
# instead of being treated as a second training set.
val = t.preprocess_test(np.array(val_df['text']), np.array(val_df['point']))

# Build the classifier and wrap it with both splits for training.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# One-cycle schedule: learning rate 5e-5 for 5 epochs.
learner.fit_onecycle(5e-5, 5)

In [None]:
# Evaluate on the validation data held by the learner, labelling the results
# with the class names known to the preprocessor.
class_labels = t.get_classes()
learner.validate(class_names=class_labels)

# Pair the trained model with its preprocessor so raw text can be scored.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, t)

In [27]:
# fetch data to score for privacy grade
import base64
import requests

def cleanse(line):
    """Normalise a piece of text before it is scored.

    The value is converted with ``str()`` and lower-cased, each carriage
    return / newline becomes a space, every character other than ASCII
    letters, digits, space, '.' and '-' is removed, and runs of spaces are
    collapsed to a single space.

    Args:
        line: Any value; it is stringified before processing.

    Returns:
        The cleaned string (leading/trailing single spaces are kept).
    """
    import re

    lowered = str(line).lower()
    # Line breaks become spaces so adjacent words are not glued together.
    without_breaks = re.sub("[\r\n]", ' ', lowered)
    # Anything outside the allowed character set is deleted outright.
    allowed_only = re.sub("[^a-zA-Z0-9 .-]", '', without_breaks)
    # Squeeze the repeated spaces the two steps above may have produced.
    return re.sub(" +", " ", allowed_only)

# set the url below as needed
tos_url = 'https://raw.githubusercontent.com/github/site-policy/main/Policies/github-privacy-statement.md'
# Bound the wait so an unresponsive server cannot hang the notebook.
response = requests.get(tos_url, timeout=30)
# Stop on HTTP errors (e.g. 404) instead of grading the error page's text.
response.raise_for_status()
# `req` holds the cleansed document text that the grading cell splits on '.'.
req = cleanse(response.text)

In [None]:
# determine privacy grade
# Points contributed by each predicted label; labels not listed here
# (i.e. 'blocker') contribute 0 to the average.
# todo: the weights for each category should be learnt from the training examples
grade_weights = {'bad': 1, 'neutral': 2, 'good': 3}

# Split the cleansed text on '.' and drop blank fragments (the piece after the
# final '.' is always empty) so they are neither classified nor counted in the
# denominator of the average.
lines = [fragment for fragment in req.split('.') if fragment.strip()]  # todo: filter on length.
if not lines:
    raise ValueError('no sentences found in the fetched document to grade')

# Tally how many sentences were assigned to each label.
grade_count = {}
for l in lines:
    grade = predictor.predict(l)
    grade_count[grade] = grade_count.get(grade, 0) + 1

# Weighted mean score per sentence. `.get(label, 0)` avoids the KeyError that
# occurred whenever one of the labels was never predicted.
avg_grade = sum(
    weight * grade_count.get(label, 0) for label, weight in grade_weights.items()
) / len(lines)

# Map the average onto a letter: the first upper bound the score falls below
# decides the grade; anything at or above 2.4 is an 'A'.
grade_letter = 'A'
for upper_bound, letter in ((0.6, 'E'), (1.2, 'D'), (1.8, 'C'), (2.4, 'B')):
    if avg_grade < upper_bound:
        grade_letter = letter
        break
print('privacy_grade: {0}'.format(grade_letter))
