In [None]:
!pip install boto3
import argparse
import io
import os
import random
import re
import string
import time
from datetime import datetime

import boto3
import pandas as pd
from tqdm import tqdm

Collecting boto3
  Downloading boto3-1.20.49-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 4.3 MB/s 
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.1-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.5 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting botocore<1.24.0,>=1.23.49
  Downloading botocore-1.23.49-py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 35.3 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.8-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 60.8 MB/s 
Installing collected packages: urllib3, jmespath, botocore, s3transfer, boto3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
[31mERROR: pip's dependency resolver does not currently take into a

# Creating a batch job

1. Sign up to STRM Privacy
2. Create a schema and corresponding contract or use a publicly available one
3. Create a Sink/Data Connector (we use an AWS S3 bucket)
4. Generate (or use real) data
5. Send data to input sink/data connector
6. Run batch job and save to output sink/data connector (possibly same sink as input)
7. Fetch data from sink/data connector and inspect

## 1. Sign up to STRM Privacy
See the [authentication docs](https://docs.strmprivacy.io/docs/latest/quickstart/authentication-cli.html) to sign up to STRM Privacy.

## 2. Create Schema and Contract
See [schemas and contract docs](https://docs.strmprivacy.io/docs/latest/concepts/schemas-and-contracts.html).

## 3. Create Sink/Data Connector

In this tutorial we use an AWS S3 sink. Either use the [console](https://console.strmprivacy.io/sinks) to create a sink or follow the steps in the quickstart [docs](https://docs.strmprivacy.io/docs/latest/quickstart/receiving-s3.html).

The `s3.json` should follow this structure:

```json
// s3.json
{ 
    "AccessKey": {
        "UserName": "your-username",
        "AccessKeyId": "***",
        "Status": "Active",
        "SecretAccessKey": "***",
        "CreateDate": "***"
    }
}
```

## 4. Generate data

Let's generate some random user data.

In [None]:
class DataGenerator:
    """Generate random demo user events and store them in a CSV file.

    Each generated row holds a session id, a fixed user name, a timestamp,
    a random e-mail address, two "public" fields, two "private" fields and
    a privacy plane (0, 1 or 2).

    Args:
        nrows: Number of rows to generate. It also determines the session
            length: every session lasts between 1% and 10% of ``nrows`` rows.
    """

    def __init__(self, nrows):
        self.nrows = nrows
        self.user_name = "strm_demo_user"
        self.session_id = 0
        self.reset_counter_and_session_size()

    def get_random_value(self, value_type, field_name, iter):
        """Return one random value for a column.

        Args:
            value_type: One of "STRING", "INT", "FLOAT", "USER_NAME",
                "SESSION_ID", "TIMESTAMP", "EMAIL" or "PLANE".
            field_name: Column name; the part before the first space is used
                as prefix for "STRING" values.
            iter: Zero-based index of the row being generated; only used to
                position "TIMESTAMP" values in time.

        Returns:
            A string for every value type except "PLANE", which yields an
            int. ``None`` is returned for an unknown ``value_type``.
        """
        if value_type == "STRING":
            prefix = field_name.split(' ')[0]
            suffix = ''.join(random.choice(string.ascii_letters) for _ in range(8))
            return f"{prefix}_{suffix}"
        elif value_type == "INT":
            # randint needs integer bounds: floats raise TypeError on Python >= 3.12.
            return f"{random.randint(0, 10 ** 15)}"
        elif value_type == "FLOAT":
            return f"{random.random()}"
        elif value_type == "USER_NAME":
            return self.user_name
        elif value_type == "SESSION_ID":
            # Start a new session once the current one has reached its size.
            self.counter += 1
            if self.counter > self.max_session:
                self.session_id += 1
                self.reset_counter_and_session_size()
            return f'session_{self.session_id}'
        elif value_type == "TIMESTAMP":
            # Spread the rows over the last 50 hours (row 0 is the oldest)
            # and add up to 30 seconds of jitter in either direction.
            stamp = time.time() - (1 - iter / self.nrows) * (3600 * 50) + 60 * (random.random() - 0.5)
            return datetime.fromtimestamp(stamp).astimezone().strftime('%Y-%m-%d %H.%M.%S.%f:%z')
        elif value_type == "EMAIL":
            name = self.rstring(8)
            host = self.rstring(6)
            return f'{name}@{host}.com'
        elif value_type == "PLANE":
            return random.choice([0, 1, 2])

    def reset_counter_and_session_size(self):
        """Reset the per-session row counter and draw a new session size.

        The size is drawn uniformly between 1% and 10% of ``nrows``. The
        bounds are truncated to integers because ``random.randint`` rejects
        non-integer arguments.
        """
        self.counter = 0
        self.max_session = random.randint(int(0.01 * self.nrows), int(0.1 * self.nrows))

    def rstring(self, n):
        """Return a random string of ``n`` lowercase ASCII letters."""
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))

    def generate(self, output_path='./databert-demo.csv'):
        """Generate ``nrows`` rows, write them to CSV and return them.

        Args:
            output_path: Destination of the CSV file. Defaults to the path
                used throughout this notebook.

        Returns:
            The generated rows sorted by ``Timestamp``, as read back from the
            CSV file (so the dtypes are those inferred by ``pd.read_csv``).
        """
        col_names = {
            "SessionId"     : "SESSION_ID",
            "UserName"      : "USER_NAME",
            "Timestamp"     : "TIMESTAMP",
            "Email"         : "EMAIL",
            "PublicFieldA"  : "STRING",
            "PublicFieldB"  : "FLOAT",
            "PrivateFieldA" : "STRING",
            "PrivateFieldB" : "INT",
            "PrivacyPlane"  : "PLANE"
        }

        # Collect all rows in memory and write the file once instead of
        # re-opening and appending to the CSV for every single row.
        records = [
            {name: self.get_random_value(kind, name, i) for name, kind in col_names.items()}
            for i in tqdm(range(int(self.nrows)))
        ]
        pd.DataFrame(records, columns=list(col_names)).to_csv(output_path, index=False)

        # Round-trip through CSV so the returned frame has the same dtypes a
        # later reader of the file would see, then store it in time order.
        df = pd.read_csv(output_path).sort_values(by=['Timestamp'])
        df.to_csv(output_path, header=True, index=False)
        return df

In [None]:
# Generate 20000 rows of demo data (this also writes the CSV file) and preview them.
user = DataGenerator(nrows=20000)
df = user.generate()
df.head()

100%|██████████| 20000/20000 [00:35<00:00, 564.08it/s]


Unnamed: 0,SessionId,UserName,Timestamp,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane
0,session_0,strm_user,2022-02-05 11.15.31.558734:+0000,zpledvhl@lwegsu.com,PublicFieldA_eNQyFxsR,0.912026,PrivateFieldA_MLeQwkKn,252705935497292,0
1,session_0,strm_user,2022-02-05 11.15.59.413267:+0000,eenwzpne@wpirvx.com,PublicFieldA_NiJfQrrL,0.296729,PrivateFieldA_zVOhKepR,340516559966918,2
3,session_0,strm_user,2022-02-05 11.16.20.393886:+0000,mckbqbil@olnqrq.com,PublicFieldA_aphRAovH,0.600066,PrivateFieldA_GuQubSaq,945649777916263,2
5,session_0,strm_user,2022-02-05 11.16.25.502314:+0000,wdtqavos@zczadv.com,PublicFieldA_jYMtbLFH,0.015906,PrivateFieldA_waWWEjiZ,72009725662692,1
6,session_0,strm_user,2022-02-05 11.16.26.260598:+0000,xmnzirth@fuvokk.com,PublicFieldA_EAYzNdCB,0.04761,PrivateFieldA_wuKLZNrQ,848730620400744,1


## 5. Send data to input sink

Send the data to the S3 bucket.

In [None]:
class AwsProperties(object):
    """Connection settings for the S3 bucket used in this tutorial."""

    # Credentials are taken from the standard AWS environment variables so
    # that real keys never need to be typed into (and saved with) the
    # notebook. The '***' placeholders are kept as fallback values.
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', '***')
    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '***')
    # Region of the bucket and the bucket name.
    region = 'eu-central-1'
    bucket = 'databert'

In [None]:
# Create the S3 resource handle from the connection settings.
AWS = AwsProperties()
s3 = boto3.resource(
    's3',
    region_name=AWS.region,
    aws_access_key_id=AWS.aws_access_key_id,
    aws_secret_access_key=AWS.aws_secret_access_key,
)

In [None]:
# Upload the generated CSV to the configured bucket. The context manager
# closes the local file handle once the upload has finished.
with open('./databert-demo.csv', 'rb') as csv_file:
    resp = s3.Object(AWS.bucket, 'databert-demo.csv').put(Body=csv_file)

## 6. Run batch job
We'll be running a batch job from the CLI. We need to pass the configuration of the batch job as an argument. An example configuration can be found below.


```json
// batch-job.json
{
    "ref": {
      "billing_id": "your_billing_id"
    },
    "source_data": {
      "data_connector_ref": {
        "billing_id": "your_billing_id",
        "name": "databert-demo"
      },
      "file_name": "databert-demo.csv",
      "data_type": {
        "csv": {
          "charset": "UTF-8"
        }
      }
    },
    "consent": {
      "default_consent_levels": [
        0
      ],
      "consent_level_extractor": {
        "field": "PrivacyPlane",
        "field_patterns": {
          "1": {
            "consent_levels": [
              1
            ]
          },
          "2": {
            "consent_levels": [
              2
            ]
          }
        }
      }
    },
    "encryption": {
      "timestamp_config": {
        "field": "Timestamp",
        "format": "yyyy-MM-dd HH.mm.ss.nnnnnn:Z",
        "default_time_zone": {
          "id": "UTC"
        }
      },
      "batch_job_group_id": null
    },
    "event_contract_ref": {
      "handle": "databert-handle",
      "name": "batch_job_public",
      "version": "1.0.1"
    },
    "encrypted_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encrypted.csv"
      }
    },
    "encryption_keys_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encryption-keys.csv"
      }
    },
    "derived_data": [
      {
        "target": {
          "data_connector_ref": {
            "billing_id": "your_billing_id",
            "name": "databert-demo"
          },
          "data_type": {
            "csv": {
              "charset": "UTF-8"
            }
          },
          "file_name": "databert-demo-derived.csv"
        },
        "consent_levels": [
          2
        ],
        "consent_level_type": "CUMULATIVE",
        "masked_fields": {
          "field_patterns": {
            "databert-handle/batch_job_public/1.0.1": {
              "field_patterns": [
                "Email",
                "UserName"
              ]
            }
          }
        }
      }
    ]
  }
```

Now create the batch job: `strm create batch-job -F batch-job.json`

Wait for the job to finish. You can check its status with `strm list batch-jobs`.


## 7. Fetch and explore Data

Now we fetch the encrypted and derived data from the sink/data connector.
Based on our data contract, we expect the PII fields `Email`, `PrivateFieldA` and `PrivateFieldB` to be encrypted in the encrypted file.
We also expect the fields `Email` and `UserName` to be masked with a hash in the derived file. Let's investigate:

In [None]:
# Get objects from bucket
def _fetch_csv(key):
    """Download the object `key` from the configured bucket and parse it as CSV.

    Returns:
        A tuple of the raw `get()` response and the parsed DataFrame.
    """
    response = s3.Object(AWS.bucket, key).get()
    return response, pd.read_csv(io.BytesIO(response['Body'].read()))


# Fetch the three files written by the batch job.
encrypted, df_encrypted = _fetch_csv('databert-demo-encrypted.csv')
encryption_keys, df_encryption_keys = _fetch_csv('databert-demo-encryption-keys.csv')
derived, df_derived = _fetch_csv('databert-demo-derived.csv')

In [None]:
# Preview the first rows of the encrypted output file.
df_encrypted.head()

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_0,tafelpoot,ASlvIx099aQSAovxPy3MOqMrwl7bxISI+EKZR2Z4PuSm5d...,PublicFieldA_lDOinTVf,0.836677,ASlvIx1DhSXJwyxnSBtXys+1FQ+zcsuUB0QD6Zk+sB8+HI...,ASlvIx3Fl39IcfWFvIFudL3ierWQ4l7QCmCyS+0WE2qb8UYa,1.0,databert-handle/batch_job/1.0.0,0,1644048666000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,1
1,session_0,tafelpoot,ASlvIx2Eb75xebbtB93T01eK+1EiBqu2ES+lYHP6d6EenA...,PublicFieldA_PMiMepXT,0.75986,ASlvIx1jPLTPemIpaF4+IWLGrKIjtYP/kW0UrdQqv28Asz...,ASlvIx2fx+OT0LlphYrsos0aa8KdhWgEvWj93Btz9hCTHdJ1,12.0,databert-handle/batch_job/1.0.0,0,1644049600000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
2,session_0,tafelpoot,ASlvIx3FKjSgPkZ9g/UfgtLaP6BDCh7WX6evHE/J+LVKzM...,PublicFieldA_bJmDkfba,0.358066,ASlvIx3s18h5mEMpK2kdy6mvv2iQ/f9/sEnHBtAm5oxkgv...,ASlvIx0RuhayG3DdKLf/9tLXSzg9DmZ7Cr/iDvds5W22jzAa,1.0,databert-handle/batch_job/1.0.0,0,1644050481000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,1
3,session_0,tafelpoot,ASlvIx2ooaC3nk6eTAaOTKsBKZYhST62uNIuBJO7o190r2...,PublicFieldA_jLXsNGyg,0.537864,ASlvIx3gUQnEVhmTb15i+UnglPlKUVg9BBQysKOgGy45XI...,ASlvIx36KsZCUw91nVWZQ+3bkX3fnusW0bfADwE8gthYdAQ=,2.0,databert-handle/batch_job/1.0.0,0,1644051379000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
4,session_0,tafelpoot,ASlvIx1fWBra0jPWx95FwJEHbnMj7AJ+qgw9Cb2vCs0d9I...,PublicFieldA_mxcHksag,0.528041,ASlvIx3kAPlHCoNfUYbQNJf7hLheu9MOWA77lIa8fGt2i6...,ASlvIx0KjDC/9IurDXONc7/HNUwItXJ221mYrwxV00w96x1c,,databert-handle/batch_job/1.0.0,0,1644052308000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,0


In [None]:
# Display the derived output file.
df_derived

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_0,3a68fcf2cb9904b88c063026de8acee8,91090ff80fb6f591c19a01e80d85fb0a,PublicFieldA_PMiMepXT,0.759860,PrivateFieldA_qqFkBjLj,259893089738525,12,databert-handle/batch_job/1.0.0,0,1644049600000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
1,session_0,3a68fcf2cb9904b88c063026de8acee8,ff74c576924b987db146129bbf968437,PublicFieldA_jLXsNGyg,0.537864,PrivateFieldA_nJlhAcMG,76465180845175,02,databert-handle/batch_job/1.0.0,0,1644051379000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
2,session_0,3a68fcf2cb9904b88c063026de8acee8,90be814647c7a56d6c09e639b4e8d9a4,PublicFieldA_tlZTItYj,0.323598,PrivateFieldA_RzUgGFjh,617548905161487,12,databert-handle/batch_job/1.0.0,0,1644053178000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
3,session_0,3a68fcf2cb9904b88c063026de8acee8,699687f93d93eeebfacf8a2c3a8a65b4,PublicFieldA_SOkrNvGS,0.783474,PrivateFieldA_GBrsyyac,411930978653158,2,databert-handle/batch_job/1.0.0,0,1644054088000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
4,session_1,3a68fcf2cb9904b88c063026de8acee8,7bf859456d7db9331a39d1197c765314,PublicFieldA_DMroVJBC,0.777622,PrivateFieldA_CrXlEdgm,985974082249043,02,databert-handle/batch_job/1.0.0,0,1644054969000,a71b2c65-1833-4af3-b13c-861617a8041c,databert986673817,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,session_18,3a68fcf2cb9904b88c063026de8acee8,bc86d86f4ed3df68d7ee441ceb250676,PublicFieldA_StsEGRel,0.536219,PrivateFieldA_feuWOdFh,304861624282817,012,databert-handle/batch_job/1.0.0,0,1644223271000,895ff499-b32e-4b2c-9d02-c43c91ff0aa3,databert986673817,2
127,session_18,3a68fcf2cb9904b88c063026de8acee8,6181fe8fd5bd0ddcebdda11ad3b4ae2a,PublicFieldA_BZvdwOLb,0.264669,PrivateFieldA_KTKrialo,376743263889652,12,databert-handle/batch_job/1.0.0,0,1644224183000,895ff499-b32e-4b2c-9d02-c43c91ff0aa3,databert986673817,2
128,session_18,3a68fcf2cb9904b88c063026de8acee8,a3d2cd687b5632a16cb2785010faa88f,PublicFieldA_McOxyZMh,0.446433,PrivateFieldA_lrBEoSIz,637789140165323,012,databert-handle/batch_job/1.0.0,0,1644225989000,895ff499-b32e-4b2c-9d02-c43c91ff0aa3,databert986673817,2
129,session_18,3a68fcf2cb9904b88c063026de8acee8,a9d78bbf95c950e8c89ffca19672f9d4,PublicFieldA_DslELFQG,0.811559,PrivateFieldA_bUhhsSif,334100066662549,02,databert-handle/batch_job/1.0.0,0,1644226880000,895ff499-b32e-4b2c-9d02-c43c91ff0aa3,databert986673817,2


In [None]:
# Display the encryption keys file.
df_encryption_keys

Unnamed: 0,keyLink,encryptionKey
0,694663cf-a9ff-4887-a3ad-38e7a99f5dea,"{""primaryKeyId"":1860054122,""key"":[{""keyData"":{..."
1,5e8a2ec3-747c-4638-a949-13c212134ac7,"{""primaryKeyId"":129621179,""key"":[{""keyData"":{""..."
2,55e1e012-0b70-4dc9-9893-9df87629b6ec,"{""primaryKeyId"":1945956772,""key"":[{""keyData"":{..."
