In [None]:
!pip install boto3
import argparse
import io
import os
import random
import re
import string
import time
from datetime import datetime

import boto3
import pandas as pd
from tqdm import tqdm

# Creating a batch job

1. Sign up to STRM Privacy
2. Create a schema and corresponding contract or use a publicly available one
3. Create a Sink/Data Connector (we use AWS S3 bucket)
4. Generate (or use real) data
5. Send data to input sink/data connector
6. Run batch job and save to output sink/data connector (possibly same sink as input)
7. Fetch data from sink/data connector and inspect

## 1. Sign up to STRM Privacy
See the [authentication docs](https://docs.strmprivacy.io/docs/latest/quickstart/authentication-cli.html) to sign up to STRM Privacy.

## 2. Create Schema and Contract
See [schemas and contract docs](https://docs.strmprivacy.io/docs/latest/concepts/schemas-and-contracts.html).

## 3. Create Sink/Data Connector

In this tutorial we use an AWS S3 sink. Either use the [console](https://console.strmprivacy.io/sinks) to create a sink or follow the steps in the quickstart [docs](https://docs.strmprivacy.io/docs/latest/quickstart/receiving-s3.html).

The `s3.json` should follow this structure:

```json
// s3.json
{ 
    "AccessKey": {
        "UserName": "your-username",
        "AccessKeyId": "***",
        "Status": "Active",
        "SecretAccessKey": "***",
        "CreateDate": "***"
    }
}
```

## 4. Generate data

Let's generate some random user data.

In [4]:
class DataGenerator:
    """Generate random demo user events and persist them as a CSV file.

    Each row carries a session id, a fixed user name, a timestamp, an email
    address, two "public" fields, two "private" fields and a privacy plane.
    """

    def __init__(self, nrows):
        """Create a generator that will emit ``nrows`` rows.

        Args:
            nrows: Number of rows to generate. It is truncated with ``int()``
                wherever an integer is required.
        """
        self.nrows = nrows
        self.user_name = "strm_demo_user"
        self.session_id = 0
        self.reset_counter_and_session_size()

    def get_random_value(self, value_type, field_name, iter):
        """Return one random cell value for the given column type.

        Args:
            value_type: One of ``STRING``, ``INT``, ``FLOAT``, ``USER_NAME``,
                ``SESSION_ID``, ``TIMESTAMP``, ``EMAIL`` or ``PLANE``.
            field_name: Column name; its first space-separated token is used as
                the prefix of ``STRING`` values.
            iter: Zero-based row index, used to spread timestamps over time.

        Returns:
            A string for every type except ``PLANE`` (an int in ``{0, 1, 2}``).
            ``None`` is returned for an unrecognised ``value_type``.
        """
        if value_type == "STRING":
            prefix = field_name.split(' ')[0]
            suffix = ''.join(random.choice(string.ascii_letters) for _ in range(8))
            return f"{prefix}_{suffix}"
        elif value_type == "INT":
            # Integer bound: random.randint rejects float arguments on
            # Python 3.12+ (and was deprecated for them since 3.10).
            return f"{random.randint(0, 10 ** 15)}"
        elif value_type == "FLOAT":
            return f"{random.random()}"
        elif value_type == "USER_NAME":
            return self.user_name
        elif value_type == "SESSION_ID":
            # Start a new session once the current one has emitted more than
            # max_session rows, and draw a fresh size for the next session.
            self.counter += 1
            if self.counter > self.max_session:
                self.session_id += 1
                self.reset_counter_and_session_size()
            return f'session_{self.session_id}'
        elif value_type == "TIMESTAMP":
            # Row 0 lies 50 hours in the past and the last row is close to
            # "now"; every stamp gets up to +/- 30 seconds of jitter.
            stamp = time.time() - (1 - iter / self.nrows) * (3600 * 50) + 60 * (random.random() - 0.5)
            return datetime.fromtimestamp(stamp).astimezone().strftime('%Y-%m-%d %H.%M.%S.%f:%z')
        elif value_type == "EMAIL":
            name = self.rstring(8)
            host = self.rstring(6)
            return f'{name}@{host}.com'
        elif value_type == "PLANE":
            return random.choice([0, 1, 2])

    def reset_counter_and_session_size(self):
        """Reset the per-session row counter and draw a new session size.

        The size is drawn uniformly between 1% and 10% of ``nrows`` using
        integer bounds, so any row count works (not only multiples of 100).
        """
        self.counter = 0
        total_rows = int(self.nrows)
        self.max_session = random.randint(total_rows // 100, total_rows // 10)

    def rstring(self, n):
        """Return ``n`` random lowercase ASCII letters."""
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))

    def generate(self, output_path='./databert-demo.csv'):
        """Generate all rows, write them to ``output_path`` and return them.

        Args:
            output_path: Destination CSV file. Defaults to the path used by the
                rest of the notebook.

        Returns:
            The generated rows as a DataFrame sorted by ``Timestamp``, as read
            back from the CSV file.
        """
        col_names = {
            "SessionId"     : "SESSION_ID",
            "UserName"      : "USER_NAME",
            "Timestamp"     : "TIMESTAMP",
            "Email"         : "EMAIL",
            "PublicFieldA"  : "STRING",
            "PublicFieldB"  : "FLOAT",
            "PrivateFieldA" : "STRING",
            "PrivateFieldB" : "INT",
            "PrivacyPlane"  : "PLANE"
        }

        # Collect all rows in memory and write the file once, instead of
        # building and appending a one-row DataFrame per record.
        records = [
            {name: self.get_random_value(kind, name, i) for name, kind in col_names.items()}
            for i in tqdm(range(int(self.nrows)))
        ]
        pd.DataFrame(records, columns=list(col_names)).to_csv(output_path, index=False)

        # Round-trip through the CSV so the returned dtypes are the ones
        # pandas infers from the file, then persist the sorted version.
        df = pd.read_csv(output_path).sort_values(by=['Timestamp'])
        df.to_csv(output_path, header=True, index=False)
        return df

In [5]:
# Build a generator for 20000 rows; generate() writes ./databert-demo.csv and
# returns the rows sorted by Timestamp.
user = DataGenerator(20000)
df = user.generate()
# Preview the first rows of the generated data.
df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:10<00:00, 1997.50it/s]


Unnamed: 0,SessionId,UserName,Timestamp,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane
1,session_0,strm_demo_user,2022-02-05 13.59.32.779310:+0100,xcbezlns@fekajx.com,PublicFieldA_LPGWCNin,0.804185,PrivateFieldA_AyoSqUVG,199091768142917,0
0,session_0,strm_demo_user,2022-02-05 13.59.34.443059:+0100,pstpkkdg@wyvpug.com,PublicFieldA_SWDCLMaN,0.566102,PrivateFieldA_DYMAKcvx,128218529374848,0
2,session_0,strm_demo_user,2022-02-05 13.59.47.948041:+0100,uvqljilu@frvlpb.com,PublicFieldA_MFLuCSXl,0.896083,PrivateFieldA_QGDHaImD,513771886153317,1
3,session_0,strm_demo_user,2022-02-05 13.59.51.224595:+0100,asemajov@elceps.com,PublicFieldA_posKCzkj,0.910711,PrivateFieldA_mCetxtQq,562418582583012,1
4,session_0,strm_demo_user,2022-02-05 14.00.32.982780:+0100,vkysmsas@wiwcaj.com,PublicFieldA_GtCXxKdL,0.025278,PrivateFieldA_YdzkAkXS,275539236044180,0


## 5. Send data to input sink

Send the data to the S3 bucket.

In [9]:
class AwsProperties(object):
    """AWS connection settings for the demo bucket.

    The access key pair is read from the standard ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables instead of being embedded
    in the notebook. When a variable is unset the attribute is ``None``, which
    lets boto3 fall back to its default credential lookup.
    """
    # NOTE(security): a key pair used to be hardcoded in this cell in plain
    # text; treat it as compromised and rotate it.
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    region = 'eu-central-1'
    bucket = 'databert'

In [10]:
# Instantiate the settings holder and open an S3 resource handle with it.
AWS = AwsProperties()
s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS.aws_access_key_id,
    aws_secret_access_key=AWS.aws_secret_access_key,
    region_name=AWS.region,
)

In [11]:
# Upload the generated CSV to the bucket. The context manager closes the local
# file handle once the upload call returns.
with open('./databert-demo.csv', 'rb') as csv_file:
    resp = s3.Object(AWS.bucket, 'databert-demo.csv').put(Body=csv_file)

## 6. Run batch job
We'll be running a batch job from the CLI. We need to pass the configuration of the batch job as an argument. An example configuration can be found below.


```json
// batch-job.json
{
    "ref": {
      "billing_id": "your_billing_id"
    },
    "source_data": {
      "data_connector_ref": {
        "billing_id": "your_billing_id",
        "name": "databert-demo"
      },
      "file_name": "databert-demo.csv",
      "data_type": {
        "csv": {
          "charset": "UTF-8"
        }
      }
    },
    "consent": {
      "default_consent_levels": [
        0
      ],
      "consent_level_extractor": {
        "field": "PrivacyPlane",
        "field_patterns": {
          "1": {
            "consent_levels": [
              1
            ]
          },
          "2": {
            "consent_levels": [
              2
            ]
          }
        }
      }
    },
    "encryption": {
      "timestamp_config": {
        "field": "Timestamp",
        "format": "yyyy-MM-dd HH.mm.ss.nnnnnn:Z",
        "default_time_zone": {
          "id": "UTC"
        }
      },
      "batch_job_group_id": null
    },
    "event_contract_ref": {
      "handle": "databert-handle",
      "name": "batch_job_public",
      "version": "1.0.1"
    },
    "encrypted_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encrypted.csv"
      }
    },
    "encryption_keys_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encryption-keys.csv"
      }
    },
    "derived_data": [
      {
        "target": {
          "data_connector_ref": {
            "billing_id": "your_billing_id",
            "name": "databert-demo"
          },
          "data_type": {
            "csv": {
              "charset": "UTF-8"
            }
          },
          "file_name": "databert-demo-derived.csv"
        },
        "consent_levels": [
          2
        ],
        "consent_level_type": "CUMULATIVE",
        "masked_fields": {
          "field_patterns": {
            "databert-handle/batch_job_public/1.0.1": {
              "field_patterns": [
                "Email",
                "UserName"
              ]
            }
          }
        }
      }
    ]
  }
```

Now create the batch job: `strm create batch-job -F batch-job.json`

Wait for the job to finish. Status can be checked via `strm list batch-jobs`


## 7. Fetch and explore Data

Now we fetch the encrypted and derived data from the sink/data connector.
Based on our data contract, we expect the PII fields `Email`, `PrivateFieldA` and `PrivateFieldB` to be encrypted in the encrypted file.
We expect that the fields `Email` and `UserName` are masked with a hash in the derived file. Let's investigate: 

In [12]:
# Download the three batch-job result files from the bucket and parse each
# one as CSV.
def _fetch_object(key):
    """Return the S3 GET response for ``key`` in the 'databert' bucket."""
    return s3.Object('databert', key).get()


def _body_to_frame(response):
    """Read the response body fully and parse it as a CSV DataFrame."""
    payload = response['Body'].read()
    return pd.read_csv(io.BytesIO(payload))


encrypted = _fetch_object('databert-demo-encrypted.csv')
df_encrypted = _body_to_frame(encrypted)

encryption_keys = _fetch_object('databert-demo-encryption-keys.csv')
df_encryption_keys = _body_to_frame(encryption_keys)

derived = _fetch_object('databert-demo-derived.csv')
df_derived = _body_to_frame(derived)

In [13]:
# Show the first rows of the encrypted file fetched from the bucket.
df_encrypted.head()

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_0,strm_demo_user,AXIUbs/NN+MB3lA4O9/B9fx6CkblrOjnvPGBd9tka0+c5S...,PublicFieldA_LPGWCNin,0.804185,AXIUbs+RbpeRxAaeFnX7tU/gYCM57WxBRHxRpus+e2Dezk...,AXIUbs/opxo2h/uTDA6Rz6GN/0nGb3bLbTPW/+zujFLpn0rv,0,databert-handle/batch_job_public/1.0.0,0,1644065972000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,0
1,session_0,strm_demo_user,AXIUbs9g0X30+c2W8EGnYyzyXswjMY5qtsiuvPIgGTHUac...,PublicFieldA_SWDCLMaN,0.566102,AXIUbs8xoIEqG2rJg1nJ0f5VBD38fEJVTaLAjbzLEJWkhV...,AXIUbs9citWdNJAgG8Uyb0Sa4pyD+GW9546q+ayb/z9RIXw3,0,databert-handle/batch_job_public/1.0.0,0,1644065974000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,0
2,session_0,strm_demo_user,AXIUbs8bVj+qh48/ZedZ2Cupmll3J5+fTpUSMrhWj29+sx...,PublicFieldA_MFLuCSXl,0.896083,AXIUbs+BD5b5IS+MCO3vHzT0d8lpHfTe4OJUdfsi6Y76KN...,AXIUbs/N5++yFOf+rw8aU6m40lcjaTSeovJihkieviB6rmLM,1,databert-handle/batch_job_public/1.0.0,0,1644065987000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,1
3,session_0,strm_demo_user,AXIUbs9ntyXmpun4aelHTOepDzoa6ylLhcN3wrKdMtmF2I...,PublicFieldA_posKCzkj,0.910711,AXIUbs8SwmQ0tDCj63Ue1n43mcQT/eHFkEpKQtRDii9SRZ...,AXIUbs+RJBh9ds6KLAJM7NT1qRiTieLKYfbnE4sJoawAsL54,1,databert-handle/batch_job_public/1.0.0,0,1644065991000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,1
4,session_0,strm_demo_user,AXIUbs8m4kF+grEzRx+83Htewfw1xOL2qYU6GWxxI7Xh08...,PublicFieldA_GtCXxKdL,0.025278,AXIUbs8Mnf8hlOa+UpU5dsalnGhX1UF33OuXzXT8Wv4gny...,AXIUbs/MN/RfupCHIbIyGrXyKZ942jSpDcPge4pXhzdeL8d7,0,databert-handle/batch_job_public/1.0.0,0,1644066032000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,0


In [16]:
# Display the derived file fetched from the bucket.
df_derived

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_0,strm_demo_user,AXIUbs/4G1cTcF2y9Ei5vAeuj6wGssK10Om3fuLhVKY5Fu...,PublicFieldA_FFJuNzqx,0.087665,PrivateFieldA_jaIDXTFy,423508964910206,2,databert-handle/batch_job_public/1.0.0,0,1644066059000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,2
1,session_0,strm_demo_user,AXIUbs+z4BMLi0Yu3KAA0VtOXwq9qfSEUBghmkUSaVgPZ4...,PublicFieldA_bjyGuMpx,0.658154,PrivateFieldA_fCzdiwvf,837835875046112,2,databert-handle/batch_job_public/1.0.0,0,1644066094000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,2
2,session_0,strm_demo_user,AXIUbs/DIVssNfrzZv8blLhFOdpASjUjZUq6sBMCFEtnOt...,PublicFieldA_mkijhIqy,0.593678,PrivateFieldA_UEGCnVVc,392355306415087,2,databert-handle/batch_job_public/1.0.0,0,1644066141000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,2
3,session_0,strm_demo_user,AXIUbs/gD08QjYIPoqgDDGw6M/BsQoJosQufs/bwpZ50Jp...,PublicFieldA_PXcxOFVV,0.267614,PrivateFieldA_KVOmyQzF,287960266516685,2,databert-handle/batch_job_public/1.0.0,0,1644066147000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,2
4,session_0,strm_demo_user,AXIUbs/TMXkT+qrzDlA6drWGuoD9GiwDOZ/MvAqy4iSgIT...,PublicFieldA_ccHDTKeo,0.744933,PrivateFieldA_JrsjOPjm,990530793375827,2,databert-handle/batch_job_public/1.0.0,0,1644066149000,d36a04fb-6128-4f25-9e54-828519100166,databert986673817,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6749,session_19,strm_demo_user,AWJeUpJoBlGfZ9AHpQLOZ0J6HKAsEBedAhdIS4NaA9P+z3...,PublicFieldA_YRbOnOwH,0.450321,PrivateFieldA_MMRAGjFB,821768305050726,2,databert-handle/batch_job_public/1.0.0,0,1644245857000,a6bed6cb-c03e-4c82-8979-e10df0f56512,databert986673817,2
6750,session_19,strm_demo_user,AWJeUpLd99qT6gzu+JecUZhFbDwLmJm4UntUC/45BNfMrV...,PublicFieldA_kJrqOuhe,0.966509,PrivateFieldA_JfmUTphq,505757036894227,2,databert-handle/batch_job_public/1.0.0,0,1644245880000,a6bed6cb-c03e-4c82-8979-e10df0f56512,databert986673817,2
6751,session_19,strm_demo_user,AWJeUpIfnHpwmS3EIY9x0eoGZRjEIG+HviiCGY7AVdk6fb...,PublicFieldA_qcWaOFDT,0.364296,PrivateFieldA_pirFSXFw,471854128182554,2,databert-handle/batch_job_public/1.0.0,0,1644245943000,a6bed6cb-c03e-4c82-8979-e10df0f56512,databert986673817,2
6752,session_19,strm_demo_user,AWJeUpJbD8Wxpi9KlCeR8xE/dzj+VBlfQ6TMIk83OklI16...,PublicFieldA_xkCvFtRw,0.582587,PrivateFieldA_klpkjpZa,187787776289616,2,databert-handle/batch_job_public/1.0.0,0,1644245983000,a6bed6cb-c03e-4c82-8979-e10df0f56512,databert986673817,2


In [15]:
# Display the encryption keys file: one encryptionKey per keyLink.
df_encryption_keys

Unnamed: 0,keyLink,encryptionKey
0,d36a04fb-6128-4f25-9e54-828519100166,"{""primaryKeyId"":1913941711,""key"":[{""keyData"":{..."
1,1655b4ae-a2a5-43e7-b4f6-09e2d26d5350,"{""primaryKeyId"":999156106,""key"":[{""keyData"":{""..."
2,a6bed6cb-c03e-4c82-8979-e10df0f56512,"{""primaryKeyId"":1650348690,""key"":[{""keyData"":{..."
