In [1]:
import argparse
import io
import math
import os
import random
import re
import string
import time
from datetime import datetime

import boto3
import pandas as pd
from tqdm import tqdm

# Creating a batch job

1. Sign up to STRM Privacy
2. Create a schema and corresponding contract or use a publicly available one
3. Create a Data Connector (we use an AWS S3 bucket)
4. Generate (or use real) data
5. Send data to the input data connector
6. Run batch job and write to output data connector (possibly same bucket as input)
7. Fetch data from data connector and inspect

## 1. Sign up to STRM Privacy
See the [authentication docs](https://docs.strmprivacy.io/docs/latest/quickstart/authentication-cli.html) to sign up to STRM Privacy.

## 2. Create Schema and Contract
See [schemas and contract docs](https://docs.strmprivacy.io/docs/latest/concepts/schemas-and-contracts.html).

## 3. Create Data Connector

In this tutorial we use an AWS S3 bucket. Either use the [console](https://console.strmprivacy.io/data-connectors) to create a data connector for the bucket, or follow the steps from the quickstart [docs](https://docs.strmprivacy.io/docs/latest/quickstart/batch-exporter/).

The `s3.json` file holds the AWS access key that is used when creating the data connector. It should follow this structure:

```json
{ 
    "AccessKey": {
        "UserName": "your-username",
        "AccessKeyId": "***",
        "Status": "Active",
        "SecretAccessKey": "***",
        "CreateDate": "***"
    }
}
```

## 4. Generate data

We have generated some sample data. It contains three PII fields: `Email`, `PrivateFieldA` and `PrivateFieldB`.

In [2]:
# Load the generated sample events from the local CSV file and display them.
df = pd.read_csv(filepath_or_buffer='./data/input.csv')
df

Unnamed: 0,SessionId,UserName,Timestamp,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane
0,session_1,strm_demo_user,2022-02-06 06.43.12.962083:+0100,ibthtfnf@mbeito.com,PublicFieldA_ZIyrjcBj,0.020183,PrivateFieldA_kHmSNlAn,279909288129237,2
1,session_1,strm_demo_user,2022-02-06 11.43.36.340336:+0100,ifutwmxu@sllcja.com,PublicFieldA_pBznLzVx,0.261294,PrivateFieldA_EwVAJxxF,304221592396358,2
2,session_2,strm_demo_user,2022-02-06 16.43.10.365912:+0100,ilmcedhf@sxdsyh.com,PublicFieldA_SyVIWPEH,0.632461,PrivateFieldA_tLwMlNoz,384570554091913,1
3,session_3,strm_demo_user,2022-02-06 21.43.02.232553:+0100,vjmorzkn@pfsmul.com,PublicFieldA_qvUYaCPR,0.92091,PrivateFieldA_yRxNLMRa,66897108978672,0
4,session_3,strm_demo_user,2022-02-07 02.43.10.174305:+0100,sxsoyvkd@gtggzt.com,PublicFieldA_oBodytpb,0.802284,PrivateFieldA_jDwasBFI,668450493294117,0
5,session_4,strm_demo_user,2022-02-07 07.43.06.975069:+0100,sxpmqguk@pvrazs.com,PublicFieldA_eaPWZEao,0.701134,PrivateFieldA_UrotTNQH,488952946713404,1
6,session_5,strm_demo_user,2022-02-07 12.43.10.145600:+0100,dhrmhyzd@arhrpv.com,PublicFieldA_ZEiDwfZH,0.681264,PrivateFieldA_ZxZuogOc,167594023188503,2
7,session_5,strm_demo_user,2022-02-07 17.43.30.268479:+0100,wbkztlnr@zumcpf.com,PublicFieldA_JSyWvVoh,0.212249,PrivateFieldA_SOjxTSGC,852564768616528,0
8,session_6,strm_demo_user,2022-02-07 22.43.26.284456:+0100,dzdbvaly@hbtblw.com,PublicFieldA_EEXuECdg,0.368563,PrivateFieldA_ZUexCLVS,56735494626097,1
9,session_7,strm_demo_user,2022-02-08 03.42.50.399490:+0100,iavumtgg@navmrw.com,PublicFieldA_qwTBCmmE,0.92193,PrivateFieldA_UMHXHyxL,226976555491828,0


## 5. Write data to input data connector

Send the data to the S3 bucket.

In [13]:
class AwsProperties(object):
    """Connection settings for the S3 bucket used as data connector.

    The credentials are taken from the standard ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables instead of being typed
    into the notebook, so that no secret ends up in a saved or shared copy.
    When a variable is not set the attribute is ``None``; boto3 treats a
    ``None`` key as "not provided" and uses its own credential lookup.
    """
    # Resolved once, at the moment this cell is executed.
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    # Region passed to boto3 when the S3 resource is created.
    region = 'eu-central-1'
    # Name of the bucket that holds the batch job input and output files.
    bucket = 'databert'

In [14]:
# Instantiate the settings holder and open an S3 resource handle with the
# region and credentials it carries.
AWS = AwsProperties()
s3 = boto3.resource(
    's3',
    region_name=AWS.region,
    aws_access_key_id=AWS.aws_access_key_id,
    aws_secret_access_key=AWS.aws_secret_access_key,
)

In [15]:
# Upload the sample CSV to the bucket under the key 'databert-demo.csv'.
# The bucket name comes from AWS.bucket so that changing the configuration
# cell is enough to point the notebook at another bucket, and the `with`
# block closes the local file handle once the upload call has returned.
# NOTE(review): section 4 loads './data/input.csv' while this cell uploads
# './databert-demo.csv' -- confirm both refer to the same sample data.
with open('./databert-demo.csv', 'rb') as input_file:
    resp = s3.Object(AWS.bucket, 'databert-demo.csv').put(Body=input_file)

## 6. Run batch job
We'll be running a batch job from the CLI. We need to pass the configuration of the batch job as an argument. An example configuration can be found below.


```json
{
    "ref": {
      "billing_id": "your_billing_id"
    },
    "source_data": {
      "data_connector_ref": {
        "billing_id": "your_billing_id",
        "name": "databert-demo"
      },
      "file_name": "databert-demo.csv",
      "data_type": {
        "csv": {
          "charset": "UTF-8"
        }
      }
    },
    "consent": {
      "default_consent_levels": [
        0
      ],
      "consent_level_extractor": {
        "field": "PrivacyPlane",
        "field_patterns": {
          "1": {
            "consent_levels": [
              1
            ]
          },
          "2": {
            "consent_levels": [
              2
            ]
          }
        }
      }
    },
    "encryption": {
      "timestamp_config": {
        "field": "Timestamp",
        "format": "yyyy-MM-dd HH.mm.ss.nnnnnn:Z",
        "default_time_zone": {
          "id": "UTC"
        }
      },
      "batch_job_group_id": null
    },
    "event_contract_ref": {
      "handle": "databert-handle",
      "name": "batch_job_public",
      "version": "1.0.0"
    },
    "encrypted_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encrypted.csv"
      }
    },
    "encryption_keys_data": {
      "target": {
        "data_connector_ref": {
          "billing_id": "your_billing_id",
          "name": "databert-demo"
        },
        "data_type": {
          "csv": {
            "charset": "UTF-8"
          }
        },
        "file_name": "databert-demo-encryption-keys.csv"
      }
    },
    "derived_data": [
      {
        "target": {
          "data_connector_ref": {
            "billing_id": "your_billing_id",
            "name": "databert-demo"
          },
          "data_type": {
            "csv": {
              "charset": "UTF-8"
            }
          },
          "file_name": "databert-demo-derived.csv"
        },
        "consent_levels": [
          2
        ],
        "consent_level_type": "CUMULATIVE",
        "masked_fields": {
          "field_patterns": {
            "databert-handle/batch_job_public/1.0.0": {
              "field_patterns": [
                "Email",
                "UserName"
              ]
            }
          }
        }
      }
    ]
  }
```

Save the configuration above as `batch-job.json` and create the batch job: `strm create batch-job -F batch-job.json`

Wait for the job to finish. You can check its status with `strm list batch-jobs`.


## 7. Fetch and explore Data

Now we fetch the encrypted and derived data from the data connector.
Based on our data contract, we expect the PII fields `Email`, `PrivateFieldA` and `PrivateFieldB` to be encrypted in the encrypted file.
In the derived file we expect the fields `Email` and `UserName` to be masked with a hash. The derived data is configured for consent level 2, so it should only contain the events that carry that consent level. Let's investigate:

In [16]:
# Fetch the three files written by the batch job from the bucket and parse
# each one as CSV. The bucket name is taken from AWS.bucket (instead of being
# repeated as a literal) so it stays in sync with the configuration cell.
# Each response body is read fully into memory and wrapped in BytesIO before
# it is handed to pandas.

# Events with the PII fields encrypted.
encrypted = s3.Object(AWS.bucket, 'databert-demo-encrypted.csv').get()
df_encrypted = pd.read_csv(io.BytesIO(encrypted['Body'].read()))

# Encryption keys, one row per key link.
encryption_keys = s3.Object(AWS.bucket, 'databert-demo-encryption-keys.csv').get()
df_encryption_keys = pd.read_csv(io.BytesIO(encryption_keys['Body'].read()))

# Derived events.
derived = s3.Object(AWS.bucket, 'databert-demo-derived.csv').get()
df_derived = pd.read_csv(io.BytesIO(derived['Body'].read()))

In [17]:
# Display the DataFrame parsed from 'databert-demo-encrypted.csv'.
df_encrypted

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_1,strm_demo_user,AX+sVreQHft41CYozNAalRpuzy00lJanLD4RI5d9Z/8+Kf...,PublicFieldA_ZIyrjcBj,0.020183,AX+sVrd6wrGKZbnNl1AXhmrp7K00DgWKP6V0cD3BflciW3...,AX+sVrecjlZ4KBfZDI9gjMIoOX/lCyFsL2mbcPIBfUJzsgCR,2,databert-handle/batch_job_public/1.0.0,0,1644126192000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,2
1,session_1,strm_demo_user,AX+sVrceKUWi3pbDRblf9UjqX+zMIb1AAu8bpVRAJCemA7...,PublicFieldA_pBznLzVx,0.261294,AX+sVrdJqRHCqZuhLsDsuZQXW9Gv3TckI3MTc7WNo8PjZx...,AX+sVrfoC8CbYIzUoPkIf8ITKr4RL9/+0RHlDk6QolWoUTbl,2,databert-handle/batch_job_public/1.0.0,0,1644144216000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,2
2,session_2,strm_demo_user,AX+sVrc7xTqPpID3FaUvhGR3oYCKragNbn1cBP7hxNJheV...,PublicFieldA_SyVIWPEH,0.632461,AX+sVrfQZ6pb8ESQ6i3cAEd0FpmfAOy8uO4WyhQDj3+k/5...,AX+sVrcP4RGKM86jOgm7rdYe2WpJjNOyr0RVLinOSqTg7N4d,1,databert-handle/batch_job_public/1.0.0,0,1644162190000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,1
3,session_3,strm_demo_user,AX+sVrcLXAtDrTcXyaVezGK87seVrO+q/Y1QthUEqHcDhq...,PublicFieldA_qvUYaCPR,0.92091,AX+sVrcmwsqzKiuS5Z6rO90G9Gl1ank2n6f8Icz4wYG9TC...,AX+sVrfDmTFRseI4mwllkPjdAJ6jrLTfdYbf6IcLyS69PVQ=,0,databert-handle/batch_job_public/1.0.0,0,1644180182000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,0
4,session_3,strm_demo_user,AT1NfVOKYWkt99iRfFEx5a1LidWiq3UBnZiyuNRrJCuk5V...,PublicFieldA_oBodytpb,0.802284,AT1NfVPiAXhJ4KtY5N2ucOfMHoMGQTRPpp3qwNNKx8a2nG...,AT1NfVOpF+3ibfe1sHHav/NnkS7RyjV3avfofWajGC9ZY718,0,databert-handle/batch_job_public/1.0.0,0,1644198190000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,0
5,session_4,strm_demo_user,AT1NfVNQi2SeLwsLHgi0qKX2sf8T0RuyRNYBxc6vELqWfP...,PublicFieldA_eaPWZEao,0.701134,AT1NfVMuSuRKuRpJj0ouzPBuL7UKTdwK466Y2HQTJzBmjU...,AT1NfVNQIBt246u3xBx6xo9tDZJJQV30BgICuZr/KDIua06x,1,databert-handle/batch_job_public/1.0.0,0,1644216186000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,1
6,session_5,strm_demo_user,AT1NfVPf5IHajoUlCHWlyHKhGsW/i7QZihDfj/mpEJAA71...,PublicFieldA_ZEiDwfZH,0.681264,AT1NfVNMCky18pGM/uSFYTlesThB9QALXTAGPeNFXnumxj...,AT1NfVMiCY2z8fOCoYenb0CnWW9kAtbjaP4+O6VlIRUaXK1I,2,databert-handle/batch_job_public/1.0.0,0,1644234190000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,2
7,session_5,strm_demo_user,AT1NfVNY/30jPRNg50lzqJ1AZRdggxODYrcTuIH+ZoIGCK...,PublicFieldA_JSyWvVoh,0.212249,AT1NfVNdQICmqLKtjNwRPIDOzDeUAkYX4Ov9VZnpiSHKdU...,AT1NfVM2+hjPhDrpgmNwBIO203V1QWZiK5+0Vck9JD6tJmAh,0,databert-handle/batch_job_public/1.0.0,0,1644252210000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,0
8,session_6,strm_demo_user,AT1NfVNTKkIbxrMlKnJpY3GnRvdBFYlUmtwd124mkpG3/x...,PublicFieldA_EEXuECdg,0.368563,AT1NfVO2e4S/9zCy1heXMwF/L+m+2x5A3XW5QHzE1XJ/XY...,AT1NfVPkPwoGgtjhJ8HtltDQQfbRus7skGlv4n3Ui9RzIvI=,1,databert-handle/batch_job_public/1.0.0,0,1644270206000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,1
9,session_7,strm_demo_user,AWUGvEGwxWtWfrYflkdDEMnpm5gXTKj+Q+uObgP8iBGY4O...,PublicFieldA_qwTBCmmE,0.92193,AWUGvEH36tmpm5MVCKLLVCy2/fgeyJHxBg4IqgWdHpnovb...,AWUGvEH8mmqemMAPsKw8+MD4NtWQv66je5dulzsUnaaXHH8c,0,databert-handle/batch_job_public/1.0.0,0,1644288170000,ea388588-06dc-4e90-8d9e-1651021b5ebe,databert986673817,0


In [18]:
# Display the DataFrame parsed from 'databert-demo-derived.csv'.
df_derived

Unnamed: 0,SessionId,UserName,Email,PublicFieldA,PublicFieldB,PrivateFieldA,PrivateFieldB,PrivacyPlane,strmMeta.eventContractRef,strmMeta.nonce,strmMeta.timestamp,strmMeta.keyLink,strmMeta.billingId,strmMeta.consentLevels
0,session_1,788d082a29fd07f61e1df95bbe98ef9f,4d0602d8bfe14089fb14cdef7d8380be,PublicFieldA_ZIyrjcBj,0.020183,PrivateFieldA_kHmSNlAn,279909288129237,2,databert-handle/batch_job_public/1.0.0,0,1644126192000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,2
1,session_1,788d082a29fd07f61e1df95bbe98ef9f,89a133907a42f4926a7ac68f45872c95,PublicFieldA_pBznLzVx,0.261294,PrivateFieldA_EwVAJxxF,304221592396358,2,databert-handle/batch_job_public/1.0.0,0,1644144216000,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,databert986673817,2
2,session_5,788d082a29fd07f61e1df95bbe98ef9f,cf3276704ba63f44ea326a06d010884a,PublicFieldA_ZEiDwfZH,0.681264,PrivateFieldA_ZxZuogOc,167594023188503,2,databert-handle/batch_job_public/1.0.0,0,1644234190000,1ceae097-c760-42af-9212-144d78944b0e,databert986673817,2


In [19]:
# Display the DataFrame parsed from 'databert-demo-encryption-keys.csv'.
df_encryption_keys

Unnamed: 0,keyLink,encryptionKey
0,843ba12a-d9c4-4b53-ae99-9fa1c8adc80e,"{""primaryKeyId"":2142000823,""key"":[{""keyData"":{..."
1,1ceae097-c760-42af-9212-144d78944b0e,"{""primaryKeyId"":1028488531,""key"":[{""keyData"":{..."
2,ea388588-06dc-4e90-8d9e-1651021b5ebe,"{""primaryKeyId"":1694940225,""key"":[{""keyData"":{..."
