SOP0126 - Backup Keys For Encryption At Rest
============================================

Description
-----------

Use this notebook to connect to the `controller` database and back up
the keys used for encryption at rest.

Steps
-----

### Parameters

Set `backup_file_path` to the location where the backup file will be
written; make sure the file name has a `.json` extension. Set
`password_to_encrypt` to the password that will be used to protect the
exported certificate(s).

In [None]:
import os
import tempfile

# Destination of the backup: a JSON file inside a freshly created temporary
# directory. Edit this assignment to write the backup somewhere else.
backup_directory = tempfile.mkdtemp()
backup_file_path = os.path.join(backup_directory, 'hadoopEncryptionKeys.json')
print(f"The keys will be backed up at {backup_file_path}")

# Password handed to the certificate protection step; replace the placeholder.
password_to_encrypt = "your_password"
print(f"Key(s) will be saved at this path: '{backup_file_path}'. Please make sure you have permission to access this path.")

### Instantiate Kubernetes client

In [None]:
# Instantiate the Python Kubernetes client into 'api' variable

import os
from IPython.display import Markdown, display

try:
    from kubernetes import client, config
    from kubernetes.stream import stream

    # When both service variables are present the in-cluster configuration is
    # loaded; otherwise the local kube config is used.
    if "KUBERNETES_SERVICE_PORT" in os.environ and "KUBERNETES_SERVICE_HOST" in os.environ:
        config.load_incluster_config()
    else:
        try:
            config.load_kube_config()
        except Exception:
            # Point the reader at the repair notebook, then re-raise so the
            # cell still fails. KeyboardInterrupt/SystemExit pass through untouched.
            display(Markdown('HINT: Use [TSG118 - Configure Kubernetes config](../repair/tsg118-configure-kube-config.ipynb) to resolve this issue.'))
            raise
    api = client.CoreV1Api()

    print('Kubernetes client instantiated')
except ImportError:
    # The kubernetes package is missing: show the install hint and re-raise.
    display(Markdown('HINT: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))
    raise

### Get the namespace for the big data cluster

Get the namespace of the Big Data Cluster from the Kubernetes API.

**NOTE:**

If there is more than one Big Data Cluster in the target Kubernetes
cluster, then either:

-   set \[0\] to the correct value for the big data cluster.
-   set the environment variable AZDATA\_NAMESPACE, before starting
    Azure Data Studio.

In [None]:
# Place Kubernetes namespace name for BDC into 'namespace' variable

# An explicitly exported AZDATA_NAMESPACE takes precedence; otherwise the first
# namespace returned for the MSSQL_CLUSTER label selector is used.
azdata_namespace = os.environ.get("AZDATA_NAMESPACE")

if azdata_namespace is not None:
    namespace = azdata_namespace
else:
    try:
        labelled_namespaces = api.list_namespace(label_selector='MSSQL_CLUSTER').items
        namespace = labelled_namespaces[0].metadata.name
    except IndexError:
        # No labelled namespace was found: show the troubleshooting hints and
        # re-raise so the cell fails.
        from IPython.display import Markdown
        troubleshooting_hints = (
            'HINT: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.',
            'HINT: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.',
            'HINT: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.',
        )
        for hint_text in troubleshooting_hints:
            display(Markdown(hint_text))
        raise

print(f'The kubernetes namespace for your big data cluster is: {namespace}')

### Python function to query the `controller` database and return the results.

In [None]:
import pandas
from io import StringIO
# Do not truncate column contents when a DataFrame is displayed.
# `None` is the supported "no limit" value; -1 was deprecated in pandas 1.0 and
# is rejected by newer releases. Very old pandas only accepts an int, so fall
# back to the legacy -1 there.
try:
    pandas.set_option('display.max_colwidth', None)
except ValueError:
    pandas.set_option('display.max_colwidth', -1)

# Pod and container in which get_dataframe runs sqlcmd.
name = 'controldb-0'
container = 'mssql-server'

def get_dataframe(query):
    """Run `query` against the `controller` database and return a DataFrame.

    The query is executed with sqlcmd through a Kubernetes pod exec, in the pod
    and container named by the module-level `name` and `container` variables.
    sqlcmd is asked for '^' separated output; `sed 2d` drops the second output
    line, leaving the header row followed by the data rows.

    Args:
        query: T-SQL text, appended after `SET NOCOUNT ON;`.

    Returns:
        pandas.DataFrame parsed from the '^' separated command output.
    """
    sqlcmd_script = f"""export SQLCMDPASSWORD=$(cat /var/run/secrets/credentials/mssql-sa-password/password);
    /opt/mssql-tools/bin/sqlcmd -b -S . -U sa -Q "SET NOCOUNT ON;
    {query}" -d controller  -s"^" -W | sed 2d
    """
    exec_argv = ['/bin/sh', '-c', sqlcmd_script]
    caret_separated_text = stream(
        api.connect_get_namespaced_pod_exec,
        name,
        namespace,
        command=exec_argv,
        container=container,
        stderr=True,
        stdout=True,
    )
    result_buffer = StringIO(caret_separated_text)
    return pandas.read_csv(result_buffer, sep='^')

print("Function 'get_dataframe' defined")

### Python function to execute a command in a Kubernetes pod.

In [None]:
# Pod and container in which execute_k8scommand runs its shell commands.
pod_name = 'controldb-0'
container = 'mssql-server'

def execute_k8scommand(command):
    """Run `command` with `/bin/sh -c` inside the target pod and return its output.

    Args:
        command: Shell command text to execute.

    Returns:
        The value returned by the pod exec stream (stdout and stderr are both
        requested), converted with `str`.
    """
    shell_invocation = ['/bin/sh', '-c', command]
    raw_output = stream(
        api.connect_get_namespaced_pod_exec,
        pod_name,
        namespace,
        command=shell_invocation,
        container=container,
        stderr=True,
        stdout=True,
    )
    return str(raw_output)
print("Function 'execute_k8scommand' defined")

### Python function to protect certificate.

Python function to password protect a Base64 URL-encoded certificate.
The certificates are the container for the private key which protects
the Key Encryption Keys. These certificates should be password protected
before they are exported.

In [None]:
import sys
import base64

def generated_password_protected_certificate(encoded_certificate, password):
    """Re-export a Base64 URL encoded PFX certificate as a password protected PFX.

    The certificate is decoded locally, re-encoded as standard Base64 and handed
    to a shell function that runs in the pod via `execute_k8scommand`. There,
    openssl reads the key and the certificate from the PFX using an empty import
    password, pipes both into `openssl pkcs12 -export` with `password` as the
    export password, and Base64 encodes the result.

    Args:
        encoded_certificate: Base64 URL encoded PFX, with or without '=' padding.
        password: Export password for the new PFX. It is shell-quoted, so spaces
            and shell metacharacters are passed to openssl literally.

    Returns:
        The output of the remote command as a string (the Base64 encoded,
        password protected PFX when the command succeeds).
    """
    import shlex  # local import: only this function needs it

    # The certificate is base64 URL encoded and may lack padding. Add exactly the
    # number of '=' needed to reach a multiple of four (none if already aligned).
    padding = "=" * (-len(encoded_certificate) % 4)
    decoded_bytes = base64.urlsafe_b64decode(encoded_certificate + padding)
    base64_encoded_certificate = base64.b64encode(decoded_bytes).decode("utf-8")

    # Quote the password so that whitespace, quotes, '$', ';' etc. cannot break
    # or alter the shell command. Simple passwords are left unchanged.
    # NOTE: the password is still passed on the openssl command line inside the
    # container.
    quoted_password = shlex.quote(str(password))

    # Use the kubernetes container to generate a PEM file without persisting to disk, to recreate the PFX with password.
    password_protected_pfx_generation_command = f"""
    generateEncodedPfx() {{ \
        {{ echo $1 | base64 -d | openssl pkcs12 -nocerts -nodes -passin pass:$(echo '') 2>/dev/null ; \
            echo $1 | base64 -d | openssl pkcs12 -nokeys -passin pass:$(echo '') 2>/dev/null ; }} \
            | openssl pkcs12 -export -passout pass:{quoted_password} 2>/dev/null \
            | base64 -w 0
        }}
    generateEncodedPfx {base64_encoded_certificate}
    """

    password_protected_certificate = execute_k8scommand(password_protected_pfx_generation_command)

    return password_protected_certificate

print("Function 'generated_password_protected_certificate' defined.")


### Backup encryption keys.

In [None]:
import json
import base64

# Read the `encryptionPassword` entry of the controller database secret and
# base64 decode it; it is the password that opens the symmetric key below.
controller_db_secret = api.read_namespaced_secret("controller-db-rw-secret", namespace)
symmetric_key_protection_password = base64.b64decode(str(controller_db_secret.data['encryptionPassword'])).decode('utf-8')

# {0} = symmetric key password, {1} = credential type to select.
tsql_template = """
OPEN SYMMETRIC KEY ControllerDbSymmetricKey DECRYPTION BY PASSWORD = '{0}'
select account_name, CAST(DecryptByKey([encrypted_password]) AS NVARCHAR(4000)) as unencrypted_password, application_metadata from Credentials where application_metadata like '%hdfsvault-svc%' and type = '{1}'
"""

# Type 2 rows are stored as-is.
tsql_secrets_retrieval = tsql_template.format(symmetric_key_protection_password, '2')
df_secrets = get_dataframe(tsql_secrets_retrieval)

# Type 3 rows hold certificates, which are password protected before export.
tsql_certificate_retrieval = tsql_template.format(symmetric_key_protection_password, '3')
df_cert = get_dataframe(tsql_certificate_retrieval)

# Build the whole backup in memory before touching the file, so a failure while
# protecting a certificate cannot leave an empty or truncated backup behind.
# Columns are read by position with .iloc (0 = account_name,
# 1 = unencrypted_password, 2 = application_metadata); plain row[0] on a
# label-indexed Series relies on a deprecated positional fallback.
json_keys_entries = []
for _, row in df_secrets.iterrows():
    json_keys_entries.append({'id': row.iloc[0], 'tags': row.iloc[2], 'value': row.iloc[1], 'type': 2})

for _, row in df_cert.iterrows():
    protected_certificate = generated_password_protected_certificate(row.iloc[1], password_to_encrypt)
    json_keys_entries.append({'id': row.iloc[0], 'tags': row.iloc[2], 'value': protected_certificate, 'type': 3})

with open(backup_file_path, "w") as file:
    json.dump(json_keys_entries, file)

print(f"Encryption Key(s) Backed up at location: {backup_file_path}")

In [None]:
# Last cell of the notebook: prints a completion message.
print('Notebook execution complete.')