### Installing requirements

In [1]:
!pip3 install -r requirements.txt

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.11.0+cpu
  Downloading https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp38-cp38-linux_x86_64.whl (169.2 MB)
[K     |████████████████████████████████| 169.2 MB 66 kB/s s eta 0:00:01    |█████████▎                      | 49.2 MB 20.7 MB/s eta 0:00:06     |█████████▌                      | 50.4 MB 20.7 MB/s eta 0:00:06     |██████████▊                     | 56.7 MB 20.7 MB/s eta 0:00:06     |███████████▋                    | 61.6 MB 20.7 MB/s eta 0:00:06     |███████████████████▋            | 103.6 MB 18.0 MB/s eta 0:00:04     |██████████████████████▏         | 117.3 MB 21.9 MB/s eta 0:00:03MB/s eta 0:00:02     |████████████████████████████▉   | 152.2 MB 24.9 MB/s eta 0:00:01
Collecting boto3
  Downloading boto3-1.24.24-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 21.0 MB/s eta 0:00:01
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.

### Training Pipeline

In [12]:
%%writefile ./single_layer_ann_training_pipeline.py
# The %%writefile magic above saves this cell to a Python file, which the dsl-compile command in a later cell compiles.

# Global Kubeflow Pipelines imports.
import kfp
import kfp.components as comp


def unzip_data(bucket_zipfile_path: str, output_str: comp.OutputPath(str)):
    """Download a ZIP archive from S3, extract it and upload the extracted files to S3.

    Args:
        bucket_zipfile_path: Key of the ZIP archive inside the 'datakflow' bucket.
        output_str: Local path of the output artifact. It receives the S3 key of
            the last uploaded file; files are uploaded in sorted name order so
            the recorded key is deterministic.

    Raises:
        FileNotFoundError: If extraction produced no files, so there is no key
            to hand over to the next component.
    """
    # Imports required for the Pipeline Component.
    import zipfile
    import boto3
    import os

    path_bucket = 'datakflow'
    path_to_move_file = ''
    download_dir = './data'
    extract_dir = './unzipped_data'

    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(extract_dir, exist_ok=True)

    # Download a ZIP file from S3.
    boto3.resource('s3').Object(path_bucket, bucket_zipfile_path).download_file(Filename='./data/zipfile.zip')

    # Extract every archive found in the download directory.
    for archive_name in os.listdir(download_dir):
        with zipfile.ZipFile(os.path.join(download_dir, archive_name), 'r') as archive:
            archive.extractall(extract_dir)

    # Write all extracted files back to S3.
    s3_client = boto3.client('s3')
    output_path = None
    for file_name in sorted(os.listdir(extract_dir)):
        output_path = path_to_move_file + file_name
        s3_client.upload_file(
            os.path.join(extract_dir, file_name),
            path_bucket, output_path
        )

    # Without this guard an empty archive would surface as an unrelated NameError below.
    if output_path is None:
        raise FileNotFoundError(
            f"No files were extracted from '{bucket_zipfile_path}'; nothing to upload."
        )

    # Write the path of the required file into an artifact.
    with open(output_str, 'w') as writer:
        writer.write(output_path)


def read_data(
    bucket_name: str, csv_path: comp.InputPath(str), sep: str,
    decimal: str, encoding: str, output_csv: comp.OutputPath('CSV')
):
    """Download a CSV object from S3 and store its 'sequence' column as a CSV artifact.

    Args:
        bucket_name: Name of the S3 bucket that holds the CSV object.
        csv_path: Local path of an artifact whose first line is the S3 key of the CSV.
        sep: Field separator passed to ``pandas.read_csv``.
        decimal: Decimal mark passed to ``pandas.read_csv``.
        encoding: Text encoding used to decode the downloaded bytes.
        output_csv: Local path of the output CSV artifact.

    Raises:
        ValueError: If the input artifact does not contain an S3 key.
    """
    # Imports required for the Pipeline Component.
    from io import StringIO

    import pandas as pd
    import boto3

    # Read the S3 key from the artifact; drop a trailing line terminator so it
    # does not become part of the key.
    with open(csv_path, 'r') as reader:
        object_key = reader.readline().rstrip('\r\n')
    if not object_key:
        raise ValueError(f"Artifact '{csv_path}' does not contain an S3 object key.")

    # Download the unzipped file from S3 into a CSV string.
    csv_obj = boto3.client('s3').get_object(Bucket=bucket_name, Key=object_key)
    csv_string = csv_obj['Body'].read().decode(encoding)

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` keeps the old behaviour: warn about malformed rows and skip them.
    pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
    if pandas_version >= (1, 3):
        bad_lines_kwargs = {'on_bad_lines': 'warn'}
    else:
        bad_lines_kwargs = {'error_bad_lines': False}

    # Create a pandas DataFrame out of the CSV string.
    df = pd.read_csv(
        StringIO(csv_string), sep=sep, decimal=decimal,
        encoding=encoding, usecols=['sequence'], **bad_lines_kwargs
    )

    # Write the CSV into a Kubeflow Pipelines artifact.
    df.to_csv(output_csv, index=True, header=True)


def preprocess_data(csv_path: comp.InputPath('CSV'), sequence_json: comp.OutputPath()):
    """Parse the 'sequence' column of the CSV artifact and store it as JSON.

    Rows whose 'sequence' value is missing or the literal string '[]' are dropped.
    Every remaining value is parsed from its Python-literal text form.

    Args:
        csv_path: Local path of the input CSV artifact; must contain a 'sequence' column.
        sequence_json: Local path of the output artifact, written as a JSON list.

    Raises:
        ValueError, SyntaxError: If a 'sequence' value is not a valid Python literal.
    """
    # Imports required for the Pipeline Component.
    import ast
    import pandas as pd
    import numpy as np
    import json

    # Read from the artifact CSV.
    df = pd.read_csv(csv_path)

    # Preprocess the dataset: treat empty lists as missing and drop them.
    raw_sequences = df['sequence'].replace('[]', np.nan).dropna()
    df = None  # Release the DataFrame; only the filtered column is needed from here on.

    # SECURITY: these strings come from a file downloaded from S3. The original
    # code used eval(), which would execute arbitrary code embedded in the data.
    # ast.literal_eval only accepts Python literals (lists, numbers, strings, ...).
    sequences = [ast.literal_eval(sequence) for sequence in raw_sequences]

    # Write the preprocessed data into an artifact.
    with open(sequence_json, 'w') as f:
        json.dump(sequences, f)


def model_training(sequence_json: comp.InputPath(), model_art: comp.OutputBinaryFile(bytes)):
    """Train the single-layer network on the first 9000 sequences and persist it.

    Each sequence contributes its last element as the label and the preceding
    elements as features; sequences whose label is not greater than -1 are skipped.
    The resulting checkpoint is written to the output artifact, to a local
    'checkpoint.pth' file, and uploaded to the 'datakflow' bucket.

    Args:
        sequence_json: Local path of the JSON artifact holding the sequences.
        model_art: Binary file object of the output artifact that receives the checkpoint.
    """
    # Imports required for the Pipeline Component.
    import torch
    import json
    import boto3

    # Load the preprocessed sequences from the artifact.
    with open(sequence_json, 'r') as json_file:
        sequences = json.load(json_file)

    # Build feature / label tensors from the labelled part of the training split.
    train_size = 9000
    labelled = [seq for seq in sequences[:train_size] if seq[-1] > -1]
    X_train = torch.Tensor([seq[:-1] for seq in labelled])
    y_train = torch.Tensor([seq[-1] for seq in labelled]).long()

    class LogDataset(torch.utils.data.Dataset):
        """Pairs every feature row with its label."""

        def __init__(self, X, y):
            self.sequences = X
            self.labels = y

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            return self.sequences[idx], self.labels[idx]

    # Hyperparameters.
    learning_rate = 2e-3
    batch_size = 160
    epochs = 13

    train_dataloader = torch.utils.data.DataLoader(
        LogDataset(X_train, y_train), batch_size=batch_size
    )

    class SimpleNN(torch.nn.Module):
        """Single-layer A.N.N.: one fully connected layer."""

        def __init__(self, input_size, num_keys):
            super().__init__()
            self.fc = torch.nn.Linear(input_size, num_keys)

        def forward(self, x):
            return self.fc(x)

    def train_loop(dataloader, model, loss_fn, optimizer):
        """Run one pass over the dataloader, logging the loss every 100 batches."""
        size = len(dataloader.dataset)
        for batch, (X, y) in enumerate(dataloader):
            loss = loss_fn(model(X), y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 != 0:
                continue
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    # Loss, model and optimizer.
    loss_fn = torch.nn.CrossEntropyLoss()
    model = SimpleNN(10, 11)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # One train_loop call per epoch.
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer)
        print('Done!')

    # Persist the weights: first into the artifact, then into a local file.
    checkpoint = {'state_dict': model.state_dict()}
    torch.save(checkpoint, model_art)
    torch.save(checkpoint, 'checkpoint.pth')

    # Push the local checkpoint file to S3.
    path_bucket = 'datakflow'
    path_to_move_file = ''

    s3_client = boto3.client('s3')
    s3_client.upload_file('checkpoint.pth', path_bucket, 'checkpoint.pth')


def model_evaluating(
    sequence_json: comp.InputPath(), model_art: comp.InputBinaryFile(bytes),
    arr_output_path: comp.OutputPath(str), mlpipeline_ui_metadata_path: comp.OutputPath()
):
    """Evaluate the trained model on the validation split and export a confusion matrix.

    The validation split is every sequence after the first 9000. The last element
    of a sequence is its label and the preceding elements are its features;
    sequences whose label is not greater than -1 are skipped.

    Args:
        sequence_json: Local path of the JSON artifact holding the sequences.
        model_art: Binary file object of the checkpoint artifact written by training.
        arr_output_path: Local path that receives the confusion matrix as text.
        mlpipeline_ui_metadata_path: Local path that receives the UI metadata JSON
            describing the confusion matrix.

    Raises:
        ValueError: If the validation split contains no labelled sequences.
    """
    # Imports required for the Pipeline Component.
    import pandas as pd
    import torch
    import json

    from sklearn.metrics import confusion_matrix

    # Reading the preprocessed data from the artifact.
    with open(sequence_json, 'r') as f:
        sequences = json.load(f)

    # Setting up Dataset and DataLoader for torch model.
    train_size = 9000
    valid_sequences = [sequence for sequence in sequences[train_size:] if sequence[-1] > -1]
    if not valid_sequences:
        # Fail early with a clear message instead of a ZeroDivisionError when averaging.
        raise ValueError(
            f"No labelled validation sequences found after the first {train_size} entries."
        )
    X_valid = torch.Tensor([sequence[:-1] for sequence in valid_sequences])
    y_valid = torch.Tensor([sequence[-1] for sequence in valid_sequences]).long()

    # Dataset class.
    class LogDataset(torch.utils.data.Dataset):
        def __init__(self, X, y):
            self.sequences = X
            self.labels = y

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            sequence = self.sequences[idx]
            label = self.labels[idx]
            return sequence, label

    valid_dataset = LogDataset(X_valid, y_valid)

    # Hyperparameters.
    batch_size = 64

    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

    # Inference loop over batches of dataset.
    def test_loop(dataloader, model, loss_fn):
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        test_loss, correct = 0, 0

        targets, preds = [], []
        with torch.no_grad():
            for X, y in dataloader:
                pred = model(X)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
                preds += pred.argmax(1).tolist()
                targets += y.tolist()

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

        # Use one explicit, sorted label list for both the matrix and its labelling.
        # The matrix rows/columns follow the sorted union of targets and predictions,
        # so labelling them with the targets in first-appearance order would attach
        # wrong class names and fail for classes that were predicted but never seen.
        labels = sorted(set(targets) | set(preds))
        confusion_matrix_arr = confusion_matrix(targets, preds, labels=labels)
        with open(arr_output_path, 'w') as f:
            f.write(str(confusion_matrix_arr))

        data = [
            {
                'target': labels[i],
                'predicted': labels[j],
                'count': int(confusion_matrix_arr[i][j]),
            }
            for i in range(len(labels))
            for j in range(len(labels))
        ]
        df = pd.DataFrame(data)
        # NOTE(review): the inline CSV keeps its header row, as before - confirm the
        # Kubeflow UI does not treat that row as data.
        source = df.to_csv(index=False)
        metadata = {
            'outputs': [{
                'type': 'confusion_matrix',
                'format': 'csv',
                'schema': [
                    {'name': 'target', 'type': 'CATEGORY'},
                    {'name': 'predicted', 'type': 'CATEGORY'},
                    {'name': 'count', 'type': 'NUMBER'},
                ],
                'storage': 'inline',
                'source': source,
                'labels': list(map(str, labels)),
            }]
        }

        with open(mlpipeline_ui_metadata_path, 'w') as f:
            json.dump(metadata, f)

    # Model class: single-layer A.N.N.
    class SimpleNN(torch.nn.Module):
        def __init__(self, input_size, num_keys):
            super(SimpleNN, self).__init__()
            self.fc = torch.nn.Linear(input_size, num_keys)

        def forward(self, x):
            out = self.fc(x)
            return out

    # Loading the model from the artifact.
    checkpoint = torch.load(model_art)['state_dict']
    model = SimpleNN(10, 11)
    model.load_state_dict(checkpoint)
    model.eval()

    # Setting up the inference loop.
    loss_fn = torch.nn.CrossEntropyLoss()

    # The inference loop.
    test_loop(valid_dataloader, model, loss_fn)


# Container image used by the pods that run each Component.
base_img = "sent2020/kflow1:latest"

# Wrap each of the functions above into a Kubeflow Pipelines component.
unzip_data_op = comp.create_component_from_func(unzip_data, base_image=base_img)
read_data_op = comp.create_component_from_func(read_data, base_image=base_img)
preprocess_data_op = comp.create_component_from_func(preprocess_data, base_image=base_img)
model_training_op = comp.create_component_from_func(model_training, base_image=base_img)
# The evaluation component additionally installs scikit-learn when it starts.
model_evaluating_op = comp.create_component_from_func(
    model_evaluating,
    base_image=base_img,
    packages_to_install=['scikit-learn'],
)

# Assemble the components created above into a single pipeline.
@kfp.dsl.pipeline(
    description='Trains a single-layer A.N.N. to find anomalies in string sequences',
    name='single-layer-ann-training-pipeline',
)
def unzip_and_read_pipeline(
    bucket_zipfile_path: str, bucket_name: str,
    sep: str, decimal: str, encoding: str
):
    # Each step consumes the artifact produced by an earlier step.
    unzip_task = unzip_data_op(bucket_zipfile_path)
    extracted_key = unzip_task.outputs['output_str']

    read_task = read_data_op(bucket_name, extracted_key, sep, decimal, encoding)
    preprocess_task = preprocess_data_op(read_task.outputs['output_csv'])

    # Training and evaluation both read the same preprocessed sequences.
    sequences_artifact = preprocess_task.outputs['sequence_json']
    training_task = model_training_op(sequences_artifact)
    evaluation_task = model_evaluating_op(sequences_artifact, training_task.outputs['model_art'])

Overwriting ./single_layer_ann_training_pipeline.py


In [13]:
%%sh
dsl-compile --py single_layer_ann_training_pipeline.py --output single_layer_ann_training_pipeline.yaml
# Compile the pipeline Python file written by the %%writefile cell into a YAML file.