## Load needed libraries

In [34]:
import matplotlib.pyplot as plt
import pandas as pd

import os
import io
import base64
import json

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

import tensorflow as tf

from google.cloud import bigquery, storage
from google.cloud import aiplatform
from google.oauth2 import credentials # NOTE this is for future adaption for MCC deployed solution using creds

### Define the functions to be used for processing and persisting data

The get_blobs function is a generator for iterating over the GCS bucket objects

The write_png_to_gcs aids with the creation of the PNG files in GCS from the original signals spectra files

The write_df_to_gcs is used to create the enriched CSV file from the provided dataframe object

In [21]:
# Generator function for iterating over the blobs in a GCS bucket
def get_blob(blobs):
    for blob in blobs:
        yield blob

# Function to write the PNG file based on the spectra data plot (300x40) 
# to a png image file in the provided GCS 'bucket' at the specified 'image_dir' path        
def write_png_to_gcs(blob, image_dir, bucket):
    print(blob.name)
    #NOTE: important to use pyplot instantiation this way to ensure no memory leaks
    fig = plt.figure(num=1,figsize=(300,40), clear=True)
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data), sep='\s', header=None)
    # df=pd.read_csv(blob.name, sep='\s', header=None)
    # filename=filename.split('.')[0]
    df.columns=['x_axis', 'y_axis']
    ax = fig.add_subplot()
    ax.plot(df['x_axis'], df['y_axis'])
    ax.axis('off')

    buf = io.BytesIO()
    fig.savefig(buf, format='png')

    filename = blob.name.split('.')[0] # remove the suffix/file extension
    filename = filename.split('/')[1] # remove the containing directory name from filename

    upload_blob = bucket.blob(image_dir+filename+'_nmr.png')
    upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)

    # plt.close()
    buf.close()
    del df
    # del plt
    del data
    del upload_blob
    
    return

# Write DataFrame content to the specified 'file_path' in the given GCS 'bucket'
def write_df_to_gcs(df, file_path, bucket):
    upload_blob = bucket.blob(file_path)
    upload_blob.upload_from_string(df.to_csv(), content_type='text/csv')
    
    del upload_blob
    
    return

### Data enrichment functions
Define the enrichment functions to calculate eGFR and time.TX values for each row of our dataframe

In [None]:
# NOTE: Carried over from Jeff's notebook here for pre-processing and feature engg
# Define the CKD-EPI equation function
def calculate_eGFR(row):
    if row['Sex'] == 'male':
        kappa = 0.9
        alpha = -0.302
        beta = 1.0
    else:
        kappa = 0.7
        alpha = -0.241
        beta = 1.012

    eGFR = 142 * min(row['serum_creatinine'] / kappa, 1)**alpha * \
           max(row['serum_creatinine'] / kappa, 1)**(-1.2) * \
           0.9938**row['Patient.Age.at.Biopsy'] * beta
    return eGFR

#Calculate the age difference (in years) between the Biopsy and the transplant
def calculate_time(row):
    return row['Patient.Age.at.Biopsy'] - row['Patient.Age.at.TX']



### Data processing functions
The following functions would be used for:  
    1. <b>create_input_layers()</b> : Create a dictionary of Keres input layers for each feature  
    2. <b>transform(inputs)</b> : Create a dictionary of transformed input tensors   
    3. <b>df_to_dataset(dataframe, shuffle, batch_size) </b> : Create a tf.data dataset from a Pandas dataframe  

In [None]:
def create_input_layers():
    """Creates dictionary of input layers for each feature.

    Returns:
        Dictionary of `tf.Keras.layers.Input` layers for each feature.
    """
    inputs = {
        colname: tf.keras.layers.Input(
            name=colname, shape=(1,), dtype="float32"
        )
        for colname in NUMERICAL_COLUMNS
    }

    inputs.update(
        {
            colname: tf.keras.layers.Input(
                name=colname, shape=(1,), dtype="string"
            )
            for colname in CATEGORICAL_COLUMNS
        }
    )

    return inputs

def transform(inputs):
    """Creates dictionary of transformed inputs.

    Returns:
        Dictionary of transformed Tensors
    """

    transformed = {}
    for numerical_column in NUMERICAL_COLUMNS:
        transformed[numerical_column] = inputs[numerical_column]

    vocab = {
        "Sex": ["male", "female", "unknown"],
        "Diabetes": ["True", "False"],
        "Hypertension": ["True", "False"],
        "UA.Pro": ["True", "False", "NaN"],
        "UA.Hb": ["True", "False", "NaN"],
    }

    for categorical_column in CATEGORICAL_COLUMNS:
        transformed[categorical_column] = tf.keras.layers.StringLookup(
            vocabulary=vocab[categorical_column], output_mode="one_hot"
        )(inputs[categorical_column])

    return transformed

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop("median_house_value")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [None]:
## NOTE: This is commented out as the images (png files) for the spectra are already generated now
# image_dir='spec_train_output/images/'

# initialize the GCS client
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket('spectrain')

## NOTE: This is commented out as the images (png files) for the spectra are already generated now
# # Note: Client.list_blobs requires at least package version 1.17.0.
# blobs = storage_client.list_blobs('spectrain', prefix='Kidney_TX_Data')

# i = 0 # counter to use for breaking

# # Note: The call returns a response only when the iterator is consumed.
# for blob in get_blob(blobs):
#     if("output" in blob.name):
#         write_png_to_gcs(blob, image_dir, bucket)
#         # i = i+1
#         # if(i == 3):
#         #     break;

df=pd.read_csv('gs://spectrain/Kidney_TX_Data/Kidney_TX_data.csv')
df = df.filter(regex=r'^(?!LS|Banff|Biopsy|Source|Patient.S|Nmr)')

# Apply the calculate_eGFR function to create the 'eGFR' column
df['eGFR'] = df.apply(calculate_eGFR, axis=1)
# Apply the calculate_time function to create the 'time.TX' column
df['time.TX'] = df.apply(calculate_time, axis=1)

write_df_to_gcs(df, 'Kidney_TX_enriched_data.csv', bucket)

df.head()

### Define a Simple LogisticRegression model for our tabular features and label (Case)

We will use the dataframe 'df' formed above to form a simple LogisticRegression model and make some
predictions and evaluate against the testing data and report that

In [3]:
df=pd.read_csv('gs://spectrain/Kidney_TX_enriched_data.csv')
df = df.dropna()

X = df[["serum_creatinine","Sex","hippurate","phenylacetylglutamine",
        "trigonellin","urea","citrate","dimethylamine","lactate",
        "Diabetes","Hypertension","UA.Pro","UA.Hb","eGFR","time.TX"]]
y = df["Case"]


NUMERICAL_COLUMNS = ["serum_creatinine", "hippurate", "phenylacetylglutamine", "trigonellin",
                    "urea", "citrate", "dimethylamine", "lactate", "eGFR", "time.TX"]
CATEGORICAL_COLUMNS = ["Sex", "Diabetes", "Hypertension", "UA.Pro", "UA.Hb"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

# # pipeline for categorical columns
# cat_pipe = make_pipeline(
#     SimpleImputer(strategy='constant', fill_value='N/A'),
#     OneHotEncoder(handle_unknown='ignore', sparse=False)
# )

# combine both the pipelines
# full_pipe = ColumnTransformer([
#     ('num', num_pipe, num_cols),
#     ('cat', cat_pipe, cat_cols)
# ])

# # Create input layer
# inputs = create_input_layers()

# # transform
# transformed = transform(inputs)

logreg = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("simple", SimpleImputer(strategy='median'), NUMERICAL_COLUMNS),
            ("ohe", OneHotEncoder(sparse=False), CATEGORICAL_COLUMNS),
            ("scale", StandardScaler(with_mean=True), NUMERICAL_COLUMNS),
        ],
        remainder="passthrough",
    ),
    LogisticRegression(max_iter=10000, random_state=42),
)

# train the model
logreg.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simple',
                                                  SimpleImputer(strategy='median'),
                                                  ['serum_creatinine',
                                                   'hippurate',
                                                   'phenylacetylglutamine',
                                                   'trigonellin', 'urea',
                                                   'citrate', 'dimethylamine',
                                                   'lactate', 'eGFR',
                                                   'time.TX']),
                                                 ('ohe',
                                                  OneHotEncoder(sparse=False),
                                                  ['Sex', 'Diabetes',
                                                   'Hy

Now predict the outcomes for the testing data

In [4]:
predictions = logreg.predict(X_test)

Finally, show the classification report

In [5]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.73      0.96      0.83       197
           1       0.67      0.17      0.27        84

    accuracy                           0.73       281
   macro avg       0.70      0.57      0.55       281
weighted avg       0.71      0.73      0.66       281



### Trying a Keras based deep learning model with a neural network of 3 layers

First dense layer of 16, second dense layer of 8 and the last layer using sigmoid activation. 
The first 2 dense layers would use ReLU as activation function.

In [10]:
#re-initialize the dataframe
df = pd.read_csv('gs://spectrain/Kidney_TX_enriched_data.csv')
df = df.dropna()

#TODO Try Dense layers with more neurons (200s) per suggestion from Benoit
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(15,)),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
	tf.keras.layers.Dense(8, activation=tf.nn.relu),
	# tf.keras.layers.Dense(4, activation=tf.nn.relu), # added one after the 3 layer
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['AUC'])

# NOTE: cannot use the following construct as unable to use .fit with support for epochs and batch_size params
# dnn = make_pipeline(
#     ColumnTransformer(
#         transformers=[
#             ("simple", SimpleImputer(strategy='median'), NUMERICAL_COLUMNS),
#             ("ohe", OneHotEncoder(sparse=False), CATEGORICAL_COLUMNS),
#             ("scale", StandardScaler(with_mean=True), NUMERICAL_COLUMNS),
#         ],
#         remainder="passthrough",
#     ),
#     model,
# )

# [print(i.shape, i.dtype) for i in model.inputs]
# [print(o.shape, o.dtype) for o in model.outputs]
# [print(l.name, l.input_shape, l.dtype) for l in model.layers]

data_dummy = pd.get_dummies(df[CATEGORICAL_COLUMNS], drop_first=True)

df = pd.concat([df, data_dummy], axis=1)

# drop the original categorical columns that have been one hot encoded
df = df.drop(CATEGORICAL_COLUMNS, axis=1)

# df = df.dropna()

X = df[["serum_creatinine","Sex_male","hippurate","phenylacetylglutamine",
        "trigonellin","urea","citrate","dimethylamine","lactate",
        "Diabetes_True","Hypertension_True","UA.Pro_True","UA.Hb_True","eGFR","time.TX"]]
y = df["Case"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

X_train.head()

# dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# train_data = dataset.shuffle(len(X_train)).batch(32)
# train_data = train_data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
# valid_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# model.fit(train_data, epochs=200, batch_size=4, validation_data=valid_ds)


model.fit(X_train, y_train, epochs=240, batch_size=6)

# test_loss, test_acc = model.evaluate(X_test, y_test)
# print(f"Test Loss: {test_loss} and test accuracy: {test_acc}")

# NOTE: For printing loss and AUC
test_loss, test_auc = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss} and test AUC: {test_auc}")

# Sample output with 1000 epochs and 32 batch size (0.91 for the loss is less than ideal :( )
# Test Loss: 0.9106149673461914 and test accuracy: 0.7298578023910522

#TODO perhaps try loss, mse instead of loss, accuracy?

Epoch 1/240
Epoch 2/240
Epoch 3/240
Epoch 4/240
Epoch 5/240
Epoch 6/240
Epoch 7/240
Epoch 8/240
Epoch 9/240
Epoch 10/240
Epoch 11/240
Epoch 12/240
Epoch 13/240
Epoch 14/240
Epoch 15/240
Epoch 16/240
Epoch 17/240
Epoch 18/240
Epoch 19/240
Epoch 20/240
Epoch 21/240
Epoch 22/240
Epoch 23/240
Epoch 24/240
Epoch 25/240
Epoch 26/240
Epoch 27/240
Epoch 28/240
Epoch 29/240
Epoch 30/240
Epoch 31/240
Epoch 32/240
Epoch 33/240
Epoch 34/240
Epoch 35/240
Epoch 36/240
Epoch 37/240
Epoch 38/240
Epoch 39/240
Epoch 40/240
Epoch 41/240
Epoch 42/240
Epoch 43/240
Epoch 44/240
Epoch 45/240
Epoch 46/240
Epoch 47/240
Epoch 48/240
Epoch 49/240
Epoch 50/240
Epoch 51/240
Epoch 52/240
Epoch 53/240
Epoch 54/240
Epoch 55/240
Epoch 56/240
Epoch 57/240
Epoch 58/240
Epoch 59/240
Epoch 60/240
Epoch 61/240
Epoch 62/240
Epoch 63/240
Epoch 64/240
Epoch 65/240
Epoch 66/240
Epoch 67/240
Epoch 68/240
Epoch 69/240
Epoch 70/240
Epoch 71/240
Epoch 72/240
Epoch 73/240
Epoch 74/240
Epoch 75/240
Epoch 76/240
Epoch 77/240
Epoch 78

### Validation results

With 240 epochs and 3 layers of 16, 8 and 1 (sigmoid) and batch size 8   
Epoch 240/240   
145/145 [==============================] - 0s 1ms/step - loss: 0.5399 - accuracy: 0.7476   
10/10 [==============================] - 0s 2ms/step - loss: 0.5679 - accuracy: 0.7266   
Test Loss: 0.5679342746734619 and test accuracy: 0.7266436219215393   
   
   
With 240 epochs and 4 layers of 16, 8, 4 and 1 (sigmoid) and batch size 8   
Epoch 240/240   
145/145 [==============================] - 0s 1ms/step - loss: 0.5415 - accuracy: 0.7415   
10/10 [==============================] - 0s 1ms/step - loss: 0.5757 - accuracy: 0.7370   
Test Loss: 0.5757291316986084 and test accuracy: 0.7370242476463318   

With 240 epochs and 4 layers of 16, 8, 4 and 1 (sigmoid) and batch size 6   
Epoch 240/240   
193/193 [==============================] - 0s 2ms/step - loss: 0.5291 - accuracy: 0.7476   
10/10 [==============================] - 0s 2ms/step - loss: 0.5814 - accuracy: 0.7405   
Test Loss: 0.5814240574836731 and test accuracy: 0.7404844164848328


### Upload the CSV based DNN model and deploy to Vertex AI  

Upload the DNN model created for the hyperparameter tuning in the 'spectrain_new/spectrain_csv_dnn/tuned_20230615_175922' GCS path and deploy it to Vertex AI

In [2]:
PROJECT = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT = PROJECT[0]
PROJECT

'qwiklabs-asl-00-c812c3b423f2'

In [6]:
REGION = "us-central1"
BUCKET = "spectrain_new"

MODEL_DISPLAYNAME = "spectrain_csv_dnn/tuned_20230615_175922/20230615181110"
# us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest"
)
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

uploaded_model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAYNAME,
    artifact_uri=f"gs://{BUCKET}/{MODEL_DISPLAYNAME}", # TODO: Your code here
    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI  # TODO: Your code here
)

Creating Model
Create Model backing LRO: projects/469700469475/locations/us-central1/models/1914199166423138304/operations/1252336253613899776
Model created. Resource name: projects/469700469475/locations/us-central1/models/1914199166423138304@1
To use this Model in another session:
model = aiplatform.Model('projects/469700469475/locations/us-central1/models/1914199166423138304@1')


### Deploy an endpoint for the uploaded model with a n1-standard-2 machine

In [7]:
MACHINE_TYPE = "n1-standard-2"

endpoint = uploaded_model.deploy(
    machine_type=MACHINE_TYPE,
    accelerator_type=None,
    accelerator_count=None,
)

Creating Endpoint
Create Endpoint backing LRO: projects/469700469475/locations/us-central1/endpoints/8892177144336089088/operations/5699640885642264576
Endpoint created. Resource name: projects/469700469475/locations/us-central1/endpoints/8892177144336089088
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/469700469475/locations/us-central1/endpoints/8892177144336089088')
Deploying model to Endpoint : projects/469700469475/locations/us-central1/endpoints/8892177144336089088
Deploy Endpoint model backing LRO: projects/469700469475/locations/us-central1/endpoints/8892177144336089088/operations/3074042302885265408
Endpoint model deployed. Resource name: projects/469700469475/locations/us-central1/endpoints/8892177144336089088


### Upload the images based CNN model and deploy to Vertex AI

NOTE: We need to replace the GCS path value with the right one for the CNN model  

Upload the CNN model created for the hyperparameter tuning in the 'spectrain_new/<REST_OF_PATH>' GCS path and deploy it to Vertex AI

In [8]:
REGION = "us-central1"
BUCKET = "spectrain_new"

MODEL_DISPLAYNAME = "spectrain_cnn" # TODO

# us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest"
)
os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

uploaded_model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAYNAME,
    artifact_uri=f"gs://{BUCKET}/{MODEL_DISPLAYNAME}", # TODO: Your code here
    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI  # TODO: Your code here
)

Creating Model
Create Model backing LRO: projects/469700469475/locations/us-central1/models/4203153677034192896/operations/4600762576563863552
Model created. Resource name: projects/469700469475/locations/us-central1/models/4203153677034192896@1
To use this Model in another session:
model = aiplatform.Model('projects/469700469475/locations/us-central1/models/4203153677034192896@1')


### Process all test images into a single JSON for batch prediction

Take the test image files from the 'bhavani/transformed_images' folder in the GCS bucket and write the contents as base64 encoded data to the JSON file used for batch predictions  
  
Name of batch prediction 'spectrain_cnn_batch_test' (against the test dataset images)

In [46]:
# def write_file(blob, f):
#     blob_data = blob.download_as_bytes()
    
#     data = {"data": base64.b64encode(blob_data).decode("utf-8")}
#     f.write(json.dumps(data) + "\n")

def write_file_to_gcs(image_file, f):
    # initialize the GCS client
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket('spectrain')

    # NOTE: This is commented out as the images (png files) for the spectra are already generated now
    # Note: Client.list_blobs requires at least package version 1.17.0.
    # spectrain_new/bhavani/transformed_images
    blobs = storage_client.list_blobs('spectrain_new', prefix='bhavani/transformed_images')

    i = 0 # counter to use for breaking

    # Note: The call returns a response only when the iterator is consumed.
    for blob in get_blob(blobs):
        blobname = blob.name.split('.')[0]
        blobname = blobname.split('/')[2]
        
        if(blobname == image_file):
            print(f"Writing image_file : {image_file} to JSON")
            blob_data = blob.download_as_bytes()
    
            data = {"data": base64.b64encode(blob_data).decode("utf-8")}
            f.write(json.dumps(data) + "\n")
            # write_file(blob, f)


### Write JSONL file with content as the list of GCS paths to a single image per line

Adjust and createt new function to make JSONL file with the list of the images as specified in the documentation here:  
https://cloud.google.com/vertex-ai/docs/image-data/classification/get-predictions

In [51]:
def write_filepath_to_gcs(image_file, f):
    # initialize the GCS client
    storage_client = storage.Client()

    # get the storage bucket
    bucket = storage_client.get_bucket('spectrain')

    # NOTE: This is commented out as the images (png files) for the spectra are already generated now
    # Note: Client.list_blobs requires at least package version 1.17.0.
    # spectrain_new/bhavani/transformed_images
    blobs = storage_client.list_blobs('spectrain_new', prefix='bhavani/transformed_images')

    i = 0 # counter to use for breaking

    # Note: The call returns a response only when the iterator is consumed.
    for blob in get_blob(blobs):
        blobname = blob.name.split('.')[0]
        blobname = blobname.split('/')[2]
        
        if(blobname == image_file):
            print(f"Writing image_file path : {blob} to JSON")
            # blob_data = blob.download_as_bytes()
    
            data = {"content": "gs://spectrain_new/"+blob.name, "mimeType": "image/png"}
            # print(json.dumps(data))
            f.write(json.dumps(data) + "\n")


In [None]:
# Location to use for CNN images JSON for batch predictions
# spectrain_new/bhavani/batch_predictions/IMG_CNN

# NOTE: use the transformed_images folder in 'bhavani'
# spectrain_new/bhavani/transformed_images


## IMP NOTE: Commenting this block as the JSON file for the test images was already 
# created by the execution of the underlying code
# NOTE determine the test images from the _split csv file
# spectrain_new/Kidney_TX_data_with_split.csv
# df1=pd.read_csv('gs://spectrain_new/Kidney_TX_data_with_split.csv')

# df1= df1[["Spectrum_file","data_split"]]

# df1 = df1.loc[df1["data_split"] == "TEST"]

# gcs_input_uri = "gs://spectrain_new/bhavani/batch_predictions/IMG_CNN/src/test_images_new.json"

# with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
#     for fl in df1["Spectrum_file"]:
#         fl_name = fl.split('.')[0] + "_nmr"
#         write_filepath_to_gcs(fl_name,f)
#         # break

# f.close()