## Load needed libraries

In [67]:
import matplotlib.pyplot as plt
import pandas as pd

import os
import io

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

import tensorflow as tf

from google.cloud import bigquery, storage
from google.oauth2 import credentials # NOTE this is for future adaption for MCC deployed solution using creds

### Define the functions to be used for processing and persisting data

The get_blobs function is a generator for iterating over the GCS bucket objects

The write_png_to_gcs aids with the creation of the PNG files in GCS from the original signals spectra files

The write_df_to_gcs is used to create the enriched CSV file from the provided dataframe object

In [2]:
# Generator function for iterating over the blobs in a GCS bucket
def get_blob(blobs):
    for blob in blobs:
        yield blob

# Function to write the PNG file based on the spectra data plot (300x40) 
# to a png image file in the provided GCS 'bucket' at the specified 'image_dir' path        
def write_png_to_gcs(blob, image_dir, bucket):
    print(blob.name)
    #NOTE: important to use pyplot instantiation this way to ensure no memory leaks
    fig = plt.figure(num=1,figsize=(300,40), clear=True)
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data), sep='\s', header=None)
    # df=pd.read_csv(blob.name, sep='\s', header=None)
    # filename=filename.split('.')[0]
    df.columns=['x_axis', 'y_axis']
    ax = fig.add_subplot()
    ax.plot(df['x_axis'], df['y_axis'])
    ax.axis('off')

    buf = io.BytesIO()
    fig.savefig(buf, format='png')

    filename = blob.name.split('.')[0] # remove the suffix/file extension
    filename = filename.split('/')[1] # remove the containing directory name from filename

    upload_blob = bucket.blob(image_dir+filename+'_nmr.png')
    upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)

    # plt.close()
    buf.close()
    del df
    # del plt
    del data
    del upload_blob
    
    return

# Write DataFrame content to the specified 'file_path' in the given GCS 'bucket'
def write_df_to_gcs(df, file_path, bucket):
    upload_blob = bucket.blob(file_path)
    upload_blob.upload_from_string(df.to_csv(), content_type='text/csv')
    
    del upload_blob
    
    return

### Data enrichment functions
Define the enrichment functions to calculate eGFR and time.TX values for each row of our dataframe

In [28]:
# NOTE: Carried over from Jeff's notebook here for pre-processing and feature engg
# Define the CKD-EPI equation function
def calculate_eGFR(row):
    if row['Sex'] == 'male':
        kappa = 0.9
        alpha = -0.302
        beta = 1.0
    else:
        kappa = 0.7
        alpha = -0.241
        beta = 1.012

    eGFR = 142 * min(row['serum_creatinine'] / kappa, 1)**alpha * \
           max(row['serum_creatinine'] / kappa, 1)**(-1.2) * \
           0.9938**row['Patient.Age.at.Biopsy'] * beta
    return eGFR

#Calculate the age difference (in years) between the Biopsy and the transplant
def calculate_time(row):
    return row['Patient.Age.at.Biopsy'] - row['Patient.Age.at.TX']



### Data processing functions
The following functions would be used for:  
    1. <b>create_input_layers()</b> : Create a dictionary of Keres input layers for each feature  
    2. <b>transform(inputs)</b> : Create a dictionary of transformed input tensors   
    3. <b>df_to_dataset(dataframe, shuffle, batch_size) </b> : Create a tf.data dataset from a Pandas dataframe  

In [None]:
def create_input_layers():
    """Creates dictionary of input layers for each feature.

    Returns:
        Dictionary of `tf.Keras.layers.Input` layers for each feature.
    """
    inputs = {
        colname: tf.keras.layers.Input(
            name=colname, shape=(1,), dtype="float32"
        )
        for colname in NUMERICAL_COLUMNS
    }

    inputs.update(
        {
            colname: tf.keras.layers.Input(
                name=colname, shape=(1,), dtype="string"
            )
            for colname in CATEGORICAL_COLUMNS
        }
    )

    return inputs

def transform(inputs):
    """Creates dictionary of transformed inputs.

    Returns:
        Dictionary of transformed Tensors
    """

    transformed = {}
    for numerical_column in NUMERICAL_COLUMNS:
        transformed[numerical_column] = inputs[numerical_column]

    vocab = {
        "Sex": ["male", "female", "unknown"],
        "Diabetes": ["True", "False"],
        "Hypertension": ["True", "False"],
        "UA.Pro": ["True", "False", "NaN"],
        "UA.Hb": ["True", "False", "NaN"],
    }

    for categorical_column in CATEGORICAL_COLUMNS:
        transformed[categorical_column] = tf.keras.layers.StringLookup(
            vocabulary=vocab[categorical_column], output_mode="one_hot"
        )(inputs[categorical_column])

    return transformed

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop("median_house_value")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [4]:
## NOTE: This is commented out as the images (png files) for the spectra are already generated now
# image_dir='spec_train_output/images/'

# initialize the GCS client
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket('spectrain')

## NOTE: This is commented out as the images (png files) for the spectra are already generated now
# # Note: Client.list_blobs requires at least package version 1.17.0.
# blobs = storage_client.list_blobs('spectrain', prefix='Kidney_TX_Data')

# i = 0 # counter to use for breaking

# # Note: The call returns a response only when the iterator is consumed.
# for blob in get_blob(blobs):
#     if("output" in blob.name):
#         write_png_to_gcs(blob, image_dir, bucket)
#         # i = i+1
#         # if(i == 3):
#         #     break;

df=pd.read_csv('gs://spectrain/Kidney_TX_Data/Kidney_TX_data.csv')
df = df.filter(regex=r'^(?!LS|Banff|Biopsy|Source|Patient.S|Nmr)')

# Apply the calculate_eGFR function to create the 'eGFR' column
df['eGFR'] = df.apply(calculate_eGFR, axis=1)
# Apply the calculate_time function to create the 'time.TX' column
df['time.TX'] = df.apply(calculate_time, axis=1)

write_df_to_gcs(df, 'Kidney_TX_enriched_data.csv', bucket)

df.head()

Unnamed: 0,Patient.ID,Patient.Age.at.Biopsy,Patient.Age.at.TX,Case,Sex,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,...,citrate,dimethylamine,lactate,Diabetes,Hypertension,UA.Pro,UA.Hb,Spectrum_file,eGFR,time.TX
0,bhdLeIiEnep6TPu8qeostZ8o(,55,55,0,male,2.15,0.027913,0.039593,0.012358,3.036788,...,0.040761,0.232653,8.653069,False,False,,,output_NormalizationTool_spectrum_zgpr30-urine...,35.472989,0
1,8agYxL3U(2m0pcw^vomp*Yu9x,56,55,0,female,0.9,0.012763,0.113517,0.040703,5.663656,...,0.105713,0.316157,6.109724,False,True,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,75.030712,1
2,l849CAs#0wR1i(EqkyLtIxwZS,58,58,0,male,1.2,0.122959,0.283221,0.172492,7.417329,...,0.231169,0.262318,17.866997,False,False,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,70.097644,0
3,lw*R@N7LoSHUuxTIxTs$PWzfg,47,45,0,male,1.13,,,,,...,,,,True,True,,,output_NormalizationTool_spectrum_zgpr30-urine...,80.674794,2
4,4oAgnWhMrp$h@B1*Um*PYowCS,22,22,1,female,1.67,0.01828,0.041485,0.010831,3.556433,...,0.010481,0.356004,6.239394,False,False,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,44.147343,0


### Define a Simple LogisticRegression model for our tabular features and label (Case)

We will use the dataframe 'df' formed above to form a simple LogisticRegression model and make some
predictions and evaluate against the testing data and report that

In [99]:
df=pd.read_csv('gs://spectrain/Kidney_TX_enriched_data.csv')
df = df.dropna()

X = df[["serum_creatinine","Sex","hippurate","phenylacetylglutamine",
        "trigonellin","urea","citrate","dimethylamine","lactate",
        "Diabetes","Hypertension","UA.Pro","UA.Hb","eGFR","time.TX"]]
y = df["Case"]


NUMERICAL_COLUMNS = ["serum_creatinine", "hippurate", "phenylacetylglutamine", "trigonellin",
                    "urea", "citrate", "dimethylamine", "lactate", "eGFR", "time.TX"]
CATEGORICAL_COLUMNS = ["Sex", "Diabetes", "Hypertension", "UA.Pro", "UA.Hb"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

# # pipeline for categorical columns
# cat_pipe = make_pipeline(
#     SimpleImputer(strategy='constant', fill_value='N/A'),
#     OneHotEncoder(handle_unknown='ignore', sparse=False)
# )

# combine both the pipelines
# full_pipe = ColumnTransformer([
#     ('num', num_pipe, num_cols),
#     ('cat', cat_pipe, cat_cols)
# ])

# # Create input layer
# inputs = create_input_layers()

# # transform
# transformed = transform(inputs)

logreg = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("simple", SimpleImputer(strategy='median'), NUMERICAL_COLUMNS),
            ("ohe", OneHotEncoder(sparse=False), CATEGORICAL_COLUMNS),
            ("scale", StandardScaler(with_mean=True), NUMERICAL_COLUMNS),
        ],
        remainder="passthrough",
    ),
    LogisticRegression(max_iter=10000, random_state=42),
)

# train the model
logreg.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simple',
                                                  SimpleImputer(strategy='median'),
                                                  ['serum_creatinine',
                                                   'hippurate',
                                                   'phenylacetylglutamine',
                                                   'trigonellin', 'urea',
                                                   'citrate', 'dimethylamine',
                                                   'lactate', 'eGFR',
                                                   'time.TX']),
                                                 ('ohe',
                                                  OneHotEncoder(sparse=False),
                                                  ['Sex', 'Diabetes',
                                                   'Hy

Now predict the outcomes for the testing data

In [103]:
predictions = logreg.predict(X_test)

Finally, show the classification report

In [104]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.73      0.96      0.83       197
           1       0.67      0.17      0.27        84

    accuracy                           0.73       281
   macro avg       0.70      0.57      0.55       281
weighted avg       0.71      0.73      0.66       281



### Trying a Keras based deep learning model with a neural network of 3 layers

First dense layer of 16, second dense layer of 8 and the last layer using sigmoid activation. 
The first 2 dense layers would use ReLU as activation function.

In [None]:
# NOTE: input_shape is 20 instead of 15 due to the new columns that will be
# introduced by the OneHotEncoder column transformers for the 5 categorical columns
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(20,)),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
	tf.keras.layers.Dense(8, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['AUC'])

# NOTE: cannot use the following construct as unable to use .fit with support for epochs and batch_size params
# dnn = make_pipeline(
#     ColumnTransformer(
#         transformers=[
#             ("simple", SimpleImputer(strategy='median'), NUMERICAL_COLUMNS),
#             ("ohe", OneHotEncoder(sparse=False), CATEGORICAL_COLUMNS),
#             ("scale", StandardScaler(with_mean=True), NUMERICAL_COLUMNS),
#         ],
#         remainder="passthrough",
#     ),
#     model,
# )

# [print(i.shape, i.dtype) for i in model.inputs]
# [print(o.shape, o.dtype) for o in model.outputs]
# [print(l.name, l.input_shape, l.dtype) for l in model.layers]
print(type(X_train))

ct = ColumnTransformer(
        transformers=[
            ("simple", SimpleImputer(strategy='median'), NUMERICAL_COLUMNS),
            ("ohe", OneHotEncoder(sparse=False), CATEGORICAL_COLUMNS),
            ("scale", StandardScaler(with_mean=True), NUMERICAL_COLUMNS),
        ],
        remainder="passthrough",
    )

# dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# train_data = dataset.shuffle(len(X_train)).batch(32)
# train_data = train_data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
 
# valid_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# model.fit(train_data, epochs=200, batch_size=4, validation_data=valid_ds)

model.fit(X_train, y_train, epochs=200, batch_size=4)

# test_loss, test_acc = model.evaluate(X_test, y_test)
# print(f"Test Loss: {test_loss} and test accuracy: {test_acc}")

# NOTE: For printing loss and AUC
# test_loss, test_auc = model.evaluate(X_test, y_test)
# print(f"Test Loss: {test_loss} and test AUC: {test_auc}")

# Sample output with 1000 epochs and 32 batch size (0.91 for the loss is less than ideal :( )
# Test Loss: 0.9106149673461914 and test accuracy: 0.7298578023910522

#TODO perhaps try loss, mse instead of loss, accuracy?