# SpecTrain Preprocessing
Review the data to be preprocessed.
There are two types of input data:
1. Clinical features (obtained by current routine methods)
2. NMR features (the spectra file and the measured outputs)


In [2]:
import os
import io

import gcsfs
from google.cloud import bigquery, storage


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    RNN,
    Bidirectional,
    Conv1D,
    Dense,
    MaxPool1D,
    Reshape,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# To plot pretty figures: render matplotlib output inline in the notebook and
# set the font sizes used for axis labels and tick labels.
%matplotlib inline
mpl.rc("axes", labelsize=14)
mpl.rc("xtick", labelsize=12)
mpl.rc("ytick", labelsize=12)

# For reproducible results: fix the NumPy seed (1) and the TensorFlow seed (2).
from numpy.random import seed

seed(1)
tf.random.set_seed(2)

In [3]:
PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
%env PROJECT = {PROJECT}
%env BUCKET = spectrain
%env REGION = "us-central1"

env: PROJECT=qwiklabs-asl-00-c812c3b423f2
env: BUCKET=spectrain
env: REGION="us-central1"


In [4]:
# Read the kidney-transplant table from GCS and, in the same expression, keep
# only the columns whose names do NOT begin with one of the excluded prefixes
# (the regex is a negative look-ahead anchored at the start of the column name).
df = pd.read_csv(
    'gs://qwiklabs-asl-00-c812c3b423f2/spec_train_output/input/Kidney_TX_data.csv'
).filter(regex=r'^(?!LS|Banff|Biopsy|Source|Spectrum|Patient.S|Nmr)')
df.head()

Unnamed: 0,Patient.ID,Patient.Age.at.Biopsy,Patient.Age.at.TX,Case,Sex,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,alanine,citrate,dimethylamine,lactate,Diabetes,Hypertension,UA.Pro,UA.Hb
0,bhdLeIiEnep6TPu8qeostZ8o(,55,55,0,male,2.15,0.027913,0.039593,0.012358,3.036788,1.038175,0.040761,0.232653,8.653069,False,False,,
1,8agYxL3U(2m0pcw^vomp*Yu9x,56,55,0,female,0.9,0.012763,0.113517,0.040703,5.663656,1.025761,0.105713,0.316157,6.109724,False,True,False,False
2,l849CAs#0wR1i(EqkyLtIxwZS,58,58,0,male,1.2,0.122959,0.283221,0.172492,7.417329,1.070371,0.231169,0.262318,17.866997,False,False,False,False
3,lw*R@N7LoSHUuxTIxTs$PWzfg,47,45,0,male,1.13,,,,,,,,,True,True,,
4,4oAgnWhMrp$h@B1*Um*PYowCS,22,22,1,female,1.67,0.01828,0.041485,0.010831,3.556433,1.029425,0.010481,0.356004,6.239394,False,False,True,False


## Standard Clinical Features to Engineer
Initial modeling found serum creatinine, urine protein, age and sex to be useful factors.
- Urine protein (`UA.Pro`) is a logical (True/False) feature
- Serum creatinine should be crossed with age and sex using the CKD-EPI equation
- Time since transplant should be calculated based on the difference between the age at biopsy and age at transplant
- Time since transplant could be bucketed into <1yr, 1 to 3yr and >3yr
- eGFR should be bucketed at <60 vs. >60

In [5]:
# CKD-EPI creatinine equation, evaluated for a single patient record
def calculate_eGFR(row):
    """Estimate the glomerular filtration rate (eGFR) for one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'Sex',
            'serum_creatinine' and 'Patient.Age.at.Biopsy'.

    Returns:
        142 * min(Scr/kappa, 1)**alpha * max(Scr/kappa, 1)**-1.2
            * 0.9938**age * beta,
        where (kappa, alpha, beta) depend on 'Sex'. Any value other than the
        exact string 'male' selects the second coefficient set.
    """
    # (kappa, alpha, beta) per sex
    male_coefficients = (0.9, -0.302, 1.0)
    non_male_coefficients = (0.7, -0.241, 1.012)
    if row['Sex'] == 'male':
        kappa, alpha, beta = male_coefficients
    else:
        kappa, alpha, beta = non_male_coefficients

    creatinine_ratio = row['serum_creatinine'] / kappa
    low_term = min(creatinine_ratio, 1) ** alpha
    high_term = max(creatinine_ratio, 1) ** (-1.2)
    age_term = 0.9938 ** row['Patient.Age.at.Biopsy']

    # Same left-to-right multiplication order as the textbook formula.
    return 142 * low_term * high_term * age_term * beta

# Hand every patient record to calculate_eGFR (one call per row) and store the
# resulting estimates as the new 'eGFR' column.
df['eGFR'] = df.apply(calculate_eGFR, axis='columns')

# Time since transplant, derived from the two age fields of a record
def calculate_time(row):
    """Return the age at biopsy minus the age at transplant.

    The result is in whatever unit the two age fields use.
    """
    age_at_biopsy = row['Patient.Age.at.Biopsy']
    age_at_transplant = row['Patient.Age.at.TX']
    return age_at_biopsy - age_at_transplant

# calculate_time only subtracts one column from another, so passing the whole
# frame computes every row in a single vectorized operation instead of making
# one Python-level call per row via df.apply(..., axis=1).
df['time.TX'] = calculate_time(df)
df.describe()

Unnamed: 0,Patient.Age.at.Biopsy,Patient.Age.at.TX,Case,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,alanine,citrate,dimethylamine,lactate,eGFR,time.TX
count,1474.0,1474.0,1474.0,1474.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1474.0,1474.0
mean,53.753731,51.158073,0.297151,1.764743,0.073352,0.151719,0.057783,7.207816,1.082696,0.141861,0.371615,29.060735,51.736133,2.595658
std,13.964644,14.496205,0.457159,1.03814,0.078465,0.115962,0.048519,2.641276,0.077893,0.153911,0.129915,59.955553,22.322959,3.744288
min,18.0,2.0,0.0,0.4068,0.004596,0.013141,0.002438,0.47995,1.0,0.003193,0.061405,1.687463,2.996631,0.0
25%,43.0,41.0,0.0,1.1865,0.025824,0.079561,0.023641,5.348111,1.043094,0.029498,0.301052,10.225248,35.766417,1.0
50%,55.0,53.0,0.0,1.4908,0.048538,0.121817,0.042469,6.957964,1.063598,0.092649,0.347711,17.013275,50.374009,1.0
75%,65.0,62.0,1.0,1.97,0.086982,0.187649,0.07663,8.786015,1.092219,0.200019,0.406126,27.661886,64.922595,2.0
max,86.0,83.0,1.0,14.24,0.744762,1.697553,0.342581,22.924218,2.369717,1.2025,2.034652,1583.214971,126.173805,34.0


In [7]:
def get_blob(blobs):
    """Lazily yield each item of `blobs`, one at a time, in iteration order."""
    yield from blobs

def write_png_to_gcs(blob, bucket):
    """Render one spectrum blob as a PNG line plot and upload it to `bucket`.

    The blob is downloaded and parsed as a headerless, whitespace-separated
    two-column table (x, y). Rows with x outside [0.1, 9.3] are dropped, the
    remaining points are drawn as a single line with the axes hidden, and the
    PNG is uploaded as `<image_dir><stem>_nmr.png`, where `<stem>` is the
    blob's file name (without any directory part) up to its first '.'.

    Args:
        blob: Source object; must provide `.name` and `.download_as_bytes()`.
        bucket: Destination bucket; must provide `.blob(name)`.

    Returns:
        None.

    Note:
        Reads the notebook-level global `image_dir` (destination prefix),
        which must be defined before this function is called.
    """
    print(blob.name)
    #NOTE: important to use pyplot instantiation this way to ensure no memory leaks
    # (figure number 1 is reused and cleared instead of creating a new figure per call)
    fig = plt.figure(num=1, figsize=(480, 48), clear=True)

    raw_bytes = blob.download_as_bytes()
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    # Regex separators are only supported by the python parser engine, so it is
    # requested explicitly rather than relying on pandas' warning-emitting fallback.
    spectrum = pd.read_csv(io.BytesIO(raw_bytes), sep=r'\s', header=None, engine='python')
    spectrum.columns = ['x_axis', 'y_axis']
    in_window = (spectrum['x_axis'] >= 0.1) & (spectrum['x_axis'] <= 9.3)
    spectrum = spectrum[in_window]

    ax = fig.add_subplot()
    ax.plot(spectrum['x_axis'], spectrum['y_axis'])
    ax.axis('off')

    # Take the last path component (works for any directory depth, including
    # none), then drop everything from the first '.' onwards.
    stem = blob.name.rsplit('/', 1)[-1].split('.')[0]

    # The context manager releases the PNG buffer even if saving or uploading fails.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png')
        upload_blob = bucket.blob(image_dir + stem + '_nmr.png')
        # rewind=True seeks the buffer back to the start before uploading
        upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)

    return

In [None]:
# Destination prefix (inside the bucket) for the rendered PNGs;
# write_png_to_gcs reads this global.
image_dir='spec_train_output/processed_images/'

# Bucket and object prefix of the input files, named once so the two client
# calls below cannot drift apart.
SOURCE_BUCKET = 'spectrain'
SOURCE_PREFIX = 'Kidney_TX_Data'

# Optional cap on how many images are rendered in this run. Leave as None to
# process everything; set a small integer (e.g. 3) for a quick trial run.
MAX_IMAGES = None

# initialize the GCS client
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket(SOURCE_BUCKET)

# Note: Client.list_blobs requires at least package version 1.17.0.
blobs = storage_client.list_blobs(SOURCE_BUCKET, prefix=SOURCE_PREFIX)

i = 0 # number of images rendered so far

# Note: The call returns a response only when the iterator is consumed.
for blob in get_blob(blobs):
    if MAX_IMAGES is not None and i >= MAX_IMAGES:
        break
    # Only objects whose name contains "output" are rendered.
    if "output" in blob.name:
        write_png_to_gcs(blob, bucket)
        i += 1


In [181]:
# Create data split column for images data import file
# NOTE(review): the image URIs built here point at spec_train_output/images/,
# while the rendering cell uploads under spec_train_output/processed_images/ --
# confirm which location the labels file actually refers to.
data_split_df=pd.read_csv('gs://spectrain/Kidney_TX_data_with_split.csv')
image_paths_df=pd.read_csv('gs://spectrain/spec_train_output/image_dir_paths_labels1.csv', header=None)
image_paths_df.columns=['Spectrum_file_new', 'Case']

# Rebuild each record's image URI: spectrum file name up to its first '.',
# plus the '_nmr.png' suffix. A one-character pattern is split literally by
# pandas, which avoids the invalid '\.' escape sequence of a non-raw string.
spectrum_stem = data_split_df['Spectrum_file'].str.split('.').str[0]
data_split_df['Spectrum_file_new'] = "gs://spectrain/spec_train_output/images/" + spectrum_stem + '_nmr.png'
data_split_df = data_split_df[['Spectrum_file_new', 'data_split']]

# Inner join: only image paths that also have a split assignment are kept.
image_paths_df=pd.merge(image_paths_df, data_split_df, on=['Spectrum_file_new'], how='inner')
image_paths_df=image_paths_df[['data_split', 'Spectrum_file_new', 'Case']]

# Rename the split labels; values not listed here (e.g. 'TEST') are left unchanged.
image_paths_df['data_split'] = image_paths_df['data_split'].replace({'TRAIN':'TRAINING', 'VALID':'VALIDATION'})

# header=False (rather than None) is the documented way to omit the header row.
image_paths_df.to_csv('gs://spectrain/spec_train_output/image_dir_paths_labels_with_split.csv', header=False, index=False)