# SpecTrain Preprocessing
Review the data to be preprocessed.
There are two types of input data:
1. Clinical features (obtained by current routine methods)
2. NMR features (the spectra file and the measured outputs)


In [42]:
import os
import io

import gcsfs
from google.cloud import bigquery, storage


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    RNN,
    Bidirectional,
    Conv1D,
    Dense,
    MaxPool1D,
    Reshape,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from scipy.signal import find_peaks
from scipy.fft import fft

# To plot pretty figures: render matplotlib output inline in the notebook and
# enlarge the axis-label and tick-label fonts.
%matplotlib inline
mpl.rc("axes", labelsize=14)
mpl.rc("xtick", labelsize=12)
mpl.rc("ytick", labelsize=12)

# For reproducible results: seed NumPy's global RNG and TensorFlow's global RNG.
# Python's built-in `random` module is not seeded in this cell.
from numpy.random import seed

seed(1)
tf.random.set_seed(2)

In [2]:
PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
%env PROJECT = {PROJECT}
%env BUCKET = spectrain
%env REGION = "us-central1"

env: PROJECT=qwiklabs-asl-00-c812c3b423f2
env: BUCKET=spectrain
env: REGION="us-central1"


In [19]:
# Load the patient table straight from GCS (the file name indicates it already
# carries the data split).
df=pd.read_csv('gs://spectrain/Kidney_TX_data_with_split.csv')
# Negative lookahead: keep only the columns whose names do NOT start with LS,
# Banff, Biopsy, Source, Nmr, or "Patient" + any one character + "S"
# (the "." in the pattern is an unescaped regex wildcard).
df = df.filter(regex=r'^(?!LS|Banff|Biopsy|Source|Patient.S|Nmr)')
df.head()

Unnamed: 0,Patient.ID,Patient.Age.at.Biopsy,Patient.Age.at.TX,Sex,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,alanine,citrate,dimethylamine,lactate,Diabetes,Hypertension,UA.Pro,UA.Hb,Spectrum_file,data_split,Case
0,HGhMGDF82QUfedfP*Ib09Fz,65,64,male,1.6385,0.380197,0.589546,0.109758,11.916547,1.175961,0.254771,0.569348,33.23558,False,True,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0
1,kteep*8yeXS8ERTtII283S$87,66,65,male,1.8645,0.024277,0.050885,0.005237,3.940824,1.059415,0.042319,0.335958,17.41513,True,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0
2,HPQz3eY1qku4WE@oq1F031oKj,60,58,male,2.0114,0.344184,0.10118,0.110925,8.028329,1.063058,0.027348,0.353942,12.433237,False,True,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0
3,mbvaunPwjME9dC)eHjbJR3sU6,50,49,male,0.91,0.048538,0.121817,0.040534,7.225312,1.063598,0.092649,0.357289,17.054968,False,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1
4,(bfTo8I0LAbtSFehXaBP4Bz9Q,34,32,male,1.57,0.007928,0.083114,0.024023,6.785139,1.144877,0.048738,0.590268,50.600055,False,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1


## Standard Clinical Features to Engineer
Initial modeling found serum creatinine, urine protein, age and sex to be useful factors.
- Urine protein (`UA.Pro`) is a logical (True/False) feature
- Serum creatinine should be crossed with age and sex using the CKD-EPI equation
- Time since transplant should be calculated based on the difference between the age at biopsy and age at transplant
- Time since transplant could be bucketed into <1yr, 1 to 3yr and >3yr
- eGFR should be bucketed at <60 vs. ≥60

In [20]:
# Define the CKD-EPI equation function
def calculate_eGFR(row):
    """Estimate eGFR for a single patient row using the CKD-EPI equation.

    Reads 'Sex', 'serum_creatinine' and 'Patient.Age.at.Biopsy' from ``row``
    (anything indexable by those keys, e.g. a DataFrame row).
    Any 'Sex' value other than 'male' uses the second coefficient set.

    Returns the estimated eGFR as a float.
    """
    is_male = row['Sex'] == 'male'
    # (kappa, alpha, beta) coefficient triple selected by sex
    kappa, alpha, beta = (0.9, -0.302, 1.0) if is_male else (0.7, -0.241, 1.012)

    scaled_creatinine = row['serum_creatinine'] / kappa
    # Only one of the two creatinine terms differs from 1 for a given patient:
    # the min() term when the scaled value is below 1, the max() term when above.
    below_one_term = min(scaled_creatinine, 1) ** alpha
    above_one_term = max(scaled_creatinine, 1) ** (-1.2)
    age_term = 0.9938 ** row['Patient.Age.at.Biopsy']

    return 142 * below_one_term * above_one_term * age_term * beta

# Apply the function row by row (axis=1) to create the 'eGFR' column
df['eGFR'] = df.apply(calculate_eGFR, axis=1)

#Calculate the time since transplant
def calculate_time(row):
    """Return age at biopsy minus age at transplant for a single row."""
    age_at_biopsy = row['Patient.Age.at.Biopsy']
    age_at_transplant = row['Patient.Age.at.TX']
    return age_at_biopsy - age_at_transplant

# Row-wise difference between age at biopsy and age at transplant
df['time.TX'] = df.apply(calculate_time, axis=1)

# Summary statistics of the numeric columns, including the new eGFR and time.TX
df.describe()

Unnamed: 0,Patient.Age.at.Biopsy,Patient.Age.at.TX,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,alanine,citrate,dimethylamine,lactate,Case,eGFR,time.TX
count,1474.0,1474.0,1474.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1474.0,1474.0,1474.0
mean,53.753731,51.158073,1.764743,0.073352,0.151719,0.057783,7.207816,1.082696,0.141861,0.371615,29.060735,0.297151,51.736133,2.595658
std,13.964644,14.496205,1.03814,0.078465,0.115962,0.048519,2.641276,0.077893,0.153911,0.129915,59.955553,0.457159,22.322959,3.744288
min,18.0,2.0,0.4068,0.004596,0.013141,0.002438,0.47995,1.0,0.003193,0.061405,1.687463,0.0,2.996631,0.0
25%,43.0,41.0,1.1865,0.025824,0.079561,0.023641,5.348111,1.043094,0.029498,0.301052,10.225248,0.0,35.766417,1.0
50%,55.0,53.0,1.4908,0.048538,0.121817,0.042469,6.957964,1.063598,0.092649,0.347711,17.013275,0.0,50.374009,1.0
75%,65.0,62.0,1.97,0.086982,0.187649,0.07663,8.786015,1.092219,0.200019,0.406126,27.661886,1.0,64.922595,2.0
max,86.0,83.0,14.24,0.744762,1.697553,0.342581,22.924218,2.369717,1.2025,2.034652,1583.214971,1.0,126.173805,34.0


## Create a list of "peaks" based on the NMR data

In [34]:
# Make a function to locate the peaks in each spectrum file

def find_peaks_for_files(bucket_name, df, x_min=0.1, x_max=9.3,
                         distance=7, height=120, prominence=60):
    """Detect peaks in every spectrum file listed in ``df['Spectrum_file']``.

    Each file is read from ``gs://{bucket_name}/{filename}`` as a header-less,
    whitespace-separated table with two columns (x axis, y axis). Rows whose
    x value lies outside ``[x_min, x_max]`` are discarded before peak detection.

    Parameters
    ----------
    bucket_name : str
        Bucket name (optionally followed by a folder prefix) placed between
        ``gs://`` and the file name.
    df : pandas.DataFrame
        Must contain a 'Spectrum_file' column of file names.
    x_min, x_max : float
        Inclusive x-axis window kept for peak detection.
    distance, height, prominence
        Forwarded unchanged to ``scipy.signal.find_peaks``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: a 'peaks' column is added
        holding, for each row, a DataFrame with the 'x_axis' and 'y_axis'
        values of the detected peaks.
    """
    # One DataFrame of peak coordinates per spectrum file, in row order
    peaks_list = []

    for filename in df['Spectrum_file']:
        file_path = f"gs://{bucket_name}/{filename}"

        # r'\s+' treats any run of whitespace as one delimiter. The previous
        # '\s' (non-raw string, single whitespace character) was an invalid
        # escape sequence, forced the slower python parsing engine with a
        # ParserWarning, and produced extra empty columns for aligned files.
        file_df = pd.read_csv(file_path, sep=r'\s+', header=None)
        file_df.columns = ['x_axis', 'y_axis']

        # Restrict to the requested window; the index is reset so the
        # positional indices returned by find_peaks are valid .loc labels.
        in_window = (file_df['x_axis'] >= x_min) & (file_df['x_axis'] <= x_max)
        filtered_df = file_df[in_window].reset_index(drop=True)

        peaks, _ = find_peaks(
            filtered_df['y_axis'],
            distance=distance,
            height=height,
            prominence=prominence,
        )
        peaks_list.append(filtered_df.loc[peaks, ['x_axis', 'y_axis']])

    # Attach the per-file peak tables to the caller's DataFrame
    df['peaks'] = peaks_list
    return df



In [33]:
# Bucket name plus the folder prefix; find_peaks_for_files inserts this between
# "gs://" and each spectrum file name.
bucket_name = 'spectrain/Kidney_TX_Data'
# Create a client to interact with the GCS bucket
# NOTE(review): `client` is not passed to find_peaks_for_files, which reads the
# files through pd.read_csv on gs:// paths — confirm whether it is still needed.
client = storage.Client()
df = find_peaks_for_files(bucket_name, df)


  return func(*args, **kwargs)


In [30]:
# Preview the first rows, now including the derived eGFR, time.TX and peaks columns
df.head()

Unnamed: 0,Patient.ID,Patient.Age.at.Biopsy,Patient.Age.at.TX,Sex,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,urea,alanine,...,Diabetes,Hypertension,UA.Pro,UA.Hb,Spectrum_file,data_split,Case,eGFR,time.TX,peaks
0,HGhMGDF82QUfedfP*Ib09Fz,65,64,male,1.6385,0.380197,0.589546,0.109758,11.916547,1.175961,...,False,True,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,46.182573,1,x_axis y_axis 4 9.294060 2...
1,kteep*8yeXS8ERTtII283S$87,66,65,male,1.8645,0.024277,0.050885,0.005237,3.940824,1.059415,...,True,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,39.304112,1,x_axis y_axis 5 9.299160 2...
2,HPQz3eY1qku4WE@oq1F031oKj,60,58,male,2.0114,0.344184,0.10118,0.110925,8.028329,1.063058,...,False,True,False,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,37.249527,2,x_axis y_axis 2 9.297420 2...
3,mbvaunPwjME9dC)eHjbJR3sU6,50,49,male,0.91,0.048538,0.121817,0.040534,7.225312,1.063598,...,False,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1,102.67845,1,x_axis y_axis 3 9.298130 1...
4,(bfTo8I0LAbtSFehXaBP4Bz9Q,34,32,male,1.57,0.007928,0.083114,0.024023,6.785139,1.144877,...,False,True,True,False,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1,58.947517,2,x_axis y_axis 1 9.298630 4...


### Create PNG images of the Fourier transform of each spectrum

In [68]:
def get_blob(blobs):
    """Lazily yield every item of ``blobs`` in iteration order."""
    yield from blobs
    
def write_fft_to_gcs(blob, bucket, output_prefix=None):
    """Plot the FFT magnitude of one spectrum file and upload it to GCS as a PNG.

    The blob is downloaded and parsed as a header-less, whitespace-separated
    two-column table (x axis, y axis). Rows with x between 0.1 and 9.3
    (inclusive) are kept, the FFT of the y values is computed, and its
    magnitude is plotted against the original x values with the axes hidden.

    Parameters
    ----------
    blob
        Source blob; must provide ``name`` and ``download_as_bytes()``.
    bucket
        Destination bucket; must provide ``blob(name)``.
    output_prefix : str, optional
        Object-name prefix for the uploaded image. When omitted, the
        notebook-level ``image_dir`` variable is used, so existing
        two-argument calls behave as before.

    The image is uploaded as ``<prefix><file stem>_fft.png``.
    """
    print(blob.name)
    prefix = image_dir if output_prefix is None else output_prefix

    #NOTE: important to use pyplot instantiation this way to ensure no memory leaks
    # (figure number 1 is reused and cleared on every call instead of creating new figures)
    fig = plt.figure(num=1, figsize=(300, 40), clear=True)

    data = blob.download_as_bytes()
    # r'\s+' splits on any run of whitespace; the previous non-raw '\s' was an
    # invalid escape sequence and forced the slower python parsing engine.
    file_df = pd.read_csv(io.BytesIO(data), sep=r'\s+', header=None)
    file_df.columns = ['x_axis', 'y_axis']
    in_window = (file_df['x_axis'] >= 0.1) & (file_df['x_axis'] <= 9.3)
    filtered_df = file_df[in_window]

    fft_y = fft(filtered_df['y_axis'].values)

    ax = fig.add_subplot()
    ax.plot(filtered_df['x_axis'], np.abs(fft_y))
    ax.axis('off')

    # Take the last path component and strip only the final extension, so nested
    # prefixes or additional dots in the name do not truncate the file stem.
    base_name = blob.name.rsplit('/', 1)[-1]
    stem = os.path.splitext(base_name)[0]

    # The context manager closes the in-memory buffer even if the upload fails.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png')
        upload_blob = bucket.blob(prefix + stem + '_fft.png')
        # rewind=True seeks the buffer back to the start before uploading
        upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)
    

In [None]:
# Object-name prefix for the generated FFT images; write_fft_to_gcs reads this
# notebook-level variable.
image_dir='spec_train_output/fft_images/'

# initialize the GCS client
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket('spectrain')

# Note: Client.list_blobs requires at least package version 1.17.0.
blobs = storage_client.list_blobs('spectrain', prefix='Kidney_TX_Data')

# Note: The call returns a response only when the iterator is consumed.
for blob in get_blob(blobs):
    # Only blobs whose name contains "output" are converted
    if "output" in blob.name:
        write_fft_to_gcs(blob, bucket)

Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_571_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_572_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_573_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_574_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_575_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_576_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_577_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_578_00000_withoutBackground_20001.txt
Kidney_TX_Data/output_NormalizationTool_spectrum_zgpr30-urine-600MHz-310K_12_579

KeyboardInterrupt: 

# Upsample the cases

In [None]:
#Upsample the cases in the training and validation sets