# SpecTrain Preprocessing
Review the data to be preprocessed.
There are two types of input data:
1. Clinical features (obtained by current routine methods)
2. NMR features (the spectra file and the measured outputs)


In [32]:
import os
import io

import gcsfs
from google.cloud import bigquery, storage


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    RNN,
    Bidirectional,
    Conv1D,
    Dense,
    MaxPool1D,
    Reshape,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from scipy.signal import find_peaks
from scipy.fft import fft

# To plot pretty figures
%matplotlib inline
mpl.rc("axes", labelsize=14)
mpl.rc("xtick", labelsize=12)
mpl.rc("ytick", labelsize=12)

# For reproducible results.
from numpy.random import seed

seed(1)
tf.random.set_seed(2)

In [33]:
PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
%env PROJECT = {PROJECT}
%env BUCKET = spectrain
%env REGION = "us-central1"

env: PROJECT=qwiklabs-asl-00-c812c3b423f2
env: BUCKET=spectrain
env: REGION="us-central1"


In [41]:
# Bucket name and object path of the clinical CSV inside that bucket.
bucket_name, file_path = 'spectrain', 'Kidney_TX_data_with_split.csv'

# Cloud Storage client used to reach the bucket.
client = storage.Client()

In [77]:
# Upper bound on the number of CSV rows parsed; lower it for a quicker trial run.
num_rows_to_test = 1473

# Download the clinical CSV from the bucket as a single text payload.
bucket = client.get_bucket(bucket_name)
blob = storage.Blob(name=file_path, bucket=bucket)
data = blob.download_as_text()

# Parse at most `num_rows_to_test` rows of the payload into a DataFrame.
csv_buffer = io.StringIO(data)
df = pd.read_csv(csv_buffer, nrows=num_rows_to_test)

# Define the CKD-EPI equation function
def calculate_eGFR(row):
    """Estimate GFR for one patient record with the CKD-EPI creatinine equation.

    Args:
        row: mapping-like record (for example a DataFrame row) providing
            'Sex', 'serum_creatinine' and 'Patient.Age.at.Biopsy'.

    Returns:
        The estimated GFR as a float.
    """
    # Compare sex case-insensitively and ignore surrounding whitespace so that
    # values such as 'Male' or ' male ' are not silently scored as female.
    sex = str(row['Sex']).strip().lower()
    if sex == 'male':
        kappa, alpha, beta = 0.9, -0.302, 1.0
    else:
        # Any other value keeps using the female coefficients, as before.
        kappa, alpha, beta = 0.7, -0.241, 1.012

    # Creatinine relative to the sex-specific threshold; the equation applies
    # exponent `alpha` below the threshold and -1.2 above it.
    ratio = row['serum_creatinine'] / kappa

    eGFR = 142 * min(ratio, 1)**alpha * \
           max(ratio, 1)**(-1.2) * \
           0.9938**row['Patient.Age.at.Biopsy'] * beta
    return eGFR

# Add an 'eGFR' column: axis=1 hands each row to calculate_eGFR, which reads
# that row's 'Sex', 'serum_creatinine' and 'Patient.Age.at.Biopsy'.
df['eGFR'] = df.apply(calculate_eGFR, axis=1)

#Calculate the time since transplant
def calculate_time(row):
    """Return the gap between age at biopsy and age at transplant for one record.

    Args:
        row: mapping-like record with 'Patient.Age.at.Biopsy' and
            'Patient.Age.at.TX'.

    Returns:
        'Patient.Age.at.Biopsy' minus 'Patient.Age.at.TX'.
    """
    age_at_biopsy = row['Patient.Age.at.Biopsy']
    age_at_transplant = row['Patient.Age.at.TX']
    return age_at_biopsy - age_at_transplant
df['time.TX'] = df.apply(calculate_time, axis=1)

# Bin edges for 'eGFR' and 'time.TX'. The upper eGFR edge is 90 (not 89) so
# that the edges agree with the '60-89' and '>=90' labels.
eGFR_bins = [float('-inf'), 60, 90, float('inf')]
TimeTX_bins = [float('-inf'), 1, float('inf')]

# eGFR uses left-closed bins (right=False): [-inf, 60), [60, 90), [90, inf).
# With the default right-closed bins an eGFR of exactly 60 was labelled '<60'
# and values between 89 and 90 were labelled '>=90'.
df['eGFR_bin'] = pd.cut(df['eGFR'], bins=eGFR_bins, right=False, labels=['<60', '60-89', '>=90'])
# NOTE(review): these bins are right-closed, so time.TX == 1 is labelled
# '<1 year'; confirm that this is the intended cut-off.
df['time.TX_bin'] = pd.cut(df['time.TX'], bins=TimeTX_bins, labels=['<1 year', '>1 year'])

df.head()

Unnamed: 0,Patient.Sample.ID,Patient.ID,Nmr.sample.ID,Patient.Age.at.Biopsy,Patient.Age.at.TX,Sex,serum_creatinine,hippurate,phenylacetylglutamine,trigonellin,...,UA.Pro,UA.Hb,Source,Spectrum_file,data_split,Case,eGFR,time.TX,eGFR_bin,time.TX_bin
0,SR@X(tihcCByqQolw3t#9XMfw,HGhMGDF82QUfedfP*Ib09Fz,SR@X(tihcCByqQolw3t#9XMfw,65,64,male,1.6385,0.380197,0.589546,0.109758,...,False,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,46.182573,1,<60,<1 year
1,78sYbu)gHhtAo0GJG8sO2^pM9,kteep*8yeXS8ERTtII283S$87,78sYbu)gHhtAo0GJG8sO2^pM9,66,65,male,1.8645,0.024277,0.050885,0.005237,...,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,39.304112,1,<60,<1 year
2,4CHo^pPe^22nn1hZjcvtS25Tb,HPQz3eY1qku4WE@oq1F031oKj,4CHo^pPe^22nn1hZjcvtS25Tb,60,58,male,2.0114,0.344184,0.10118,0.110925,...,False,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,0,37.249527,2,<60,>1 year
3,meJ9sob3UzaM2xIkiK(ulzN1Y,mbvaunPwjME9dC)eHjbJR3sU6,meJ9sob3UzaM2xIkiK(ulzN1Y,50,49,male,0.91,0.048538,0.121817,0.040534,...,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1,102.67845,1,>=90,<1 year
4,fVuk(N7Oll$RpWEGCddnvLnKI,(bfTo8I0LAbtSFehXaBP4Bz9Q,fVuk(N7Oll$RpWEGCddnvLnKI,34,32,male,1.57,0.007928,0.083114,0.024023,...,True,False,numares,output_NormalizationTool_spectrum_zgpr30-urine...,TRAIN,1,58.947517,2,<60,>1 year


In [78]:
output_bucket_name = "spectrain"  # Replace with your bucket name
# NOTE(review): `output_bucket_name` is not used below; uploads go to `bucket`.
# NOTE(review): `spectrum_directory` (the bucket prefix holding the spectrum
# files) is not defined in any visible cell and must be set before running.

output_file_paths = {
    'TEST': 'split_1D/test_data.csv',
    'TRAIN': 'split_1D/train_data.csv',
    'VALIDATE': 'split_1D/validate_data.csv'
}

# Specify the ppm intervals
ppm_intervals = np.arange(0.8, 8.8, 0.01)

# One feature column per interval start, named by its two-decimal ppm value.
ppm_columns = [f'ppm_{ppm_start:.2f}' for ppm_start in ppm_intervals[:-1]]

# Integer bin keys (ppm * 100, rounded) for the same interval starts. Matching
# on integers avoids comparing rounded ppm values against the inexact floats
# produced by np.arange, which left some columns empty and merged others.
ppm_bin_keys = np.rint(ppm_intervals[:-1] * 100).astype(int)


def _condense_spectrum(spectrum_text, bin_keys):
    """Average one spectrum's y values into 0.01 ppm bins.

    Args:
        spectrum_text: str, space-separated text with one 'ppm y_value' pair
            per line and no header.
        bin_keys: integer array of bins to return, each being the ppm value
            rounded to two decimals and multiplied by 100.

    Returns:
        Float array aligned with `bin_keys`; bins without any point are NaN.
    """
    spectrum_df = pd.read_csv(io.StringIO(spectrum_text), sep=' ', header=None)
    spectrum_df.columns = ['ppm', 'y_value']

    # Filter the data between 0.8 and 8.8 ppm
    in_window = spectrum_df[(spectrum_df['ppm'] >= 0.8) & (spectrum_df['ppm'] <= 8.8)]

    # Same grouping as rounding ppm to two decimals, expressed as integers.
    point_keys = np.rint(in_window['ppm'] * 100).astype(int)
    binned = in_window.groupby(point_keys)['y_value'].mean()
    return binned.reindex(bin_keys).to_numpy(dtype=float)


# Iterate over each value of data_split and create separate CSV files
for data_split_value, output_file_path in output_file_paths.items():
    # Select rows for the current data_split value
    df_split = df[df['data_split'] == data_split_value]

    # Download and condense the spectrum referenced by every row.
    spectra = []
    for file_name in df_split['Spectrum_file']:
        blob = storage.Blob(f"{spectrum_directory}/{file_name}", bucket)
        spectra.append(_condense_spectrum(blob.download_as_text(), ppm_bin_keys))

    # Build all ppm columns in one step; the reshape keeps an empty split
    # two-dimensional so that it still produces a header-only CSV.
    features = pd.DataFrame(
        np.array(spectra, dtype=float).reshape(len(spectra), len(ppm_columns)),
        index=df_split.index,
        columns=ppm_columns,
    )

    # Select only the desired columns
    df_split = pd.concat([df_split[['Case']], features], axis=1)

    # Save the resulting DataFrame to a CSV file in the bucket
    blob = storage.Blob(output_file_path, bucket=bucket)
    blob.upload_from_string(df_split.to_csv(index=False), content_type='text/csv')





In [54]:
# NOTE(review): `tensor_data` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless it is created elsewhere.
array_shape = tensor_data.shape
print(array_shape)


(10, 801)


In [79]:
# Number of feature columns per sample, number of classes, and label column name.
N_TIME_STEPS, N_LABELS, LABEL = 799, 2, 'Case'

# Read the exported training and validation splits from Cloud Storage.
Xtrain = pd.read_csv("gs://spectrain/split_1D/train_data.csv")
Xvalid = pd.read_csv("gs://spectrain/split_1D/validate_data.csv")

# Detach the label column; the remaining columns are the features.
ytrain, yvalid = Xtrain.pop(LABEL), Xvalid.pop(LABEL)

# (A check that both classes occur in each split used to run here; it is disabled.)

# One-hot encode the labels over two classes.
ytrain_categorical, yvalid_categorical = (
    to_categorical(labels.values, num_classes=2) for labels in (ytrain, yvalid)
)

In [85]:

# 1D CNN classifier: reshape each sample to (N_TIME_STEPS, 1), convolve and
# pool, then flatten into a dense hidden layer and a softmax output layer.
model = Sequential(
    [
        Reshape(target_shape=[N_TIME_STEPS, 1]),
        Conv1D(
            filters=128,
            kernel_size=64,
            strides=8,
            padding="valid",
            input_shape=[None, 1],
        ),
        MaxPool1D(pool_size=2, strides=None, padding="valid"),
        tf.keras.layers.Flatten(),
        Dense(units=N_TIME_STEPS // 4, activation="relu"),
        Dense(
            units=N_LABELS,
            activation="softmax",
            kernel_regularizer=tf.keras.regularizers.l1(l=0.1),
        ),
    ]
)

# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.01),
    metrics=["accuracy"],
)

# Train for 100 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    Xtrain.values,
    ytrain_categorical,
    validation_data=(Xvalid.values, yvalid_categorical),
    batch_size=1000,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [82]:
def plot_curves(train_data, val_data, label="Accuracy"):
    """Draw the training and validation series of one metric on the current axes.

    Training points are placed at x = 0.5, 1.5, ... and validation points at
    x = 1, 2, ....

    Args:
        train_data: list, metric values obtained from the training data.
        val_data: list, metric values obtained from the validation data.
        label: str, metric name used for the y-axis label and legend entries.
    """
    series = (
        ("Training ", train_data, 0.5, "b.-"),
        ("Validation ", val_data, 1, "r.-"),
    )
    for prefix, values, x_offset, line_format in series:
        plt.plot(
            np.arange(len(values)) + x_offset,
            values,
            line_format,
            label=prefix + label,
        )

    axes = plt.gca()
    # Keep the epoch axis on whole numbers.
    axes.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
    axes.legend(fontsize=14)
    axes.set_xlabel("Epochs")
    axes.set_ylabel(label)
    axes.grid(True)

In [83]:
# Plot training vs. validation loss from the fit history.
# NOTE(review): the recorded KeyError suggests `history` held no "val_loss"
# entry when this cell last ran (its execution count precedes the training
# cell's) — re-run the training cell first.
plot_curves(history.history["loss"], history.history["val_loss"], label="Loss")

KeyError: 'val_loss'

In [84]:
# Plot training vs. validation accuracy from the fit history.
# NOTE(review): the recorded KeyError suggests `history` held no
# "val_accuracy" entry when this cell last ran — re-run the training cell first.
plot_curves(
    history.history["accuracy"],
    history.history["val_accuracy"],
    label="Accuracy",
)

KeyError: 'val_accuracy'

In [None]:
# Mean of the last five recorded validation-accuracy values.
np.mean(history.history["val_accuracy"][-5:])