In [66]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pandas as pd
import os
import io
from sklearn.model_selection import train_test_split
import gcsfs
from google.cloud import bigquery, storage
from google.oauth2 import credentials

In [None]:
def get_blob(blobs):
    """Lazily yield every item of ``blobs`` in iteration order.

    This is a thin generator wrapper: it performs no filtering or
    transformation and simply re-emits each element produced by iterating
    over ``blobs``.
    """
    yield from blobs

In [None]:
## Render every "output" spectrum text file found under the Kidney_TX_Data
## prefix as a PNG line plot and upload it to the bucket's images/ folder.
image_dir='images/'

# initialize the GCS client
storage_client = storage.Client()

# get the storage bucket
bucket = storage_client.get_bucket('spectrain')

# Note: Client.list_blobs requires at least package version 1.17.0.
blobs = storage_client.list_blobs('spectrain', prefix='Kidney_TX_Data')

# NOTE(review): images are uploaded under gs://spectrain/images/, while the
# import-file cells of this notebook build URIs under
# gs://spectrain/spec_train_output/images/ -- confirm which location is intended.

# Note: The call returns a response only when the iterator is consumed.
for blob in get_blob(blobs):
    if "output" not in blob.name:
        continue

    # r'\s+' treats any run of whitespace as ONE separator, so space-aligned or
    # tab-separated files still parse into exactly two columns. (The previous
    # '\s' was an invalid escape in a normal string and split on every single
    # whitespace character, creating empty extra columns.)
    df = pd.read_csv("gs://spectrain/" + blob.name, sep=r'\s+', header=None)
    df.columns = ['x_axis', 'y_axis']

    fig = plt.figure(figsize=(300, 40), clear=True)
    try:
        plt.plot(df['x_axis'], df['y_axis'])
        plt.axis('off')

        # Use the last path component, then drop everything from the first '.'
        # on, so nested folders or dots in directory names cannot leak into
        # the image name.
        filename = blob.name.rsplit('/', 1)[-1].split('.')[0]
        upload_blob = bucket.blob(image_dir + filename + '_nmr.png')

        # The in-memory buffer is released even if the upload fails.
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png')
            # rewind=True seeks back to the start of the buffer before upload.
            upload_blob.upload_from_file(buf, content_type='image/png', rewind=True)
    finally:
        # Always free the (very large) figure so a failure mid-loop does not
        # leave figures accumulating in memory.
        plt.close(fig)

In [None]:
# Create import file for AutoML IMAGE CLASSIFICATION
# Each output row is "<image URI>,<Case>" and the file has no header row.

df = pd.read_csv('gs://qwiklabs-asl-00-c812c3b423f2/spec_train_output/input/Kidney_TX_data.csv')

# Keep the part of the spectrum file name before the first '.'. A
# one-character pattern is split on literally, so no regex escaping is needed
# (the previous '\.' was an invalid escape sequence in a normal string).
df['Spectrum_file_new'] = (
    "gs://spectrain/spec_train_output/images/"
    + df['Spectrum_file'].str.split('.').str[0]
    + '_nmr.png'
)
df = df[['Spectrum_file_new', 'Case']]

# header=False is the documented way to omit the header row.
df.to_csv('gs://spectrain/spec_train_output/image_dir_paths_labels1.csv', index=False, header=False)

In [148]:
# Create data split column for structured data
# 15% of the rows are held out as TEST, then 15% of the remainder as VALID;
# both splits are stratified on Case and use a fixed seed.

df = pd.read_csv('gs://qwiklabs-asl-00-c812c3b423f2/spec_train_output/input/Kidney_TX_data.csv')
X = df.drop(columns=['Case'])
y = df[['Case']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

# .assign returns new frames instead of writing columns into the frames handed
# back by train_test_split, which avoids SettingWithCopyWarning. Column order
# (data_split, then Case) matches the keyword order; Case is aligned on index.
X_train = X_train.assign(data_split='TRAIN', Case=y_train['Case'])
X_val = X_val.assign(data_split='VALID', Case=y_val['Case'])
X_test = X_test.assign(data_split='TEST', Case=y_test['Case'])

final_df = pd.concat([X_train, X_val, X_test])
final_df.to_csv('gs://spectrain/Kidney_TX_data_with_split.csv', index=False)

In [181]:
# Create data split column for images data import file
# Output rows are "<data_split>,<image URI>,<Case>" with no header row.
data_split_df = pd.read_csv('gs://spectrain/Kidney_TX_data_with_split.csv')
image_paths_df = pd.read_csv('gs://spectrain/spec_train_output/image_dir_paths_labels1.csv', header=None)

# Rebuild the image URI from the spectrum file name so it can serve as the
# join key. A one-character pattern is split on literally, so '.' needs no
# escaping (the previous '\.' was an invalid escape sequence).
data_split_df['Spectrum_file_new'] = (
    "gs://spectrain/spec_train_output/images/"
    + data_split_df['Spectrum_file'].str.split('.').str[0]
    + '_nmr.png'
)
data_split_df = data_split_df[['Spectrum_file_new', 'data_split']]

image_paths_df.columns = ['Spectrum_file_new', 'Case']
# Inner join: rows whose image URI is missing from either file are dropped.
image_paths_df = pd.merge(image_paths_df, data_split_df, on=['Spectrum_file_new'], how='inner')

# Reorder the columns and rename the split labels in one step, producing a
# new frame rather than assigning into a column-selected slice.
image_paths_df = image_paths_df[['data_split', 'Spectrum_file_new', 'Case']].assign(
    data_split=lambda frame: frame['data_split'].replace(
        {'TRAIN':'TRAINING', 'VALID':'VALIDATION', 'TEST':'TEST'}
    )
)

# header=False is the documented way to omit the header row.
image_paths_df.to_csv('gs://spectrain/spec_train_output/image_dir_paths_labels_with_split.csv', header=False, index=False)