In [None]:
import warnings, os, random, pickle
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
# from tqdm import tqdm
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import LSTM, Dense
from util import timeseries_generator

# Set the random seeds.
# The built-in hash() of a str is salted per interpreter process (see
# PYTHONHASHSEED), so seeds derived from hash("...") change on every kernel
# restart and give no reproducibility. zlib.crc32 is a fixed function of the
# bytes and already returns a value in [0, 2**32 - 1], so the previous
# "% 2**32 - 1" arithmetic (which could yield -1, rejected by np.random.seed)
# is no longer needed.
import zlib

# Set before TensorFlow is imported, as in the original cell.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
random.seed(zlib.crc32(b"setting random seeds"))
np.random.seed(zlib.crc32(b"improves reproducibility"))
import tensorflow as tf
tf.random.set_seed(zlib.crc32(b"by removing stochasticity"))

In [None]:
# Path of the cleaned table, kept without its extension so the stem can be reused.
file_name = "./GSE/GSE6186_cleaned"
file = f"{file_name}.csv"

# The first CSV column becomes the index, and pandas tries to parse it as dates.
df = pd.read_csv(
    file,
    parse_dates=True,
    index_col=0,
)

In [None]:
# Load the pickled clustering object (presumably a fitted KMeans; only its
# `cluster_centers_` and `predict` are used in this notebook).
# The file is opened in a `with` block so the handle is closed as soon as
# unpickling finishes; a bare open() left it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code. Only load a
# "3means.pkl" that was produced by a trusted run of this project.
with open("3means.pkl", "rb") as model_file:
    kmeans = pickle.load(model_file)

# Transposed so that each column holds one cluster centre and each row lines
# up with a row of `df`, whose index is reused.
data = pd.DataFrame(data=kmeans.cluster_centers_.T, index=df.index)

In [None]:
# Window length handed to timeseries_generator; also used as the LSTM input length.
time_steps = 1
# Third argument passed to timeseries_generator.
b_size = 1
# Passed to train_test_split as `test_size`.
test_size = 1

In [None]:
def compile_fit_predict(gene, k):
    """Train an LSTM on one cluster centre and score it on that cluster's genes.

    The centre profile is scaled to [-1, 1], turned into supervised windows by
    ``timeseries_generator`` and used to fit a small LSTM. The checkpoint with
    the lowest training loss is then evaluated on every column of ``df`` that
    ``kmeans`` assigns to cluster ``k``.

    Args:
        gene: 1-D array-like with the cluster-centre profile to train on.
        k: Cluster label; columns of ``df`` predicted as ``k`` are evaluated.

    Returns:
        Mean of the per-column RMSE values, computed in unscaled units, or
        ``nan`` when no column of ``df`` is assigned to cluster ``k``.

    Notes:
        Reads the module-level ``df``, ``kmeans``, ``time_steps``, ``b_size``
        and ``test_size``. Overwrites ``logs/best_model.h5`` on every call.
    """
    logdir = 'logs'
    os.makedirs(logdir, exist_ok=True)
    checkpoint_path = f'{logdir}/best_model.h5'

    # Column vector of float32, the layout MinMaxScaler expects.
    gene = np.asarray(gene, dtype='float32').reshape(-1, 1)

    scaler = MinMaxScaler(feature_range=(-1, 1))
    gene = scaler.fit_transform(gene)

    X, y = timeseries_generator(pd.DataFrame(gene), time_steps, b_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    n_features = 1
    model = Sequential([
        LSTM(64, activation='relu', input_shape=(time_steps, n_features)),
        Dense(1)
    ])
    # NOTE(review): the original cell recorded "rmse: 0.434" here. That figure
    # predates the input-scaling fix in the evaluation loop; re-measure it.
    model.compile(optimizer='adam', loss='mse')

    # The checkpoint tracks the training loss ('loss'), not the validation loss.
    callbacks = [
        ModelCheckpoint(checkpoint_path, monitor='loss', save_best_only=True, mode='min')
    ]
    model.fit(X_train, y_train, batch_size=100, epochs=100, validation_data=(X_test, y_test), callbacks=callbacks, verbose=0, shuffle=False)
    # Continue with the best checkpoint rather than the last-epoch weights.
    model = load_model(filepath=checkpoint_path)

    # Evaluate on the genes (columns of df) that fall in cluster k.
    cluster_pred = kmeans.predict(df.T)
    members = df.iloc[:, np.flatnonzero(cluster_pred == k)]

    rmses = []
    # items() walks the columns by position and never writes back into the
    # slice of df, unlike the previous `_df[col] = ...` assignment.
    for _, series in members.items():
        values = series.to_numpy(dtype='float32').reshape(-1, 1)
        # The model was trained on scaled values, so its inputs must go through
        # the same fitted scaler. Previously raw values were fed in while the
        # outputs were still inverse-transformed.
        scaled = scaler.transform(values)
        _X, _y = timeseries_generator(pd.DataFrame(scaled), time_steps, b_size)
        _, _X_test, _, _y_test = train_test_split(_X, _y, test_size=test_size, shuffle=False)

        # Map predictions and targets back to the original units before scoring.
        y_predicted = scaler.inverse_transform(np.asarray(model(_X_test)).reshape(-1, 1))
        y_true = scaler.inverse_transform(np.asarray(_y_test).reshape(-1, 1))
        # np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn
        # releases deprecate and then remove.
        rmses.append(np.sqrt(mean_squared_error(y_true, y_predicted)))

    if not rmses:
        # No column was assigned to this cluster.
        return np.nan
    return np.mean(rmses)

In [None]:
RMSEs = []
# Each column of `data` is one cluster centre; its position `k` is passed on
# as the cluster label.
for k, col in enumerate(data.columns):
    gene = data[col].to_numpy()
    rmse = compile_fit_predict(gene, k)
    # The value is now interpolated; the old message printed the literal word "rmse".
    print(f'rmse for gene {col}: {rmse:.3f}')
    RMSEs.append(rmse)
    print('so far rmse: %.3f' % np.average(RMSEs))
print('rmse: %.3f' % np.average(RMSEs))