This notebook contains functionality to perform the following:

Implementing and testing an XGBoost regression model trained on the 1/16th Arabidopsis (i)STARR dataset.

In [None]:
# Mount Google Drive so that files on the shared drive are reachable from this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the project root; the relative paths used later in the
# notebook ("data/processed/...", "experiments/...") are resolved against this directory.
%cd '/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shareddrives/NRC_Amii_Agronomics_Project/nrc-ml-plant-genomics


In [None]:
import keras
import warnings, logging
import json
import numpy as np
import pandas as pd
import datetime, os

from keras.models import Sequential, load_model, model_from_json
# from keras.layers import Input, Dense, Conv1D, MaxPooling2D, Dropout, Flatten, BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping  # https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
import tensorflow as tf

from keras import layers
from keras import activations

import xgboost

from sklearn.metrics import r2_score
from scipy.stats import spearmanr # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Disable all logging calls at severity 1000 and below, which covers every standard level
# (CRITICAL is 50), so nothing logged through the `logging` module is shown.
logging.disable(1000)

# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Experiment configuration consumed by the helper functions defined below.
args = {
    # CSV file loaded by train_test_val (relative to the working directory)
    "data_path": "data/processed/arabidopsis_every_16.csv",
    # 1 -> add per-sequence counts of the corresponding k-mers as input features
    "include_mononuc_freq": 1,
    "include_dinuc_freq": 0,
    "include_trinuc_freq": 0,
    # which target column(s) are used / scored
    "include_starr": 1,
    "include_istarr": 0,
    # not read by any code visible in this notebook
    "shuffle": 0,
}

# the four single nucleotides, in the order used to build k-mer feature columns
nts = list("ATCG")

# one-hot encoding per nucleotide (cross referenced with kipoi data loader)
mapping = {
    "A": [1.0, 0.0, 0.0, 0.0],
    "T": [0.0, 0.0, 0.0, 1.0],
    "C": [0.0, 1.0, 0.0, 0.0],
    "G": [0.0, 0.0, 1.0, 0.0],
}

In [None]:
def get_model(args):
    """Create the regressor used by this notebook.

    `args` is accepted so the call site stays uniform, but it is not consulted:
    the returned `xgboost.XGBRegressor` is constructed with no arguments.
    """
    regressor = xgboost.XGBRegressor()
    return regressor


def get_ohe(sequence):
    """One-hot encode a nucleotide sequence.

    Each character of `sequence` is looked up in the module-level `mapping`
    and the resulting 4-element rows are stacked into a numpy array, one row
    per nucleotide. A character that is not a key of `mapping` raises KeyError.
    """
    encoded_rows = []
    for nucleotide in sequence:
        encoded_rows.append(mapping[nucleotide])
    return np.array(encoded_rows)


def train_test_val(args):
    """Load the dataset and build k-mer count features and targets for each split.

    Parameters
    ----------
    args : dict
        Must contain "data_path" (CSV to read), the feature switches
        "include_mononuc_freq", "include_dinuc_freq", "include_trinuc_freq"
        (a value of 1 enables the corresponding k-mer counts) and the target
        switches "include_starr" / "include_istarr" read by `return_y`.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : numpy arrays
        Features are the per-sequence counts of every selected k-mer; rows are
        assigned to a split by the value of the CSV's "set" column
        ("train", "val" or "test").

    Notes
    -----
    The CSV is expected to provide the columns "sequence", "set" and the
    target column(s) selected by `return_y`.
    """
    # k-mers whose occurrence counts become the input features
    include = []
    if args["include_mononuc_freq"] == 1:
        include += nts
    if args["include_dinuc_freq"] == 1:
        include += [nt1+nt2 for nt1 in nts for nt2 in nts]
    if args["include_trinuc_freq"] == 1:
        include += [nt1+nt2+nt3 for nt1 in nts for nt2 in nts for nt3 in nts]

    df = pd.read_csv(args["data_path"])
    print("read csv")

    for item in include:  # create new columns with the counts of sequences in "include"
        print("including", item)
        # Series.str.count counts non-overlapping matches of the pattern
        df[item] = df.sequence.str.count(item)

    def _features_and_target(split_name):
        # Selects the rows of one split and returns its (X, y) pair.
        split_df = df[df.set == split_name]
        # The target is chosen by return_y so it follows args["include_starr"] /
        # args["include_istarr"]. The previous inline check compared the string
        # literal "include_starr" with 1, which is never true, so the flags were ignored.
        return np.array(split_df[include]), return_y(args, split_df)

    X_train, y_train = _features_and_target("train")
    X_val, y_val = _features_and_target("val")
    X_test, y_test = _features_and_target("test")

    return X_train, y_train, X_val, y_val, X_test, y_test


def return_y(args, df):
    """Return the target array selected by the starr/istarr switches in `args`.

    - "include_starr" falsy                    -> 1-D array of df["target_istarr"]
    - "include_starr" truthy, istarr falsy     -> 1-D array of df["target_starr"]
    - both truthy                              -> 2-D array with columns
                                                  [target_starr, target_istarr]
    """
    if not args["include_starr"]:
        return np.array(df["target_istarr"].tolist())

    if not args["include_istarr"]:
        return np.array(df["target_starr"].tolist())

    both_targets = pd.concat([df["target_starr"], df["target_istarr"]], axis=1)
    return np.array(both_targets)


def _score_table(truths, preds):
    # Builds [[r2 scores], [spearman correlations]] as strings, one entry per (truth, prediction) pair.
    r2_row = [str(r2_score(y_true, y_pred)) for y_true, y_pred in zip(truths, preds)]
    spearman_row = [str(spearmanr(y_true, y_pred)[0]) for y_true, y_pred in zip(truths, preds)]
    return [r2_row, spearman_row]


def save_results(args, dir_path, X_train, X_test, X_val, y_train, y_test, y_val, saved_model):
    """Score `saved_model` on the train/val/test splits and write the scores as CSV.

    Parameters
    ----------
    args : dict
        Reads "include_starr" and "include_istarr". When their sum is 2 the
        targets and predictions are treated as two-column arrays (column 0 is
        scored as starr, column 1 as istarr); otherwise as a single target.
    dir_path : str
        Existing directory that receives the result file(s).
    X_train, X_test, X_val, y_train, y_test, y_val :
        Features and targets of each split. Note the parameter order
        (train, test, val) differs from the column order of the output
        (train, val, test).
    saved_model :
        Any object exposing `predict(X)` that returns a numpy array.

    Writes "results_starr.csv" and "results_istarr.csv" in the two-target
    case, "results.csv" otherwise, via `write_results_to_file`.
    """
    two_targets = args["include_starr"] + args["include_istarr"] == 2

    # Run inference exactly once per split (train, val, test) and reuse the
    # predictions for both metrics instead of predicting again for each score.
    truths = [y_train, y_val, y_test]
    preds = [saved_model.predict(X) for X in (X_train, X_val, X_test)]

    if two_targets:
        starr_scores = _score_table([y[:, 0] for y in truths], [p[:, 0] for p in preds])
        istarr_scores = _score_table([y[:, 1] for y in truths], [p[:, 1] for p in preds])

        # write r2 and spearman scores for all of train, test, and val sets and starr & istarr
        write_results_to_file(dir_path+"/results_starr.csv", starr_scores)
        write_results_to_file(dir_path+"/results_istarr.csv", istarr_scores)
    else:
        # flatten so that (n, 1)-shaped predictions line up with the 1-D targets
        flat_preds = [p.reshape(1, -1)[0] for p in preds]

        # write r2 and spearman scores for all of train, test, and val sets
        write_results_to_file(dir_path+"/results.csv", _score_table(truths, flat_preds))


def write_results_to_file(path, scores):
    """Write a small CSV score table to `path` (the file is overwritten).

    `scores` is a nested list of strings: scores[0] holds the r2 values and
    scores[1] the spearman values, each ordered train, val, test.
    """
    with open(path, "w") as out_file:
        out_file.write(",train,val,test\n")
        for label, row_index in (("r2", 0), ("spearman", 1)):
            row = scores[row_index]
            out_file.write(",".join((label, row[0], row[1], row[2])) + "\n")


In [None]:
# Build the k-mer count features and targets for the train / val / test splits.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args)  # get dataset

# Instantiate the XGBoost regressor.
model = get_model(args)

# Create a timestamped results folder. os.makedirs also creates a missing
# "experiments" parent and does not fail if the folder already exists,
# unlike the shell `mkdir` used previously.
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
dir_path = "experiments/xgboost"+date
os.makedirs(dir_path, exist_ok=True)

# train model
model.fit(X_train, y_train)

# write r2 and spearman scores for all of train, test, and val sets
save_results(args, dir_path, X_train, X_test, X_val, y_train, y_test, y_val, model)

# write all args to text file for reproducibility
# (args is a plain dict, so it is dumped directly; vars(args) would raise TypeError)
with open(os.path.join(dir_path, "settings.txt"), "w") as settings_file:
    json.dump(args, settings_file)



read csv
including A
including T
including C
including G


In [None]:
# Second experiment configuration; replaces the `args` dict defined earlier in the notebook.
# NOTE(review): this dict has no "include_mononuc_freq", "include_dinuc_freq" or
# "include_trinuc_freq" keys, but train_test_val as defined in this notebook reads all three
# with args[...], so passing this dict to it would raise KeyError — confirm the intended values.
args = {"data_path":"data/processed/arabidopsis_every_16.csv",
        "include_starr":1,
        "include_istarr":0,
        "shuffle":0}

# NOTE(review): same one-hot table as the `mapping` defined in the first configuration cell.
mapping = {"A": [1.0, 0.0, 0.0, 0.0], "T": [0.0, 0.0, 0.0, 1.0], "C": [0.0, 1.0, 0.0, 0.0], "G": [0.0, 0.0, 1.0, 0.0]}  # cross referenced with kipoi data loader

In [None]:
# NOTE(review): this cell appears to be a leftover from a Keras-based workflow. With the helpers
# defined in this notebook, get_model returns an xgboost.XGBRegressor, so the compile(...) call and
# the Keras-style fit(epochs=..., batch_size=..., validation_data=..., callbacks=...) below are not
# expected to work on it. The recorded output (inputs shaped (n, 145, 4)) also does not match the
# k-mer count features produced by train_test_val above — presumably it came from different helper
# definitions. Confirm whether this cell should be removed or a Keras get_model restored.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val(args)  # get dataset

# Show the shapes of every split as a sanity check.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
# # models
model = get_model(args)  # instantiate and init model

# # create path to folder with results
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# arch_settings = str(args.conv_one_set)+str(args.conv_two_set)+str(args.conv_three_set)+str(args.linear_mapping)
# dir_path = "experiments/exp_"+date+"_"+arch_settings+"_lr"+str(args.learning_rate)+"_bs"+str(args.batch_size)+"_ep"+str(args.num_epochs)
# Fixed output folder (the timestamp in `date` is not used); nothing in this cell creates it explicitly.
dir_path = "experiments/test_starr"

# # compile model
# # model.compile(optimizer=Adam(lr=args.learning_rate),  # CHANGE IF WE WANT TO CHANGE OPTIM
# #               loss='mean_squared_error')
# NOTE(review): compile is a Keras model method — see the note at the top of this cell.
model.compile(loss='mean_squared_error', optimizer='adam')


# # init callbacks
# TensorBoard logs go to <dir_path>/logs; training stops early once val_loss has not improved by
# at least 0.001 for 20 epochs; the model with the best val_loss is checkpointed to best_model.h5.
logdir = os.path.join(dir_path, "logs")
tensorboard_callback = keras.callbacks.TensorBoard(logdir, histogram_freq=1)  # https://stackoverflow.com/questions/59894720/keras-and-tensorboard-attributeerror-sequential-object-has-no-attribute-g
es_callback = EarlyStopping(monitor='val_loss', verbose=1, patience=20, min_delta=0.001)
mc_callback = ModelCheckpoint(dir_path+'/best_model.h5', monitor='val_loss', save_best_only=True)

# train model (up to 100 epochs, batches of 512, validating on the val split each epoch)
history = model.fit(X_train, y_train,
                    epochs=100,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    callbacks=[tensorboard_callback, es_callback, mc_callback])

# save training history
hist_df = pd.DataFrame(history.history) 
hist_df.to_csv(dir_path+'/training_history.csv')

# load best model according to val_loss
saved_model = load_model(dir_path+'/best_model.h5')

# write r2 and spearman scores for all of train, test, and val sets
save_results(args, dir_path, X_train, X_test, X_val, y_train, y_test, y_val, saved_model)

# write all args to text file for reproducibility
# NOTE(review): args is a dict in this notebook, so vars(args) would raise TypeError if this line were re-enabled.
# json.dump(vars(args), open(dir_path+"/settings.txt", "w"))  # https://www.kite.com/python/answers/how-to-save-a-dictionary-to-a-file-in-python



(994521, 7)
(677510, 145, 4) (677510,) (161051, 145, 4) (161051,) (155960, 145, 4) (155960,)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 00049: early stopping
