In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np

In [2]:
# Load the tokenizer and the masked-LM model from the same Hugging Face checkpoint.
# The checkpoint id lives in one constant so the tokenizer and the model cannot
# accidentally be loaded from different checkpoints.
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

  Referenced from: <FB2FD416-6C4D-3621-B677-61F07C02A3C5> /Users/jasmineliu/miniforge3/envs/CS2952G/lib/python3.9/site-packages/torchvision/image.so
  warn(


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Maximum input length reported by the tokenizer; passed to the datasets below,
# where it is used as the `max_length` for padding tokenized sequences.
max_length = tokenizer.model_max_length

In [4]:
# def extract_expression(df):
#     tissue_start = df.columns.get_loc('Adipose_Subcutaneous')
#     n_tissues = 218
#     expression_vals = df.iloc[:, tissue_start : tissue_start + n_tissues]
#     tissues = list(expression_vals.columns)
#     sequences = df['seq']
#     return sequences, expression_vals, tissues

def extract_expression(df):
    """Split a merged annotation frame into sequences and expression values.

    The expression columns are taken positionally: every column located
    strictly between the ``'type'`` column and the ``'seq'`` column.

    Args:
        df: DataFrame containing a ``'type'`` column, a ``'seq'`` column and
            the expression columns in between.

    Returns:
        A ``(sequences, expression_vals, tissues)`` tuple where ``sequences``
        is ``df['seq']``, ``expression_vals`` is the positional column slice
        described above, and ``tissues`` is the list of its column names.
    """
    column_index = df.columns
    # Expression columns start right after 'type' and stop just before 'seq'.
    first_expr_col = column_index.get_loc('type') + 1
    seq_col = column_index.get_loc('seq')
    expression_vals = df.iloc[:, first_expr_col:seq_col]
    return df['seq'], expression_vals, [*expression_vals.columns]

In [5]:
class GeneExpressionDataset(Dataset):
    """Map-style dataset pairing each sequence with its expression values.

    Indexing yields a ``(token_ids, expression_values)`` tuple built from one
    row of the wrapped dataframe.

    Args:
        dataset: DataFrame accepted by ``extract_expression`` (it needs a
            ``'type'`` column, a ``'seq'`` column and the expression columns
            in between).
        log: When truthy, items carry ``log(value + LOG_EPSILON)``; otherwise
            the raw expression values are returned.
        tokenizer: Tokenizer callable; it is invoked as
            ``tokenizer(sequence, return_tensors="pt", padding="max_length",
            max_length=...)`` and its result is indexed with ``"input_ids"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    # Small offset added before taking the logarithm so that an expression
    # value of exactly 0 does not produce -inf.
    LOG_EPSILON = 1e-8

    def __init__(self, dataset, log, tokenizer, max_length):
        self.dataset = dataset
        self.log = log
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.sequences, self.expression_vals, self.tissues = extract_expression(self.dataset)

    def __getitem__(self, index):
        """Return ``(token_ids, expression_values)`` for the row at ``index``."""
        # Positional lookup: the wrapped frame may be a filtered subset whose
        # index labels are not 0..n-1.
        sequence = self.sequences.iloc[index]
        expression_values = np.array(self.expression_vals.iloc[index])
        # Honour the `log` flag given at construction time.
        if self.log:
            expression_values = np.log(expression_values + self.LOG_EPSILON)
        # Use the length stored on the instance rather than a notebook-level
        # global, so the dataset works wherever it is instantiated.
        # NOTE(review): with return_tensors="pt" the ids presumably keep a
        # leading batch dimension of 1, and no truncation is requested for
        # sequences longer than max_length — confirm before batching.
        tokens_ids = self.tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_len,
        )["input_ids"]
        return tokens_ids, expression_values

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataset)

In [7]:
# Human table: file generated by generate_seq.py. The 'Unnamed: 0' column
# (apparently a saved row index) is dropped right after reading.
geneanno_merged = (
    pd.read_csv("./data/sequence_exp.csv")
    .drop(columns='Unnamed: 0')
)
geneanno_merged.head()

Unnamed: 0,id,symbol,seqnames,strand,TSS,CAGE_representative_TSS,type,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,...,GM12878.1,HELA,HEPG2,HMEC,HSMM,HUVEC,K562.1,NHEK,NHLF,seq
0,ENSG00000000419,DPM1,chr20,-,49575092,49575069,protein_coding,22.491345,23.243749,29.331713,...,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371,TTGGTCACTGTCTCCGCTGGGGGTGGTTGGGGGAATATGCAGCGAT...
1,ENSG00000000457,SCYL3,chr1,-,169863408,169863037,protein_coding,4.539384,3.732321,4.27198,...,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714,tgtttttcagtccaaacctattcctgaatctgtgtagccatccctt...
2,ENSG00000000460,C1orf112,chr1,+,169631245,169764186,protein_coding,1.048864,0.769213,0.658967,...,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491,gtaaccaaaacagcatggtactggtactgagaggtgacagcgtgct...
3,ENSG00000000938,FGR,chr1,-,27961788,27961654,protein_coding,14.626878,15.188205,4.254194,...,98.303,0.016,0.008,0.052,0.001,0.137,0.016,0.047,0.0,tacaggcgcccgccaccacgcccggctaattttttgatttttagta...
4,ENSG00000000971,CFH,chr1,+,196621008,196621174,protein_coding,74.251434,37.978969,20.765548,...,0.002,124.801,0.007,0.419,39.724,8.464,41.504,1.348,32.725,ttatccattacacctaaacataaattatgtcccagtgtttcactac...


In [8]:
# Hold out chromosome 8 as the test set; all other rows form the training set.
train_df = geneanno_merged.loc[geneanno_merged['seqnames'].ne('chr8')]
test_df = geneanno_merged.loc[geneanno_merged['seqnames'].eq('chr8')]

In [9]:
# Wrap the human train/test frames as datasets (log flag set to True).
train_dataset = GeneExpressionDataset(
    dataset=train_df, log=True, tokenizer=tokenizer, max_length=max_length
)
test_dataset = GeneExpressionDataset(
    dataset=test_df, log=True, tokenizer=tokenizer, max_length=max_length
)

In [11]:
# Mouse data: read the table and drop the 'Unnamed: 0' column
# (apparently a saved row index).
geneanno_merged_mouse = (
    pd.read_csv("./data/sequence_exp_mouse.csv")
    .drop(columns='Unnamed: 0')
)
geneanno_merged_mouse.head()

Unnamed: 0,id,symbol,seqnames,strand,TSS,type,CD4_positive_naive_resting_alpha_beta_T_cell,CD8_positive_naive_resting_alpha_beta_T_cell,hippocampus,heart,gastrocnemius,monocyte,neutrophil,T_cell,left_cerebral_cortex,adrenal_gland,B_cell,seq
0,ENSMUSG00000098104.1,Gm6085,chr1,-,4687934,processed_pseudogene,1.82,3.05,0.6,0.52,0.44,1.85,0.69,2.09,0.62,0.94,2.12,GGGCGAGCTGCCCAGGGATGTGCGCAAGTCACAGGCCCGACTCCTT...
1,ENSMUSG00000033845.13,Mrpl15,chr1,-,4773206,protein_coding,14.03,19.64,18.1,48.26,45.57,10.01,8.99,19.71,17.51,38.21,12.48,GACCAGTACTCGTTTCACTTAGGGTGGAGGAGAGAGCTCTGATGTT...
2,ENSMUSG00000102275.1,Gm37144,chr1,-,4778063,TEC,1.27,1.03,0.61,0.09,0.11,0.06,0.21,0.58,0.18,0.16,0.72,TTATAAATATCCACCATTATCAAATATAGATTGGGAACTAGTCTCA...
3,ENSMUSG00000025903.14,Lypla1,chr1,+,4807788,protein_coding,18.79,17.56,7.69,23.44,28.44,23.44,16.41,18.63,10.09,32.49,22.67,CACCCCAACTTGATGTATTCTTGTCCTCTCTAGTGAGATAATGGTA...
4,ENSMUSG00000033813.15,Tcea1,chr1,+,4857814,protein_coding,17.78,16.37,18.7,10.11,9.04,19.31,9.42,18.24,17.45,24.99,22.17,TCGAGACAGGGTTTCTCTGTGTAGCTCTGGCTGTCCTGGAACTCAC...


In [12]:
# Hold out chromosome 8 as the mouse test set; all other rows form the training set.
mouse_train_df = geneanno_merged_mouse.loc[geneanno_merged_mouse['seqnames'].ne('chr8')]
mouse_test_df = geneanno_merged_mouse.loc[geneanno_merged_mouse['seqnames'].eq('chr8')]

In [13]:
# Wrap the mouse train/test frames as datasets (log flag set to True).
mouse_train_dataset = GeneExpressionDataset(
    dataset=mouse_train_df, log=True, tokenizer=tokenizer, max_length=max_length
)
mouse_test_dataset = GeneExpressionDataset(
    dataset=mouse_test_df, log=True, tokenizer=tokenizer, max_length=max_length
)

In [None]:
# Expression values of the first human training item, as returned by the
# dataset's __getitem__ (i.e. log-transformed, log(x + 1e-8), not raw values).
print(train_dataset[0][1])

[3.11313058 3.14603622 3.37866929 3.08798982 3.05522741 3.06032885
 3.02593536 2.38994331 2.55926272 2.54449142 2.81869746 2.72902872
 2.66026702 2.76062388 2.51062931 2.65230778 2.54738399 2.42287895
 2.40282968 2.43140164 3.20257365 3.63915418 3.33537023 3.11914714
 3.14804863 3.04143845 3.05427477 3.04701523 3.1314471  3.01098695
 3.16427261 2.93889406 2.89378867 2.91724915 2.7280068  3.02988663
 2.97853312 2.89149303 3.11007283 3.18843496 2.86086668 3.02116187
 3.10360548 3.07332888 3.04856158 3.04656143 3.25992725 3.07564341
 3.54795929 3.17451898 3.20160469 3.13335813 2.0047193  2.92154738
 3.78962936 3.63429126 3.71064095 3.17366923 2.91452222 3.92296295
 2.9606231  3.54240755 3.84844402 3.26556855 3.72472911 3.27695604
 4.13820195 4.13091754 3.6319122  4.11879324 4.3827139  5.02683667
 3.7215483  4.29804702 4.08066813 3.68286138 4.53285484 4.58868505
 4.45829299 3.80477181 3.75612914 3.05376531 3.34638915 4.59091912
 4.77178571 4.51535488 5.42064561 4.45881405 4.48384965 4.3048

In [16]:
# Expression values of the first mouse training item, as returned by the
# dataset's __getitem__ (i.e. log-transformed, log(x + 1e-8), not raw values).
print(mouse_train_dataset[0][1])

[ 0.59883651  1.11514159 -0.51082561 -0.65392645 -0.82098053  0.61518564
 -0.37106367  0.73716407 -0.47803578 -0.06187539  0.75141609]
