In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np

In [None]:
# Load the pretrained tokenizer and masked-LM model from a single checkpoint id
checkpoint = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Maximum input length configured on the tokenizer; reused below as the
# padding length when sequences are tokenized inside GeneExpressionDataset.
max_length = tokenizer.model_max_length

In [57]:
def extract_expression(df, n_tissues=218, first_tissue='Adipose_Subcutaneous'):
    """Split a merged gene table into sequences and per-tissue expression values.

    Args:
        df: DataFrame holding a ``seq`` column and a contiguous run of
            expression columns that begins at ``first_tissue``.
        n_tissues: Number of consecutive columns, counted from
            ``first_tissue``, to treat as expression values. Defaults to 218.
        first_tissue: Label of the first expression column.

    Returns:
        Tuple ``(sequences, expression_vals, tissues)``: the ``seq`` column,
        the selected expression columns as a DataFrame, and the labels of
        those columns as a list.

    Raises:
        KeyError: if ``first_tissue`` or ``seq`` is not a column of ``df``.
    """
    tissue_start = df.columns.get_loc(first_tissue)
    # Positional slice: if the frame has fewer than `n_tissues` columns after
    # `first_tissue`, the slice simply stops at the last column.
    expression_vals = df.iloc[:, tissue_start:tissue_start + n_tissues]
    tissues = list(expression_vals.columns)
    sequences = df['seq']
    return sequences, expression_vals, tissues

In [59]:
class GeneExpressionDataset(Dataset):
    """Map-style dataset pairing each sequence with its per-tissue expression values.

    Args:
        dataset: DataFrame accepted by ``extract_expression`` (it must provide
            the ``seq`` column and the expression columns).
        log: If true, expression values are returned as natural logarithms;
            otherwise they are returned untransformed.
        tokenizer: Callable invoked as ``tokenizer(sequence, return_tensors="pt",
            padding="max_length", max_length=...)``; its result must expose an
            ``"input_ids"`` entry.
        max_length: Length that every tokenized sequence is padded to.
    """

    def __init__(self, dataset, log, tokenizer, max_length):
        self.dataset = dataset
        self.log = log
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.sequences, self.expression_vals, self.tissues = extract_expression(self.dataset)

    def __getitem__(self, index):
        """Return ``(token_ids, expression_values)`` for the row at position ``index``."""
        # Positional lookup, so the DataFrame's own index labels are irrelevant.
        sequence = self.sequences.iloc[index]
        expression_values = np.array(self.expression_vals.iloc[index])
        if self.log:
            # NOTE(review): np.log maps a zero expression value to -inf; confirm
            # whether a pseudocount / log1p is wanted before training on these.
            expression_values = np.log(expression_values)
        # Pad to the length given to the constructor rather than relying on a
        # notebook-level global of the same name.
        tokens_ids = self.tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_len,
        )["input_ids"]
        return tokens_ids, expression_values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataset)

In [None]:
# Read the table generated by generate_seq.py and discard its 'Unnamed: 0' column
geneanno_merged = (
    pd.read_csv("./data/sequence_exp.csv")
    .drop(columns='Unnamed: 0')
)
geneanno_merged.head()

Unnamed: 0,id,symbol,seqnames,strand,TSS,CAGE_representative_TSS,type,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,...,GM12878.1,HELA,HEPG2,HMEC,HSMM,HUVEC,K562.1,NHEK,NHLF,seq
0,ENSG00000000419,DPM1,chr20,-,49575092,49575069,protein_coding,22.491345,23.243749,29.331713,...,79.197,107.098,62.811,42.386,54.869,16.652,73.719,56.578,56.371,TTGGTCACTGTCTCCGCTGGGGGTGGTTGGGGGAATATGCAGCGAT...
1,ENSG00000000457,SCYL3,chr1,-,169863408,169863037,protein_coding,4.539384,3.732321,4.27198,...,11.082,8.814,2.646,2.483,2.527,2.549,7.651,4.967,3.714,tgtttttcagtccaaacctattcctgaatctgtgtagccatccctt...
2,ENSG00000000460,C1orf112,chr1,+,169631245,169764186,protein_coding,1.048864,0.769213,0.658967,...,13.743,25.369,3.373,4.646,2.179,4.099,22.103,3.29,2.491,gtaaccaaaacagcatggtactggtactgagaggtgacagcgtgct...
3,ENSG00000000938,FGR,chr1,-,27961788,27961654,protein_coding,14.626878,15.188205,4.254194,...,98.303,0.016,0.008,0.052,0.001,0.137,0.016,0.047,0.0,tacaggcgcccgccaccacgcccggctaattttttgatttttagta...
4,ENSG00000000971,CFH,chr1,+,196621008,196621174,protein_coding,74.251434,37.978969,20.765548,...,0.002,124.801,0.007,0.419,39.724,8.464,41.504,1.348,32.725,ttatccattacacctaaacataaattatgtcccagtgtttcactac...


In [61]:
# Hold out every row whose chromosome is chr8 as the test set; all remaining rows form the training set
on_chr8 = geneanno_merged['seqnames'].eq('chr8')
train_df = geneanno_merged.loc[~on_chr8]
test_df = geneanno_merged.loc[on_chr8]

In [None]:
# Wrap each split in a dataset; both share the tokenizer and padding length and request log-transformed targets
train_dataset = GeneExpressionDataset(
    dataset=train_df, log=True, tokenizer=tokenizer, max_length=max_length
)
test_dataset = GeneExpressionDataset(
    dataset=test_df, log=True, tokenizer=tokenizer, max_length=max_length
)