In [29]:
import pandas as pd
import numpy as np

from collections import defaultdict

from sklearn.model_selection import train_test_split

# Motifs

In [4]:
# Root directory for every input and output file of this notebook.
# The value ends with '/', and the cells below build paths by plain string
# concatenation (data_dir + 'sub/path'), so the trailing slash must stay.
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [21]:
# Spreadsheet with the motif table (stored under dominguez_2018/).
motif_table_path = data_dir + 'dominguez_2018/1-s2.0-S1097276518303514-mmc4.xlsx'

# Read the second sheet (an integer sheet_name is a 0-based position) and skip
# the first spreadsheet row. Then keep every other column starting with the
# first and transpose, so the kept column labels become the row index.
# NOTE(review): presumably the dropped columns hold per-motif scores and the
# kept labels are protein names -- confirm against the spreadsheet.
table_motifs = (
    pd.read_excel(motif_table_path, sheet_name=1, skiprows=1)
    .iloc[:, ::2]
    .T
)

# motifs which don't overlap with the table
random_motifs = {
    'ACTCC', 'ACTTA', 'ATGTC', 'CCACA', 'TGACT',
    'TTCCG', 'TTGGG', 'GTGTA', 'ACAGG', 'TCGTA',
}

In [30]:
# Flatten the motif table into (protein, motif) pairs, one pair per string cell.
# Rows are visited in table order and cells left to right, so the motifs of each
# protein keep the order in which the table lists them.
motifs_df = [
    (protein_name, motif)
    for protein_name, motifs in table_motifs.iterrows()
    for motif in motifs
    if isinstance(motif, str)  # drops non-string cells (e.g. NaN padding)
]

# Append the extra motifs under the placeholder protein name 'NA'.
# random_motifs is a set of strings, whose iteration order changes between
# interpreter runs (string hash randomization); sorting makes the row order,
# and therefore the saved file, reproducible.
# NOTE(review): pandas.read_csv parses the literal text 'NA' as a missing value
# by default -- anything that reads these rows back from CSV needs
# keep_default_na=False to get the string 'NA'.
motifs_df.extend(('NA', motif) for motif in sorted(random_motifs))

In [31]:
# Turn the list of (protein, motif) pairs into a two-column table.
motifs_df = pd.DataFrame(data=motifs_df, columns=['protein', 'motif'])

In [32]:
# Save the motif list; first motif for each protein is the top motif from the table.
# index=False is the documented way to omit the row index (index=None only
# worked because None is falsy).
motifs_df.to_csv(data_dir + 'motif_predictions/motifs.csv', index=False)

# Train/test split

In [41]:
#get human 3'UTR sequences

human_fasta = data_dir + 'fasta/240_mammals/species/Homo_sapiens.fa'

# Sequence lines are collected per record and joined once at the end; growing
# a string with `+=` inside a dict re-copies the whole sequence on every line.
# Records that share a name are concatenated in file order.
seq_chunks = defaultdict(list)

# Reset explicitly so a re-run of this cell cannot pick up the name left over
# from a previous execution.
seq_name = None

with open(human_fasta, 'r') as f:
    for line in f:
        line = line.rstrip()  # also removes the newline from header lines
        if line.startswith('>'):
            # record name = header text before the first ':'
            seq_name = line[1:].split(':')[0]
        elif seq_name is None:
            raise ValueError(
                f'{human_fasta}: sequence data found before the first ">" header'
            )
        else:
            seq_chunks[seq_name].append(line.upper())

# One row per record, in order of first appearance in the file.
dataset = pd.DataFrame(
    [(name, ''.join(chunks)) for name, chunks in seq_chunks.items()],
    columns=['seq_name', 'seq'],
)

In [42]:
# Stratification label per sequence: the third-from-last '_'-separated field
# of seq_name (assumes names end in ..._<chrom>_<field>_<field> -- TODO confirm).
chrom = dataset['seq_name'].map(lambda name: name.split('_')[-3])

In [43]:
# 75%/25% train/test split, stratified by chromosome.
# random_state is fixed so the same rows land in each part on every run.
train_df, test_df = train_test_split(
    dataset,
    test_size=0.25,
    random_state=1,
    stratify=chrom,
)

In [44]:
# Write both parts of the split next to each other.
# data_dir already ends with '/', so the sub-path is appended without a leading
# slash (the previous form produced '.../MLM//motif_predictions/...').
split_dir = data_dir + 'motif_predictions/split_75_25/'

# index=False is the documented way to omit the row index (index=None only
# worked because None is falsy).
train_df.to_csv(split_dir + 'train.csv', index=False)
test_df.to_csv(split_dir + 'test.csv', index=False)