In [1]:
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Motifs

In [84]:
motif_len = 5  # k-mer length of the motifs to export: 5 or 6
# Fail early on an unsupported length: the Excel sheet index is derived from
# this value (motif_len - 4), so a wrong value would silently read another sheet.
if motif_len not in (5, 6):
    raise ValueError(f'motif_len must be 5 or 6, got {motif_len}')

In [85]:
# Root directory for the input and output files of this notebook.
# The trailing slash matters: paths are built from it by plain string concatenation.
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [86]:
# Read the motif table from the Excel workbook under dominguez_2018/.
# The sheet is chosen by position (motif_len - 4) and the first row is skipped.
motif_table_path = data_dir + 'dominguez_2018/1-s2.0-S1097276518303514-mmc4.xlsx'
motif_sheet_idx = motif_len - 4
table_motifs = pd.read_excel(
    motif_table_path,
    sheet_name=motif_sheet_idx,
    skiprows=1,
)

In [87]:
# Reshape the wide motif table into a list of per-protein frames.
# Columns are consumed in adjacent pairs: the header of the first column of each
# pair is taken as the protein name, and the pair is relabelled (motif, stepwise_r).
res = []

for col_idx in range(0, len(table_motifs.columns), 2):
    protein_name = table_motifs.columns[col_idx]
    # .copy() gives an independent frame, so the assignments below never write
    # into a slice of table_motifs (avoids SettingWithCopyWarning).
    protein_df = table_motifs.iloc[:, col_idx:col_idx + 2].copy()
    protein_df.columns = ['motif', 'stepwise_r']
    protein_df['protein'] = protein_name
    # Drop rows without a motif (presumably padding for proteins with fewer motifs).
    protein_df = protein_df[~protein_df['motif'].isna()]
    res.append(protein_df)

In [88]:
# Stack the per-protein frames and put the columns in a fixed order.
motif_columns = ['protein', 'motif', 'stepwise_r']
stacked_motifs = pd.concat(res)
motifs_df = stacked_motifs[motif_columns]

In [89]:
# Choose 10 control motifs that are absent from the motif table.
if motif_len == 5:
    # Fixed 5-mers which don't overlap with the table.
    random_motifs = {'ACTCC', 'ACTTA', 'ATGTC', 'CCACA', 'TGACT', 'TTCCG', 'TTGGG', 'GTGTA', 'ACAGG', 'TCGTA'}
else:
    # All k-mers over the DNA alphabet, minus those already in the table.
    all_kmers = {''.join(p) for p in itertools.product('ACTG', repeat=motif_len)}
    # Separate name so the table_motifs DataFrame is not overwritten by a set.
    known_motifs = set(motifs_df.motif)
    # Sort before sampling: the iteration order of a set of strings can differ
    # between interpreter sessions, which would defeat the fixed seed.
    candidate_motifs = sorted(all_kmers - known_motifs)
    np.random.seed(42)
    # replace=False keeps the 10 motifs distinct (raises ValueError if fewer
    # than 10 candidates are available).
    random_motifs = np.random.choice(candidate_motifs, 10, replace=False)

In [90]:
# Append the control motifs below the table motifs; they only have a 'motif'
# value, so their other columns are NaN.
# sorted() fixes the row order: random_motifs may be a set, whose iteration
# order for strings is not stable between interpreter sessions.
# NOTE: this cell is not idempotent - re-running it appends the motifs again.
random_motifs_df = pd.DataFrame(sorted(random_motifs), columns=['motif'])
motifs_df = pd.concat([motifs_df, random_motifs_df])

In [91]:
# Keep only the two columns that are exported.
export_columns = ['protein', 'motif']
motifs_df = motifs_df.reindex(columns=export_columns)

In [92]:
# Write the motif list for this k-mer length, without the row index.
# Row order is preserved, so the first motif for each protein is the top motif from the table.
# index=False is the documented way to omit the index (index=None only worked by being falsy).
motifs_df.to_csv(data_dir + f'motif_predictions/motifs_k{motif_len}.csv', index=False)

# Train/test split

In [41]:
#get human 3'UTR sequences

human_fasta = data_dir + 'fasta/240_mammals/species/Homo_sapiens.fa'

# Sequence lines are collected per record name and joined once at the end.
# Records that share a name are concatenated, as before.
seq_chunks = defaultdict(list)
seq_name = None

with open(human_fasta, 'r') as f:
    for line in f:
        # Strip the line ending first so a header without ':' does not keep
        # its trailing newline in the sequence name.
        line = line.rstrip()
        if line.startswith('>'):
            # Record name = header text between '>' and the first ':'.
            seq_name = line[1:].split(':')[0]
        elif not line:
            continue  # blank lines carry no sequence data
        elif seq_name is None:
            raise ValueError(f'{human_fasta}: sequence data found before the first FASTA header')
        else:
            seq_chunks[seq_name].append(line.upper())

dataset = pd.DataFrame(
    [(name, ''.join(chunks)) for name, chunks in seq_chunks.items()],
    columns=['seq_name', 'seq'],
)

In [42]:
# Chromosome label per sequence: the third-from-last '_'-separated field of seq_name.
chrom = dataset['seq_name'].map(lambda name: name.split('_')[-3])

In [43]:
# Hold out 25% of the sequences for testing and keep 75% for training.
# The split is stratified on the chromosome label and uses a fixed seed.
train_df, test_df = train_test_split(
    dataset,
    test_size=0.25,
    stratify=chrom,
    random_state=1,
)

In [44]:
# Write both halves of the split, without the row index.
# data_dir already ends with '/', so the relative part must not start with one
# (the original produced '.../MLM//motif_predictions/...').
split_dir = data_dir + 'motif_predictions/split_75_25/'
train_df.to_csv(split_dir + 'train.csv', index=False)
test_df.to_csv(split_dir + 'test.csv', index=False)