Import packages

In [1]:
import numpy as np
import pandas as pd
from seekr.kmer_counts import BasicCounter
from seekr.pearson import pearson
from seekr.fasta_reader import Reader

Set the paths to the input FASTA files: the human GENCODE transcript set used for normalisation (v41, all transcripts), the Paroedura picta (Madagascar ground gecko) de novo transcriptome, and the sequences of the lncRNAs of interest (MEG3, EVX1AS, NEAT1).

In [2]:
# Reference FASTA: the loops below count k-mers over it to obtain the
# mean/std vectors used for normalisation.
gencode = 'v41_all.fa'
# Target FASTA: the Paroedura picta de novo transcriptome that every query
# is compared against. NOTE: despite the variable name, this file is the
# whole transcriptome, not a curated lncRNA set.
lncRNAs = 'Paroedura_picta_denovo_transcriptome.fasta'
# Query FASTAs, one per human lncRNA of interest.
MEG3, EVX1AS, NEAT1 = 'MEG3.fa', 'EVX1AS.fa', 'NEAT1.fa'

In [3]:
# Build one label per record in the FASTA referenced by `lncRNAs`:
# strip '>' characters from both ends of the header and append the record's
# position, so labels stay unique even when two headers are identical.
headers = Reader(lncRNAs).get_headers()
names = [f"{header.strip('>')}_{position}" for position, header in enumerate(headers)]

The pipeline described in https://github.com/CalabreseLab/seekr is carried out for k = 3 and k = 4, once for each of the three human lncRNAs (MEG3, NEAT1, EVX1AS). Results are stored locally.

In [4]:
for k in (3, 4):
    # --- Normalisation vectors ------------------------------------------
    # Count k-mers over the GENCODE reference and save the counter's `mean`
    # and `std` attributes; they are handed by path to the counters below.
    mean_path, std_path = f'mean_{k}mers.npy', f'std_{k}mers.npy'
    gencode_counter = BasicCounter(gencode, k=k)
    gencode_counter.get_counts()
    np.save(mean_path, gencode_counter.mean)
    np.save(std_path, gencode_counter.std)

    # --- k-mer counts ---------------------------------------------------
    # Query and target use the same k and the same normalisation vectors.
    shared_kwargs = {'mean': mean_path, 'std': std_path, 'k': k}
    MEG3_counter = BasicCounter(MEG3, outfile=f'{k}mers_MEG3.npy', **shared_kwargs)
    lncs_counter = BasicCounter(lncRNAs, outfile=f'{k}mers_lncs.npy', **shared_kwargs)
    MEG3_counter.make_count_file(names=['MEG3'])
    lncs_counter.make_count_file(names=names)

    # --- Similarities ---------------------------------------------------
    # Compare the MEG3 profile with every transcript profile via seekr's
    # `pearson`; the raw matrix is also written to the given .npy file.
    sim = pearson(MEG3_counter.counts,
                  lncs_counter.counts,
                  outfile=f'MEG3_vs_lncs_{k}mers.npy')

    # Labelled table: a single 'MEG3' row, one column per transcript name.
    sim_df = pd.DataFrame(sim, index=['MEG3'], columns=names)
    sim_df.to_csv(f'MEG3_vs_lncs_{k}mers.csv')

    # One row per transcript, highest similarity first; keep the top 1000.
    sim_df_transpose = sim_df.T.sort_values(by=['MEG3'], ascending=False)
    sim_df_t_head = sim_df_transpose.head(1000)
    sim_df_t_head.to_csv(f'MEG3_vs_lncs_{k}mers_head.csv')

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]

In [5]:
for k in (3, 4):
    # --- Normalisation vectors ------------------------------------------
    # Count k-mers over the GENCODE reference and save the counter's `mean`
    # and `std` attributes; they are handed by path to the counters below.
    mean_path, std_path = f'mean_{k}mers.npy', f'std_{k}mers.npy'
    gencode_counter = BasicCounter(gencode, k=k)
    gencode_counter.get_counts()
    np.save(mean_path, gencode_counter.mean)
    np.save(std_path, gencode_counter.std)

    # --- k-mer counts ---------------------------------------------------
    # Query and target use the same k and the same normalisation vectors.
    shared_kwargs = {'mean': mean_path, 'std': std_path, 'k': k}
    NEAT1_counter = BasicCounter(NEAT1, outfile=f'{k}mers_NEAT1.npy', **shared_kwargs)
    lncs_counter = BasicCounter(lncRNAs, outfile=f'{k}mers_lncs.npy', **shared_kwargs)
    NEAT1_counter.make_count_file(names=['NEAT1'])
    lncs_counter.make_count_file(names=names)

    # --- Similarities ---------------------------------------------------
    # Compare the NEAT1 profile with every transcript profile via seekr's
    # `pearson`; the raw matrix is also written to the given .npy file.
    sim = pearson(NEAT1_counter.counts,
                  lncs_counter.counts,
                  outfile=f'NEAT1_vs_lncs_{k}mers.npy')

    # Labelled table: a single 'NEAT1' row, one column per transcript name.
    sim_df = pd.DataFrame(sim, index=['NEAT1'], columns=names)
    sim_df.to_csv(f'NEAT1_vs_lncs_{k}mers.csv')

    # One row per transcript, highest similarity first; keep the top 1000.
    sim_df_transpose = sim_df.T.sort_values(by=['NEAT1'], ascending=False)
    sim_df_t_head = sim_df_transpose.head(1000)
    sim_df_t_head.to_csv(f'NEAT1_vs_lncs_{k}mers_head.csv')

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]

In [6]:
for k in (3, 4):
    # --- Normalisation vectors ------------------------------------------
    # Count k-mers over the GENCODE reference and save the counter's `mean`
    # and `std` attributes; they are handed by path to the counters below.
    mean_path, std_path = f'mean_{k}mers.npy', f'std_{k}mers.npy'
    gencode_counter = BasicCounter(gencode, k=k)
    gencode_counter.get_counts()
    np.save(mean_path, gencode_counter.mean)
    np.save(std_path, gencode_counter.std)

    # --- k-mer counts ---------------------------------------------------
    # Query and target use the same k and the same normalisation vectors.
    shared_kwargs = {'mean': mean_path, 'std': std_path, 'k': k}
    EVX1AS_counter = BasicCounter(EVX1AS, outfile=f'{k}mers_EVX1AS.npy', **shared_kwargs)
    lncs_counter = BasicCounter(lncRNAs, outfile=f'{k}mers_lncs.npy', **shared_kwargs)
    EVX1AS_counter.make_count_file(names=['EVX1AS'])
    lncs_counter.make_count_file(names=names)

    # --- Similarities ---------------------------------------------------
    # Compare the EVX1AS profile with every transcript profile via seekr's
    # `pearson`; the raw matrix is also written to the given .npy file.
    sim = pearson(EVX1AS_counter.counts,
                  lncs_counter.counts,
                  outfile=f'EVX1AS_vs_lncs_{k}mers.npy')

    # Labelled table: a single 'EVX1AS' row, one column per transcript name.
    sim_df = pd.DataFrame(sim, index=['EVX1AS'], columns=names)
    sim_df.to_csv(f'EVX1AS_vs_lncs_{k}mers.csv')

    # One row per transcript, highest similarity first; keep the top 1000.
    sim_df_transpose = sim_df.T.sort_values(by=['EVX1AS'], ascending=False)
    sim_df_t_head = sim_df_transpose.head(1000)
    sim_df_t_head.to_csv(f'EVX1AS_vs_lncs_{k}mers_head.csv')

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]

  0%|          | 0/251236 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/996336 [00:00<?, ?it/s]