In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# Read the miRNA/target pair table; the first CSV column becomes the row index.
raw_pairs_path = 'data/miRNA-Target.csv'
df = pd.read_csv(raw_pairs_path, index_col=0)
df.head()

Unnamed: 0,miRNA_ID,miRNA,Target_ID,Target,Y
0,ath-miR398c-3p,UGUGUUCUCAGGUCACCCCUG,817365,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,1
1,ath-miR398b-3p,UGUGUUCUCAGGUCACCCCUG,817365,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,1
2,ath-miR398b-3p,UGUGUUCUCAGGUCACCCCUG,837405,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,1
3,ath-miR398a-3p,UGUGUUCUCAGGUCACCCCUU,817365,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,1
4,ath-miR398a-3p,UGUGUUCUCAGGUCACCCCUU,837405,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,1


In [3]:
# Discard the identifier columns and the Y column, leaving only the
# miRNA and Target sequence columns.
df = df.drop(columns=['miRNA_ID', 'Target_ID', 'Y'])
df.head()

Unnamed: 0,miRNA,Target
0,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...
1,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...
2,UGUGUUCUCAGGUCACCCCUG,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...
3,UGUGUUCUCAGGUCACCCCUU,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...
4,UGUGUUCUCAGGUCACCCCUU,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...


In [4]:
def generate_kmers(sequence, k, step=None, drop_incomplete=False):
    """
    Split a sequence into k-mers (substrings of length ``k``).

    By default the windows do not overlap: a new k-mer starts every ``k``
    characters, and when ``len(sequence)`` is not a multiple of ``k`` the
    final element is a shorter leftover chunk.

    Args:
        sequence (str): The input sequence (e.g., an RNA or protein sequence).
        k (int): The length of each k-mer; must be at least 1.
        step (int, optional): Distance between the start positions of
            consecutive k-mers. Defaults to ``k`` (non-overlapping chunks);
            pass ``step=1`` for fully overlapping k-mers.
        drop_incomplete (bool): When True, windows shorter than ``k`` at the
            end of the sequence are discarded. Defaults to False.

    Returns:
        List[str]: The k-mers, ordered by start position.

    Raises:
        ValueError: If ``k`` or ``step`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if step is None:
        # Default stride equals the window size, i.e. non-overlapping chunks.
        step = k
    elif step < 1:
        raise ValueError(f"step must be a positive integer, got {step}")

    kmers = [sequence[start:start + k] for start in range(0, len(sequence), step)]
    if drop_incomplete:
        # Slicing past the end yields short pieces; keep full-length windows only.
        kmers = [kmer for kmer in kmers if len(kmer) == k]
    return kmers

In [5]:
# Tokenise each miRNA into single characters (k=1) and each target into
# consecutive chunks of three characters (k=3).
df['miRNA_kmers'] = df['miRNA'].apply(generate_kmers, k=1)
df['Target_kmers'] = df['Target'].apply(generate_kmers, k=3)
df.head()

Unnamed: 0,miRNA,Target,miRNA_kmers,Target_kmers
0,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ..."
1,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ..."
2,UGUGUUCUCAGGUCACCCCUG,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAK, GVA, VLN, SSE, GVT, GTI, FFT, QEG, DGV, ..."
3,UGUGUUCUCAGGUCACCCCUU,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ..."
4,UGUGUUCUCAGGUCACCCCUU,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAK, GVA, VLN, SSE, GVT, GTI, FFT, QEG, DGV, ..."


In [6]:
# Join the miRNA tokens and the target tokens of every row into one list.
# Adding the two object-dtype columns applies `+` element by element
# (list + list), so no Python lambda has to be called once per row through
# DataFrame.apply(axis=1).
df['kmerization'] = df['miRNA_kmers'] + df['Target_kmers']
df.head()

Unnamed: 0,miRNA,Target,miRNA_kmers,Target_kmers,kmerization
0,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ...","[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ..."
1,UGUGUUCUCAGGUCACCCCUG,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ...","[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ..."
2,UGUGUUCUCAGGUCACCCCUG,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAK, GVA, VLN, SSE, GVT, GTI, FFT, QEG, DGV, ...","[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ..."
3,UGUGUUCUCAGGUCACCCCUU,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAA, TNT, ILA, FSS, PSR, LLI, PPS, SNP, STL, ...","[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ..."
4,UGUGUUCUCAGGUCACCCCUU,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,"[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...","[MAK, GVA, VLN, SSE, GVT, GTI, FFT, QEG, DGV, ...","[U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ..."


In [29]:
# Number of chunks produced for the target stored at row position 1000.
len(df['Target_kmers'].iat[1000])

87

In [7]:
# Step 1: Collect the k-mers of every row into one long list
flattened_kmers = []
for row_kmers in df['Target_kmers']:
    flattened_kmers.extend(row_kmers)

# Step 2: Tally how often each distinct k-mer occurs
kmer_counts = Counter(flattened_kmers)

# Step 3: Lay the tallies out as a two-column DataFrame for inspection
kmer_counts_df = pd.DataFrame(
    {
        'kmer': list(kmer_counts.keys()),
        'count': list(kmer_counts.values()),
    }
)

# Report how many distinct k-mers exist, followed by the table itself
total_unique_kmers = len(kmer_counts)
print(f"Total number of unique k-mers: {total_unique_kmers}")
print(kmer_counts_df)

Total number of unique k-mers: 8453
     kmer  count
0     MAA  32659
1     TNT   8469
2     ILA  21937
3     FSS  23246
4     PSR  21646
...   ...    ...
8448  KUH      1
8449  RUI      3
8450  ULG      6
8451  GGU      2
8452  LUG      1

[8453 rows x 2 columns]


In [8]:
# Entries shorter than three characters: the leftover chunk that remains when
# a target's length is not a multiple of 3.
is_short_kmer = kmer_counts_df['kmer'].str.len().lt(3)
kmer_counts_df[is_short_kmer]

Unnamed: 0,kmer,count
101,QG,162
935,L,13377
968,TD,571
1256,QC,120
1474,QV,447
...,...,...
8428,IW,37
8429,CA,118
8430,NW,6
8431,YM,68


In [9]:
# Count the target sequences that contain each of the letters X, B, Z and J.
# A single loop replaces four copy-pasted scans; regex=False makes the match a
# plain substring test.
residue_counts = {
    residue: df['Target'].str.contains(residue, regex=False).sum()
    for residue in 'XBZJ'
}

# Keep the original per-letter names so any cell that refers to them still works.
num_strings_with_X = residue_counts['X']
num_strings_with_B = residue_counts['B']
num_strings_with_Z = residue_counts['Z']
num_strings_with_J = residue_counts['J']

# Display the result
for residue, count in residue_counts.items():
    print(f"Number of target sequences containing '{residue}': {count}")

Number of target sequences containing 'X': 8
Number of target sequences containing 'B': 8
Number of target sequences containing 'Z': 0
Number of target sequences containing 'J': 0


In [42]:
# Per-row flag: True where the target sequence contains the letter 'X'.
df['Target'].map(lambda target: 'X' in target)

0        False
1        False
2        False
3        False
4        False
         ...  
80011    False
80012    False
80013    False
80014    False
80015    False
Name: Target, Length: 360074, dtype: bool

In [10]:
# Show the rows whose target sequence contains the letter 'X'.
has_x = df['Target'].str.contains('X', regex=False)
df.loc[has_x]

Unnamed: 0,miRNA,Target,miRNA_kmers,Target_kmers,kmerization
256936,UGUCAGUUUGUCAAAUACCCCA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, G, U, C, A, G, U, U, U, G, U, C, A, A, A, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, G, U, C, A, G, U, U, U, G, U, C, A, A, A, ..."
257750,CCACUGCCCCAGGUGCUGCU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[C, C, A, C, U, G, C, C, C, C, A, G, G, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[C, C, A, C, U, G, C, C, C, C, A, G, G, U, G, ..."
258219,UAGCAGCACAUAAUGGUUUGUG,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, A, G, C, A, G, C, A, C, A, U, A, A, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, A, G, C, A, G, C, A, C, A, U, A, A, U, G, ..."
258714,UGUGCAAAUCCAUGCAAAACUGA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, G, U, G, C, A, A, A, U, C, C, A, U, G, C, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, G, U, G, C, A, A, A, U, C, C, A, U, G, C, ..."
259069,AUGUAUGUGUGCAUGUGCAUGU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[A, U, G, U, A, U, G, U, G, U, G, C, A, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[A, U, G, U, A, U, G, U, G, U, G, C, A, U, G, ..."
260422,UCUUUGGUUAUCUAGCUGUAUGA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, C, U, U, U, G, G, U, U, A, U, C, U, A, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, C, U, U, U, G, G, U, U, A, U, C, U, A, G, ..."
260776,UAAGGCACGCGGUGAAUGCC,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, A, A, G, G, C, A, C, G, C, G, G, U, G, A, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, A, A, G, G, C, A, C, G, C, G, G, U, G, A, ..."
44836,UUCAAGUAAUCCAGGAUAGGCU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, U, C, A, A, G, U, A, A, U, C, C, A, G, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, U, C, A, A, G, U, A, A, U, C, C, A, G, G, ..."


In [11]:
# Show the rows whose target sequence contains the letter 'B'.
has_b = df['Target'].str.contains('B', regex=False)
df.loc[has_b]

Unnamed: 0,miRNA,Target,miRNA_kmers,Target_kmers,kmerization
256936,UGUCAGUUUGUCAAAUACCCCA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, G, U, C, A, G, U, U, U, G, U, C, A, A, A, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, G, U, C, A, G, U, U, U, G, U, C, A, A, A, ..."
257750,CCACUGCCCCAGGUGCUGCU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[C, C, A, C, U, G, C, C, C, C, A, G, G, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[C, C, A, C, U, G, C, C, C, C, A, G, G, U, G, ..."
258219,UAGCAGCACAUAAUGGUUUGUG,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, A, G, C, A, G, C, A, C, A, U, A, A, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, A, G, C, A, G, C, A, C, A, U, A, A, U, G, ..."
258714,UGUGCAAAUCCAUGCAAAACUGA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, G, U, G, C, A, A, A, U, C, C, A, U, G, C, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, G, U, G, C, A, A, A, U, C, C, A, U, G, C, ..."
259069,AUGUAUGUGUGCAUGUGCAUGU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[A, U, G, U, A, U, G, U, G, U, G, C, A, U, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[A, U, G, U, A, U, G, U, G, U, G, C, A, U, G, ..."
260422,UCUUUGGUUAUCUAGCUGUAUGA,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, C, U, U, U, G, G, U, U, A, U, C, U, A, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, C, U, U, U, G, G, U, U, A, U, C, U, A, G, ..."
260776,UAAGGCACGCGGUGAAUGCC,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, A, A, G, G, C, A, C, G, C, G, G, U, G, A, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, A, A, G, G, C, A, C, G, C, G, G, U, G, A, ..."
44836,UUCAAGUAAUCCAGGAUAGGCU,MEDEERQRKLAAGKAKLARFRQRKAQYDGDIPKKQKKKRTSSSKHD...,"[U, U, C, A, A, G, U, A, A, U, C, C, A, G, G, ...","[MED, EER, QRK, LAA, GKA, KLA, RFR, QRK, AQY, ...","[U, U, C, A, A, G, U, A, A, U, C, C, A, G, G, ..."


In [12]:
# Drop rows whose target sequence contains X, B, Z or J. Filtering on 'X'
# alone removes the 'B' rows only when both letters occur in the same
# sequences, so the character class covers all four letters explicitly.
# .copy() makes df an independent frame rather than a slice of the old one.
df = df[~df['Target'].str.contains('[XBZJ]')].copy()

In [13]:
# Sanity check: list any remaining rows whose target still contains 'B'.
still_has_b = df['Target'].str.contains('B', regex=False)
df.loc[still_has_b]

Unnamed: 0,miRNA,Target,miRNA_kmers,Target_kmers,kmerization


In [14]:
# Pull out the combined token column on its own (a Series) for export.
kmer_df = df.loc[:, 'kmerization']
kmer_df

0        [U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...
1        [U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...
2        [U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...
3        [U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...
4        [U, G, U, G, U, U, C, U, C, A, G, G, U, C, A, ...
                               ...                        
80011    [G, A, A, G, U, G, C, U, U, C, G, A, U, U, U, ...
80012    [U, G, A, G, G, U, A, G, U, A, G, G, U, U, G, ...
80013    [C, A, A, A, G, A, A, U, U, C, U, C, C, U, U, ...
80014    [A, C, U, G, C, U, G, A, G, C, U, A, G, C, A, ...
80015    [U, G, A, U, A, U, G, U, U, U, G, A, U, A, U, ...
Name: kmerization, Length: 360066, dtype: object

In [15]:
# Write the combined token column to disk, row index included.
kmer_df.to_csv('data/miRNA-Target-kmerization.csv', index=True)

In [16]:
# Assemble the instruction-set table in one expression: the target sequence is
# the input, the miRNA sequence is the output, and every row carries the same
# fixed context and instruction text.
instruction_set_df = (
    df[['Target', 'miRNA']]
    .rename(columns={'Target': 'Input', 'miRNA': 'Output'})
    .assign(
        Context="This task involves mapping target sequences to their corresponding miRNA sequences.",
        Instruction="Predict the miRNA sequence for this target.",
    )
    .loc[:, ['Input', 'Context', 'Instruction', 'Output']]
)
instruction_set_df

Unnamed: 0,Input,Context,Instruction,Output
0,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
1,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
2,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUG
3,MAATNTILAFSSPSRLLIPPSSNPSTLRSSFRGVSLNNNNLHRLQS...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUU
4,MAKGVAVLNSSEGVTGTIFFTQEGDGVTTVSGTVSGLKPGLHGFHV...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGUGUUCUCAGGUCACCCCUU
...,...,...,...,...
80011,MAELQMLLEEEIPGGRRALFDSYTNLERVADYCENNYIQSADKQRA...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,GAAGUGCUUCGAUUUUGGGGUGU
80012,MAGVFPYRGPGNPVPGPLAPLPDYMSEEKLQEKARKWQQLQAKRYA...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,UGAGGUAGUAGGUUGUGUGGUU
80013,MSKDLVTFGDVAVNFSQEEWEWLNPAQRNLYRKVMLENYRSLVSLG...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,CAAAGAAUUCUCCUUUUGGGCU
80014,MDPFRPSFRGQSPIHPSQCQAVRMPGCWPQASKPLDPALGRGAPAG...,This task involves mapping target sequences to...,Predict the miRNA sequence for this target.,ACUGCUGAGCUAGCACUUCCCG


In [17]:
# Write the instruction-set table to disk, row index included.
instruction_set_df.to_csv('data/llama-pretrain-instruction-set.csv', index=True)