In [1]:
import pandas as pd
import numpy as np
import re, os
import pyfastx
from Bio import SeqIO
from collections import Counter

01. Filter the test set

Use blastn to filter out test sequences that are similar to sequences in the training set.

In [73]:
# Step 1: run 00_other_do_blastn.py first to obtain the blastn result.
filter_path = '' # blastn result: IDs of the RNA sequences that matched the training set (TODO: fill in)
path = '' # CSV holding the information for all sequences (TODO: fill in)
# The blastn result is read as a tab-separated table without a header row.
test_filter = pd.read_csv(filter_path, sep='\t', header=None)
test = pd.read_csv(path)

In [None]:
# Inspect the blastn result table read from filter_path.
test_filter

In [75]:
# Turn each unique ID in the first blastn column into a 0-based row index:
# drop the first 14 characters, parse the remainder as an integer, subtract 1.
# NOTE(review): assumes every ID is a fixed 14-character prefix followed by a
# 1-based row number of `test` — confirm against the blastn query naming.
filter_idx = [int(x[14:])-1 for x in test_filter.iloc[:, 0].unique().tolist()]

In [76]:
# Keep only the rows of `test` whose index is NOT listed in filter_idx.
# NOTE: this rebinds `test_filter` (until now the blastn table) to the filtered
# test set, so the cell that builds filter_idx must not be re-run after this one.
test_filter = test[~test.index.isin(filter_idx)]

In [None]:
# Inspect the filtered test set (rows of `test` not listed in filter_idx).
test_filter

In [78]:
# Save the filtered test set; it can be used to predict results or to validate
# the performance of the model.
# TODO: the output path is left empty here and must be filled in before running.
test_filter.to_csv('', index=False)

02. GC% statistics

- calculate GC% of genomes

In [None]:
def calculate_gc_content(seq):
    """Return the GC percentage (0-100) of a nucleotide sequence.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) 'g'/'c'
    bases are included. Every character of ``seq`` contributes to the
    denominator, including ambiguous symbols such as 'N'.

    Args:
        seq: Nucleotide sequence; a ``str`` or any object that supports
            ``len()``, ``.upper()`` and ``.count()``.

    Returns:
        The percentage of G and C characters, or 0 for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid dividing by zero on empty input.
        return 0
    # Normalise case first; otherwise lowercase bases would be silently missed.
    upper_seq = seq.upper()
    gc_count = upper_seq.count('G') + upper_seq.count('C')
    return gc_count / total_count * 100

# Directory of genome FASTA files to scan. Alternative input directory:
# folder_path = "./fastas/train_part/coding_bacteria"
folder_path = "./fastas/application_part/genomics"

# NOTE: the name `dict` shadows the builtin, but later cells read the results
# through this name, so it is kept unchanged here.
dict = {'Name': [], 'GC Content': [], 'Category': []}

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the results reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    # Check the file extension: only FASTA files are processed.
    if not filename.endswith((".fasta", ".fa", ".fna")):
        continue

    # Display name = first two underscore-separated tokens of the file name
    # (presumably genus and species — confirm against the file naming scheme).
    bac_name = ' '.join(filename.split('_')[:2])
    file_path = os.path.join(folder_path, filename)

    # Concatenate every record of the file so that %GC is computed over the
    # whole file rather than per record.
    record_r = ''.join(str(record.seq) for record in SeqIO.parse(file_path, "fasta"))
    gc_content = calculate_gc_content(record_r)

    # Group by %GC: >60 is high, 40-60 (inclusive) is middle, otherwise low.
    if gc_content > 60:
        category = 'high'
    elif 40 <= gc_content <= 60:
        category = 'middle'
    else:
        category = 'low'

    dict['Name'].append(bac_name)
    dict['GC Content'].append(gc_content)
    dict['Category'].append(category)
    print(f"Filename: {bac_name}, GC Content: {gc_content:.2f}%, Category: {category}")


In [None]:
# Show the collected per-file GC results (Name, GC Content, Category) as a table.
pd.DataFrame(dict)

In [None]:
import matplotlib.pyplot as plt

# Fixed category -> colour mapping. Counter keeps first-seen order, so pairing
# it with a positional colour list would give the same category different
# colours (and wedge positions) depending on the order of the input data.
category_colors = {'high': '#ff9999', 'middle': '#66b3ff', 'low': '#99ff99'}

category_counts = Counter(dict['Category'])

# Only categories that actually occur are drawn, always in the same order.
# Concrete lists are passed to matplotlib instead of dict views.
labels = [category for category in category_colors if category in category_counts]
sizes = [category_counts[category] for category in labels]
colors = [category_colors[category] for category in labels]

plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('GC% Category Distribution')
plt.show()

- calculate GC% of train & test sequences

In [4]:
import os
from Bio import SeqIO

def calculate_gc_content(seq):
    """Return the GC percentage (0-100) of a nucleotide sequence.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) 'g'/'c'
    bases are included. Every character of ``seq`` contributes to the
    denominator, including ambiguous symbols such as 'N'.

    Args:
        seq: Nucleotide sequence; a ``str`` or any object that supports
            ``len()``, ``.upper()`` and ``.count()``.

    Returns:
        The percentage of G and C characters, or 0 for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid dividing by zero on empty input.
        return 0
    # Normalise case first; otherwise lowercase bases would be silently missed.
    upper_seq = seq.upper()
    gc_count = upper_seq.count('G') + upper_seq.count('C')
    return gc_count / total_count * 100


def _categorize_gc(gc_content):
    """Map a GC percentage to 'high' (>60), 'middle' (40-60 inclusive) or 'low'."""
    if gc_content > 60:
        return 'high'
    if 40 <= gc_content <= 60:
        return 'middle'
    return 'low'


def get_precent_gc(fasta_file_path, verbose=True):
    """Compute the GC percentage and GC category of every record in a FASTA file.

    Args:
        fasta_file_path: Path of the FASTA file to read.
        verbose: When True (the default, matching the previous behaviour) one
            summary line is printed per record. Pass False for large files to
            avoid flooding the notebook output.

    Returns:
        A dict of three parallel lists: 'Name' (record id), 'GC Content'
        (percentage) and 'Category' ('high', 'middle' or 'low').
    """
    gc_data = {'Name': [], 'GC Content': [], 'Category': []}

    for record in SeqIO.parse(fasta_file_path, "fasta"):
        sequence_name = record.id
        gc_content = calculate_gc_content(str(record.seq))
        category = _categorize_gc(gc_content)

        gc_data['Name'].append(sequence_name)
        gc_data['GC Content'].append(gc_content)
        gc_data['Category'].append(category)

        if verbose:
            print(f"Sequence: {sequence_name}, GC Content: {gc_content:.2f}%, Category: {category}")

    return gc_data

In [None]:
# FASTA files of the sequences whose %GC should be calculated.
# TODO: both paths are left empty here and must be filled in before running.
fasta_file_path_train = ""
fasta_file_path_test = ""
# Each call returns a dict of parallel lists: 'Name', 'GC Content', 'Category'.
train_gc_data = get_precent_gc(fasta_file_path_train)
test_gc_data = get_precent_gc(fasta_file_path_test)

In [None]:
import matplotlib.pyplot as plt

# Fixed category -> colour mapping. Counter keeps first-seen order, so pairing
# it with a positional colour list would give the same category different
# colours (and wedge positions) depending on the order of the input data.
category_colors = {'high': '#ff9999', 'middle': '#66b3ff', 'low': '#99ff99'}

category_counts = Counter(train_gc_data['Category'])

# Only categories that actually occur are drawn, always in the same order.
# Concrete lists are passed to matplotlib instead of dict views.
labels = [category for category in category_colors if category in category_counts]
sizes = [category_counts[category] for category in labels]
colors = [category_colors[category] for category in labels]

plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

# Save before show(): the figure is written to disk while it is still current.
plt.savefig('category_distribution_train.png', dpi=300)
plt.show()

In [None]:
# Fixed category -> colour mapping. Counter keeps first-seen order, so pairing
# it with a positional colour list would give the same category different
# colours (and wedge positions) depending on the order of the input data.
category_colors = {'high': '#ff9999', 'middle': '#66b3ff', 'low': '#99ff99'}

category_counts = Counter(test_gc_data['Category'])

# Only categories that actually occur are drawn, always in the same order.
# Concrete lists are passed to matplotlib instead of dict views.
labels = [category for category in category_colors if category in category_counts]
sizes = [category_counts[category] for category in labels]
colors = [category_colors[category] for category in labels]

plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# Save before show(): the figure is written to disk while it is still current.
plt.savefig('category_distribution_test.png', dpi=300)
plt.show()

03. Get model inputs

In [26]:
import pandas as pd
import numpy as np
import re
import pyfastx
from collections import Counter

In [None]:
def is_valid_rna_sequence(sequence):
    """Tell whether every symbol of ``sequence`` is one of 'A', 'T', 'C', 'G'.

    The check is case-sensitive and uses the T-based alphabet, so lowercase
    letters, 'U', 'N' and any other symbol make the sequence invalid. An empty
    sequence is considered valid.

    Args:
        sequence: Iterable of single-character symbols (typically a ``str``).

    Returns:
        True when no symbol falls outside the allowed alphabet, else False.
    """
    allowed = ('A', 'T', 'C', 'G')
    return all(symbol in allowed for symbol in sequence)

def generate_3mers(sequence, k=3):
    """Split ``sequence`` into its overlapping k-mers, in order.

    A window of width ``k`` slides over the sequence one position at a time.

    Args:
        sequence: Sliceable sequence (typically a ``str``).
        k: Window width; defaults to 3.

    Returns:
        A list of the ``len(sequence) - k + 1`` windows, or an empty list when
        the sequence is shorter than ``k``.
    """
    windows = []
    last_start = len(sequence) - k
    for start in range(last_start + 1):
        windows.append(sequence[start:start + k])
    return windows


# Build one model-input CSV per genome: each kept sequence becomes a
# space-separated string of overlapping 3-mers.
for name in ['Mycoplasma_flocculare', 'Mycoplasma_hyopneumoniae', 'Mycoplasma_hyorhinis']:
    print(name, ':')
    file_path = f'./fastas/application_part/{name}.fasta'

    # Keep only sequences of 50-500 characters made purely of A/T/C/G.
    kmer_sentences = [
        ' '.join(generate_3mers(seq, k=3))
        for _, seq in pyfastx.Fastx(file_path)
        if 50 <= len(seq) <= 500 and is_valid_rna_sequence(seq)
    ]

    # A local name is used instead of rebinding `dict`: rebinding it shadowed
    # the builtin and overwrote the GC results dict built by an earlier cell.
    # Every row receives the same constant tag [0, 1].
    model_inputs = {
        "sequence": kmer_sentences,
        "tag": [[0, 1] for _ in kmer_sentences],
    }
    dict_df = pd.DataFrame(model_inputs)
    dict_df.to_csv(f'{name}.csv', index=False)

In [86]:
# Load a CSV into `sim`. TODO: the path is left empty here and must be filled in.
# NOTE(review): presumably a table with a 'tag' column, which the next cell counts.
sim = pd.read_csv("")

In [None]:
# Count how many rows carry each distinct value of the 'tag' column.
sim['tag'].value_counts()

In [88]:
# Load another CSV, rebinding `sim`. TODO: the path is left empty here and must be filled in.
sim = pd.read_csv("")

In [None]:
# Count how many rows carry each distinct value of the 'tag' column.
sim['tag'].value_counts()