In [1]:
def select_first_100_sequences(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        sequence_count = 0
        current_sequence = []
        
        for line in infile:
            if line.startswith('>'):
                if sequence_count > 0:
                    outfile.write(''.join(current_sequence))
                
                if sequence_count < 100:
                    outfile.write(line)
                    current_sequence = []
                    sequence_count += 1
                else:
                    break
            elif sequence_count <= 100:
                current_sequence.append(line)
        
        if sequence_count <= 100:
            outfile.write(''.join(current_sequence))

# Usage
input_fasta = 'processed.fasta'  # Replace with your input FASTA file name
output_fasta = 'output_first_100_ns3_processed.fasta'  # Replace with your desired output file name

select_first_100_sequences(input_fasta, output_fasta)
print(f"First 100 sequences saved to {output_fasta}")

First 100 sequences saved to output_first_100_ns3_processed.fasta


In [99]:
from Bio import SeqIO
import csv

def parse_accession_id(header):
    """Extract the accession ID from the FASTA header."""
    parts = header.split('|')
    if len(parts) > 1:
        return parts[1]
    return header

def compare_sequences(fasta_file, accession_id1, accession_id2):
    # Read the sequences from the fasta file
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        accession_id = parse_accession_id(record.id)
        sequences[accession_id] = record.seq
    
    # Get the sequences for the provided accession ids
    seq1 = sequences.get(accession_id1)
    seq2 = sequences.get(accession_id2)
    
    if not seq1 or not seq2:
        print(f"One or both accession ids not found in the fasta file.")
        return
    
    # Ensure the sequences are of the same length
    if len(seq1) != len(seq2):
        print("Sequences are not of the same length. Cannot compare position-wise.")
        return
    
    # Compare sequences position-wise and prepare the data for CSV
    data = []
    for i, (res1, res2) in enumerate(zip(seq1, seq2), start=1):
        target = 0 if res1 == res2 else 1
        data.append([i, target])
    
    # Write the data to a CSV file
    csv_file = f"{accession_id1}.csv"
    with open(csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Residue Position", "Target"])
        writer.writerows(data)
    
    print(f"Comparison complete. Results saved to {csv_file}")

# Example usage
fasta_file = "output_first_90_ns3_processed.fasta"
accession_id1 = input("Enter the first accession ID: ")
accession_id2 = input("Enter the second accession ID: ")
compare_sequences(fasta_file, accession_id1, accession_id2)

Enter the first accession ID:  11069.218.mat_peptide.9
Enter the second accession ID:  11069.525.mat_peptide.9


Comparison complete. Results saved to 11069.218.mat_peptide.9.csv


In [116]:
import os
import pandas as pd

def concatenate_csv_files(folder1, folder2, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Get the list of CSV files in the first folder
    files1 = {f for f in os.listdir(folder1) if f.endswith('.csv')}
    
    # Process each file in the first folder
    for file1 in files1:
        file2_path = os.path.join(folder2, file1)
        if os.path.exists(file2_path):
            file1_path = os.path.join(folder1, file1)
            
            # Read the CSV files
            df1 = pd.read_csv(file1_path)
            df2 = pd.read_csv(file2_path)
            
            # Ensure that both files have the column 'A'
            if 'Position' not in df1.columns or 'Position' not in df2.columns:
                print(f"Column 'A' missing in one of the files: {file1_path} or {file2_path}")
                continue
            
            # Merge dataframes on column 'A'
            df_merged = pd.merge(df1, df2[['Position', 'F3']], on='Position', how='left')
            
            # Save the result to the output folder
            output_file_path = os.path.join(output_folder, file1)
            df_merged.to_csv(output_file_path, index=False)
            print(f"File saved: {output_file_path}")

# Example usage
folder1 = 'feature1'
folder2 = 'feature2'
output_folder = 'combine_f1f2'
concatenate_csv_files(folder1, folder2, output_folder)

File saved: combine_f1f2/fig|11069.646.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.5787.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.3969.mat_peptide.10|.csv
File saved: combine_f1f2/fig|11069.2390.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.2987.mat_peptide.7|.csv
File saved: combine_f1f2/fig|11069.2701.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.1264.mat_peptide.10|.csv
File saved: combine_f1f2/fig|11069.1616.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.2706.mat_peptide.7|.csv
File saved: combine_f1f2/fig|11069.2936.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.5814.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.2623.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.3968.mat_peptide.10|.csv
File saved: combine_f1f2/fig|11069.1533.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.1776.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.387.mat_peptide.9|.csv
File saved: combine_f1f2/fig|11069.1429.mat_peptide.9|.

In [117]:
import os
import pandas as pd

# Folder containing CSV files
folder_path = 'phylo'

# Process each CSV file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Check if the DataFrame is not empty
        if not df.empty:
            # Drop the first column
            df = df.iloc[:, 1:]
            
            # Insert the new "Position" column
            df.insert(0, 'Position', range(1, len(df) + 1))
        
            # Save the modified CSV file
            df.to_csv(file_path, index=False)

print("Processing complete.")

Processing complete.


In [121]:
import os
import pandas as pd

# Define the folder paths
folder1_path = 'combine_f1f2'
folder2_path = 'phylo'
output_folder_path = 'combine_f1f2target'

# Ensure the output folder exists
os.makedirs(output_folder_path, exist_ok=True)

# Function to extract the common part of the filename
def get_common_part(filename, folder):
    if folder == 1:
        return filename.split('|')[1]
    elif folder == 2:
        return os.path.splitext(filename)[0]

# Process files in the first folder
for file1 in os.listdir(folder1_path):
    if file1.endswith('.csv'):
        common_part1 = get_common_part(file1, 1)
        print(f"Processing file from folder 1: {file1}, common part: {common_part1}")
        
        # Find the corresponding file in the second folder
        for file2 in os.listdir(folder2_path):
            if file2.endswith('.csv'):
                common_part2 = get_common_part(file2, 2)
                print(f"Checking file from folder 2: {file2}, common part: {common_part2}")
                
                if common_part1 == common_part2:
                    print(f"Match found: {file1} and {file2}")
                    
                    # Read the CSV files
                    df1 = pd.read_csv(os.path.join(folder1_path, file1))
                    df2 = pd.read_csv(os.path.join(folder2_path, file2))
                    
                    # Merge the dataframes on the 'Position' column
                    merged_df = pd.merge(df1, df2, on='Position')
                    
                    # Save the merged dataframe to the output folder
                    output_filename = f"{common_part1}.csv"
                    merged_df.to_csv(os.path.join(output_folder_path, output_filename), index=False)
                    print(f"File saved: {output_filename}")
                    break

print("Processing complete.")

Processing file from folder 1: fig|11069.3967.mat_peptide.10|.csv, common part: 11069.3967.mat_peptide.10
Checking file from folder 2: 11069.3592.mat_peptide.9.csv, common part: 11069.3592.mat_peptide.9
Checking file from folder 2: 11069.779.mat_peptide.9.csv, common part: 11069.779.mat_peptide.9
Checking file from folder 2: 11069.440.mat_peptide.9.csv, common part: 11069.440.mat_peptide.9
Checking file from folder 2: 11069.3117.mat_peptide.9.csv, common part: 11069.3117.mat_peptide.9
Checking file from folder 2: 11069.219.mat_peptide.9.csv, common part: 11069.219.mat_peptide.9
Checking file from folder 2: 11069.3655.mat_peptide.9.csv, common part: 11069.3655.mat_peptide.9
Checking file from folder 2: 11069.359.mat_peptide.9.csv, common part: 11069.359.mat_peptide.9
Checking file from folder 2: 11069.5787.mat_peptide.9.csv, common part: 11069.5787.mat_peptide.9
Checking file from folder 2: 11069.5792.mat_peptide.9.csv, common part: 11069.5792.mat_peptide.9
Checking file from folder 2: 

In [123]:
import os
import pandas as pd

# Define the folder paths
entropy_file_path = 'entropy_data.csv'
input_folder_path = 'combine_f1f2target'
output_folder_path = 'training_files'

# Ensure the output folder exists
os.makedirs(output_folder_path, exist_ok=True)

# Read the Entropy file
entropy_df = pd.read_csv(entropy_file_path)
entropy_column = entropy_df['Entropy']

# Process each CSV file in the input folder
for filename in os.listdir(input_folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_folder_path, filename)
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Drop the first column
        df = df.drop(columns=['Position'])
        
        # Concatenate the Entropy column as F4 between F3 and Target
        df = pd.concat([df.iloc[:, :2], entropy_column, df.iloc[:, 2:]], axis=1)
        df.columns = ['F1', 'F3', 'F4', 'Target']
        
        # Save the modified CSV file to the output folder
        output_file_path = os.path.join(output_folder_path, filename)
        df.to_csv(output_file_path, index=False)

print("Processing complete.")

Processing complete.
