In [20]:
from propy import PyPro
import pandas as pd
import numpy as np

In [21]:
# Directory prefix for the raw FASTA-format input files; the export cells below
# join it with each file name via f'{data_dir}/<file name>'.
data_dir = 'raw_fasta'

def extractProteinSequenceFromFasta(file):
    """Read a FASTA-format text file and return its headers and sequences.

    Lines are stripped of surrounding whitespace and blank lines are ignored.

    When the file contains at least one line starting with ``>``, records are
    grouped by header: every non-header line up to the next header is joined
    into a single sequence, so sequences wrapped over several lines are
    reassembled. Lines that appear before the first header, and headers that
    are not followed by any sequence line, are dropped so that the two returned
    lists always stay aligned index-for-index.

    When no line starts with ``>``, the file is treated as strictly
    alternating id / sequence lines (even positions are ids, odd positions are
    sequences).

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(protein_ids, protein_sequences)``. Header strings are returned
        unchanged, i.e. including their leading ``>`` when present.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(file, 'r') as f:
        lines = [line.strip() for line in f]
    # drop blank lines so they neither end nor start a record
    lines = [line for line in lines if line]

    if not any(line.startswith('>') for line in lines):
        # no FASTA header markers at all: fall back to plain alternation
        return lines[::2], lines[1::2]

    # group every sequence line under the most recent header
    records = []
    for line in lines:
        if line.startswith('>'):
            records.append((line, []))
        elif records:
            records[-1][1].append(line)
        # else: sequence text before the first header has no id; skip it

    protein_ids = []
    protein_sequences = []
    for header, chunks in records:
        if not chunks:
            # header without a sequence: keeping it would misalign the lists
            continue
        protein_ids.append(header)
        protein_sequences.append(''.join(chunks))
    return protein_ids, protein_sequences

def extractFeatureDF(protein_ids, protein_sequences, feature_type, negative):
    """Compute one propy descriptor family per sequence and label the rows.

    For every sequence a descriptor object is created with
    ``PyPro.GetProDes`` and the method selected by ``feature_type`` is called
    on it. Each result becomes one row of the returned DataFrame.
    Assumes each descriptor method returns a flat ``name -> value`` mapping
    (confirm against the propy documentation).

    Parameters
    ----------
    protein_ids : list of str
        Header strings aligned index-for-index with ``protein_sequences``.
        A single leading ``>`` is removed before the value is stored in the
        ``id`` column.
    protein_sequences : list of str
        Sequences to describe.
    feature_type : str
        One of ``'AAC'``, ``'APAAC'``, ``'CTD'``, ``'PAAC'``, ``'DPC'``.
    negative : bool
        When truthy every row gets ``label = 0``, otherwise ``label = 1``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed sequence, with the descriptor
        columns followed by ``id`` and ``label``. Sequences for which the
        descriptor computation raises ``ZeroDivisionError`` are skipped (a
        message is printed), so the frame may have fewer rows than inputs.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported names.
    """
    # descriptor name -> method to call on the object returned by GetProDes
    descriptor_methods = {
        'AAC': 'GetAAComp',
        'APAAC': 'GetAPAAC',
        'CTD': 'GetCTD',
        'PAAC': 'GetPAAC',
        'DPC': 'GetDPComp',
    }
    if feature_type not in descriptor_methods:
        # fail early with a clear message instead of an unbound-variable error
        raise ValueError(
            f"Unsupported feature_type {feature_type!r}; "
            f"expected one of {sorted(descriptor_methods)}"
        )
    method_name = descriptor_methods[feature_type]

    # collect plain dict rows and build the frame once (avoids the quadratic
    # cost of concatenating inside the loop)
    records = []
    for i, sequence in enumerate(protein_sequences):
        try:
            protein = PyPro.GetProDes(sequence)
            extractedFeatures = getattr(protein, method_name)()
        except ZeroDivisionError:
            print(f"Skipping sequence {i} due to ZeroDivisionError")
            continue
        record = dict(extractedFeatures)
        header = protein_ids[i]
        # strip only the FASTA '>' marker, never a real first character
        record['id'] = header[1:] if header.startswith('>') else header
        records.append(record)
        print(feature_type, f"Extracted features for sequence {i}", negative)

    df = pd.DataFrame(records)
    df['label'] = 0 if negative else 1
    return df

def combineNegativeAndPositiveDFs(negativeFile, positiveFile, feature_type, random_state=None):
    """Build one shuffled, labelled feature table from two FASTA files.

    Both files are parsed with ``extractProteinSequenceFromFasta`` and turned
    into feature frames with ``extractFeatureDF``: rows from ``negativeFile``
    are extracted with ``negative=True`` and rows from ``positiveFile`` with
    ``negative=False``. The two frames are concatenated, the rows are
    shuffled, and the index is reset to 0..n-1.

    Parameters
    ----------
    negativeFile : str or path-like
        FASTA file holding the negative examples.
    positiveFile : str or path-like
        FASTA file holding the positive examples.
    feature_type : str
        Descriptor name forwarded unchanged to ``extractFeatureDF``.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous behaviour (a different row order on every call); pass a fixed
        seed to make the shuffle reproducible.

    Returns
    -------
    pandas.DataFrame
        The combined, shuffled frame with a fresh integer index.
    """
    # parse both inputs first so a missing file fails before any extraction
    negative_ids, negative_sequences = extractProteinSequenceFromFasta(negativeFile)
    positive_ids, positive_sequences = extractProteinSequenceFromFasta(positiveFile)

    negativeDF = extractFeatureDF(negative_ids, negative_sequences, feature_type, negative=True)
    positiveDF = extractFeatureDF(positive_ids, positive_sequences, feature_type, negative=False)

    combinedDF = pd.concat([negativeDF, positiveDF], ignore_index=True)
    # frac=1 samples every row exactly once, i.e. a full shuffle
    shuffledDF = combinedDF.sample(frac=1, random_state=random_state)
    return shuffledDF.reset_index(drop=True)

In [22]:
# Training split: write one CSV per descriptor family, all built from the same
# pair of negative/positive SPIDER FASTA files. The descriptor names are listed
# in the order the files were originally generated.
for feature_type in ('AAC', 'APAAC', 'CTD', 'PAAC', 'DPC'):
    combineNegativeAndPositiveDFs(
        f'{data_dir}/TR_neg_SPIDER.txt',
        f'{data_dir}/TR_pos_SPIDER.txt',
        feature_type,
    ).to_csv(f'processed_dataset/TR_{feature_type}.csv', index=False)

AAC Extracted features for sequence 0 True
AAC Extracted features for sequence 1 True
AAC Extracted features for sequence 2 True
AAC Extracted features for sequence 3 True
AAC Extracted features for sequence 4 True
AAC Extracted features for sequence 5 True
AAC Extracted features for sequence 6 True
AAC Extracted features for sequence 7 True
AAC Extracted features for sequence 8 True
AAC Extracted features for sequence 9 True
AAC Extracted features for sequence 10 True
AAC Extracted features for sequence 11 True
AAC Extracted features for sequence 12 True
AAC Extracted features for sequence 13 True
AAC Extracted features for sequence 14 True
AAC Extracted features for sequence 15 True
AAC Extracted features for sequence 16 True
AAC Extracted features for sequence 17 True
AAC Extracted features for sequence 18 True
AAC Extracted features for sequence 19 True
AAC Extracted features for sequence 20 True
AAC Extracted features for sequence 21 True
AAC Extracted features for sequence 22 Tru

In [23]:
# Test split: write one CSV per descriptor family, all built from the same
# pair of negative/positive SPIDER FASTA files. The descriptor names are listed
# in the order the files were originally generated.
for feature_type in ('AAC', 'APAAC', 'CTD', 'PAAC', 'DPC'):
    combineNegativeAndPositiveDFs(
        f'{data_dir}/TS_neg_SPIDER.txt',
        f'{data_dir}/TS_pos_SPIDER.txt',
        feature_type,
    ).to_csv(f'processed_dataset/TS_{feature_type}.csv', index=False)

AAC Extracted features for sequence 0 True
AAC Extracted features for sequence 1 True
AAC Extracted features for sequence 2 True
AAC Extracted features for sequence 3 True
AAC Extracted features for sequence 4 True
AAC Extracted features for sequence 5 True
AAC Extracted features for sequence 6 True
AAC Extracted features for sequence 7 True
AAC Extracted features for sequence 8 True
AAC Extracted features for sequence 9 True
AAC Extracted features for sequence 10 True
AAC Extracted features for sequence 11 True
AAC Extracted features for sequence 12 True
AAC Extracted features for sequence 13 True
AAC Extracted features for sequence 14 True
AAC Extracted features for sequence 15 True
AAC Extracted features for sequence 16 True
AAC Extracted features for sequence 17 True
AAC Extracted features for sequence 18 True
AAC Extracted features for sequence 19 True
AAC Extracted features for sequence 20 True
AAC Extracted features for sequence 21 True
AAC Extracted features for sequence 22 Tru