In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

import re, os, sys
from collections import Counter
import math
import numpy as np
import re
#from readToMatrix import *
def read_protein_sequences(file):
    """Parse a FASTA file into a list of ``[name, sequence]`` pairs.

    The name is the first whitespace-delimited token of the header line,
    truncated at the first ``|``.  The sequence is the concatenation of the
    remaining lines of the record, upper-cased, with every character other
    than the 20 standard amino-acid letters and ``-`` removed.

    Prints an error message and terminates via ``sys.exit(1)`` when the path
    does not exist or the file contains no ``>`` character.
    """
    if not os.path.exists(file):
        print('Error: file %s does not exist.' % file)
        sys.exit(1)

    with open(file) as handle:
        content = handle.read()

    if '>' not in content:
        print('Error: the input file %s seems not in FASTA format!' % file)
        sys.exit(1)

    parsed = []
    # Anything before the first '>' is not part of a record and is dropped.
    for entry in content.split('>')[1:]:
        lines = entry.split('\n')
        identifier = lines[0].split()[0].split('|')[0]
        residues = re.sub('[^ACDEFGHIKLMNPQRSTVWY-]', '', ''.join(lines[1:]).upper())
        parsed.append([identifier, residues])
    return parsed


def AAC(fastas, **kw):
    """Amino-acid composition: one row of 20 residue frequencies per record.

    Each element of *fastas* is indexed as ``[name, sequence]``.  Gap
    characters (``-``) are stripped before counting, and each count is divided
    by the length of the stripped sequence.  Columns follow the order
    ``ACDEFGHIKLMNPQRSTVWY``.  A sequence that is empty after stripping
    yields an all-zero row.  Returns a float ``numpy`` array.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []
    for entry in fastas:
        residues = re.sub('-', '', entry[1])
        tally = Counter(residues)
        total = len(residues)
        if total:
            rows.append([tally[aa] / total for aa in alphabet])
        else:
            # Nothing to count: every frequency is zero.
            rows.append([0] * len(alphabet))
    return np.array(rows, dtype=float)


def AAINDEX(fastas, props=None, **kw):
    """Average each selected AAindex property over every sequence.

    Parameters
    ----------
    fastas : iterable of ``[name, sequence]`` pairs.
    props : optional iterable of property names.  Names that are not present
        in the table are ignored; if none of them is present, all properties
        in the table are used.
    **kw : ``fileAAindex`` may be given to override the path of the property
        table (defaults to the Google Drive location used by this notebook).

    The table's first line is skipped.  Every other non-blank line is split on
    whitespace: the first token is the property name, and the following tokens
    are the values, looked up in the residue order ``ARNDCQEGHILKMFPSTWYV``.

    Returns
    -------
    A float ``numpy`` array with one row per sequence and one column per
    property.  Gap characters (``-``) contribute 0 to the sum but still count
    towards the sequence length used as divisor.  An empty sequence yields a
    row of zeros.

    Raises
    ------
    KeyError if a sequence contains a character other than the 20 residues
    above or ``-``.
    """
    AA = 'ARNDCQEGHILKMFPSTWYV'
    fileAAindex = kw.get(
        'fileAAindex',
        '/content/drive/MyDrive/Watashara_Projects/data/AAindex.txt',
    )
    with open(fileAAindex) as f:
        records = f.readlines()[1:]

    AAindex = []
    AAindexName = []
    for line in records:
        fields = line.split()
        if not fields:
            # Blank lines carry no property; skipping them avoids None rows
            # that would break the per-residue lookup below.
            continue
        AAindexName.append(fields[0])
        AAindex.append(fields[1:])

    index = {aa: pos for pos, aa in enumerate(AA)}

    # Restrict to the user-requested properties, when any of them exist.
    if props:
        tmpIndexNames = []
        tmpIndex = []
        for p in props:
            # list.index raises ValueError for a missing item (it never
            # returns -1), so membership has to be tested explicitly.
            if p in AAindexName:
                tmpIndexNames.append(p)
                tmpIndex.append(AAindex[AAindexName.index(p)])
        if tmpIndexNames:
            AAindexName = tmpIndexNames
            AAindex = tmpIndex

    encodings = []
    for entry in fastas:
        sequence = entry[1]
        code = []
        for values in AAindex:
            total = 0
            for aa in sequence:
                if aa != '-':
                    total = total + float(values[index[aa]])
            code.append(total / len(sequence) if sequence else 0.0)
        encodings.append(code)
    return np.array(encodings, dtype=float)

def APAAC(fastas, lambdaValue=1, w=0.05, **kw):
    """Amphiphilic pseudo-amino-acid composition encoding.

    Parameters
    ----------
    fastas : sequence of ``[name, sequence]`` pairs.
    lambdaValue : largest residue separation for the correlation terms.
    w : weight applied to the correlation terms.

    Returns
    -------
    A float ``numpy`` array with one row per *retained* sequence and
    ``20 + 2 * lambdaValue`` columns: 20 composition terms in the residue
    order ``ARNDCQEGHILKMFPSTWYV``, followed by the weighted correlation
    terms (two per separation).  Sequences whose gap-stripped length is not
    greater than ``lambdaValue`` are dropped, and a message reporting how many
    were dropped is printed.

    Raises
    ------
    KeyError if a retained sequence contains a character outside the 20
    residues listed above (after ``-`` is stripped).
    """
    records = []
    records.append("#   A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V")
    records.append("Hydrophobicity  0.62    -2.53   -0.78   -0.9    0.29    -0.85   -0.74   0.48    -0.4    1.38    1.06    -1.5    0.64    1.19    0.12    -0.18   -0.05   0.81    0.26    1.08")
    records.append("Hydrophilicity  -0.5    3   0.2 3   -1  0.2 3   0   -0.5    -1.8    -1.8    3   -1.3    -2.5    0   0.3 -0.4    -3.4    -2.3    -1.5")
    records.append("SideChainMass   15  101 58  59  47  72  73  1   82  57  57  73  75  91  42  31  45  130 107 43")

    AA = ''.join(records[0].rstrip().split()[1:])
    AADict = {aa: pos for pos, aa in enumerate(AA)}

    # Only the rows between the header and the last row are used, i.e.
    # Hydrophobicity and Hydrophilicity; the SideChainMass row is skipped.
    AAProperty = [[float(v) for v in line.split()[1:]] for line in records[1:-1]]

    # Standardise each property to zero mean and unit (population) deviation.
    AAProperty1 = []
    for values in AAProperty:
        meanI = sum(values) / 20
        fenmu = math.sqrt(sum([(v - meanI) ** 2 for v in values]) / 20)
        AAProperty1.append([(v - meanI) / fenmu for v in values])

    # Keep only sequences long enough for every separation 1..lambdaValue.
    # The test is on the gap-stripped sequence string, not on the
    # [name, sequence] pair (whose length is always 2).
    kept = []
    for entry in fastas:
        sequence = re.sub('-', '', entry[1])
        if len(sequence) > lambdaValue:
            kept.append(sequence)
    if len(kept) != len(fastas):
        print(f"Filtered out {len(fastas) - len(kept)} sequences that were too short.")

    encodings = []
    for sequence in kept:
        theta = []
        for n in range(1, lambdaValue + 1):
            pairs = len(sequence) - n  # > 0 is guaranteed by the filter above
            for prop in AAProperty1:
                theta.append(
                    sum(
                        prop[AADict[sequence[k]]] * prop[AADict[sequence[k + n]]]
                        for k in range(pairs)
                    ) / pairs
                )

        denominator = 1 + w * sum(theta)
        code = [sequence.count(aa) / denominator for aa in AA]
        code = code + [w * value / denominator for value in theta]
        encodings.append(code)

    return np.array(encodings, dtype=float)


def BLOSUM62(fastas, **kw):
    """Encode each sequence as the mean of its residues' BLOSUM62 rows.

    Each element of *fastas* is indexed as ``[name, sequence]``.  Rows and
    columns of the substitution table follow ``ARNDCQEGHILKMFPSTWYV``; the gap
    character ``-`` maps to an all-zero row but still counts towards the
    sequence length used as divisor.  Returns a float ``numpy`` array with one
    20-element row per sequence.  A character without a table entry raises
    ``KeyError``.
    """
    residue_order = 'ARNDCQEGHILKMFPSTWYV'
    score_rows = [
        [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],       # A
        [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],      # R
        [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],          # N
        [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],     # D
        [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],  # C
        [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],         # Q
        [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],        # E
        [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],    # G
        [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],      # H
        [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],     # I
        [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],     # L
        [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],      # K
        [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],      # M
        [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],      # F
        [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2], # P
        [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],         # S
        [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],     # T
        [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],  # W
        [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],    # Y
        [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],      # V
    ]
    # Convert each row once instead of on every residue visit.
    profile = {aa: np.asarray(row) for aa, row in zip(residue_order, score_rows)}
    profile['-'] = np.asarray([0] * 20)

    result = []
    for entry in fastas:
        residues = entry[1]
        running = np.asarray([0] * 20)
        for aa in residues:
            running = running + profile[aa]
        result.append(list(running / len(residues)))
    return np.array(result, dtype=float)



def DPC(fastas, gap, **kw):
    """Gapped dipeptide composition: 400 pair frequencies per sequence.

    Parameters
    ----------
    fastas : iterable of ``[name, sequence]`` pairs.
    gap : number of residues skipped between the two members of a pair
        (``0`` pairs adjacent residues).

    Returns
    -------
    A float ``numpy`` array with one row of 400 values per sequence.  The
    column for the pair ``(x, y)`` is ``20 * pos(x) + pos(y)`` with positions
    taken in ``ACDEFGHIKLMNPQRSTVWY``.  Counts are divided by the total number
    of pairs; a sequence too short to contain any pair yields a row of zeros.
    Gap characters (``-``) are stripped before counting.

    Raises
    ------
    KeyError if a stripped sequence contains a character outside the 20
    residues above.
    """
    AA = 'ACDEFGHIKLMNPQRSTVWY'
    AADict = {aa: pos for pos, aa in enumerate(AA)}

    encodings = []
    for entry in fastas:
        sequence = re.sub('-', '', entry[1])
        counts = [0] * 400
        for j in range(len(sequence) - 2 + 1 - gap):
            counts[AADict[sequence[j]] * 20 + AADict[sequence[j + gap + 1]]] += 1
        # Sum once; recomputing it for each of the 400 cells is wasted work.
        total = sum(counts)
        if total != 0:
            counts = [c / total for c in counts]
        encodings.append(counts)
    return np.array(encodings, dtype=float)


def PAAC(fastas, lambdaValue=1, w=0.05, **kw):
    """Pseudo-amino-acid composition encoding.

    Each element of *fastas* is indexed as ``[name, sequence]``; gap
    characters (``-``) are stripped first.  For every separation ``n`` in
    ``1..lambdaValue`` a correlation term is computed as the mean of
    ``Rvalue`` over all residue pairs ``n`` positions apart.  A row consists
    of the 20 residue counts (order ``ARNDCQEGHILKMFPSTWYV``) followed by the
    ``lambdaValue`` correlation terms scaled by ``w``, all divided by
    ``1 + w * sum(correlation terms)``.  Returns a float ``numpy`` array.

    NOTE(review): ``Rvalue`` is not defined in the visible part of this file;
    it has to be supplied elsewhere and is called as
    ``Rvalue(aa1, aa2, AADict, AAProperty1)``.
    NOTE(review): a stripped sequence whose length equals a separation ``n``
    makes the divisor ``len(sequence) - n`` zero.
    """
    table = [
        "#   A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V",
        "Hydrophobicity  0.62    -2.53   -0.78   -0.9    0.29    -0.85   -0.74   0.48    -0.4    1.38    1.06    -1.5    0.64    1.19    0.12    -0.18   -0.05   0.81    0.26    1.08",
        "Hydrophilicity  -0.5    3   0.2 3   -1  0.2 3   0   -0.5    -1.8    -1.8    3   -1.3    -2.5    0   0.3 -0.4    -3.4    -2.3    -1.5",
        "SideChainMass   15  101 58  59  47  72  73  1   82  57  57  73  75  91  42  31  45  130 107 43",
    ]
    AA = ''.join(table[0].rstrip().split()[1:])
    AADict = {residue: pos for pos, residue in enumerate(AA)}

    # All three property rows are used; the first token of each is its name.
    raw_properties = [[float(v) for v in line.rstrip().split()[1:]] for line in table[1:]]

    # Standardise each property to zero mean and unit (population) deviation.
    AAProperty1 = []
    for values in raw_properties:
        centre = sum(values) / 20
        spread = math.sqrt(sum([(v - centre) ** 2 for v in values]) / 20)
        AAProperty1.append([(v - centre) / spread for v in values])

    encodings = []
    for entry in fastas:
        sequence = re.sub('-', '', entry[1])
        theta = []
        for n in range(1, lambdaValue + 1):
            pair_sum = sum([Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1)
                            for j in range(len(sequence) - n)])
            theta.append(pair_sum / (len(sequence) - n))
        scale = 1 + w * sum(theta)
        composition = [sequence.count(aa) / scale for aa in AA]
        correlation = [(w * t) / scale for t in theta]
        encodings.append(composition + correlation)
    return np.array(encodings, dtype=float)



def TPC(fastas, **kw):
    """Tripeptide composition: 8000 triplet frequencies per sequence.

    Each element of *fastas* is indexed as ``[name, sequence]``; gap
    characters (``-``) are stripped before counting.  The column for the
    triplet ``(x, y, z)`` is ``400 * pos(x) + 20 * pos(y) + pos(z)`` with
    positions taken in ``ACDEFGHIKLMNPQRSTVWY``.  Counts are divided by the
    total number of triplets; a sequence shorter than three residues yields a
    row of zeros.  Returns a float ``numpy`` array.

    Raises
    ------
    KeyError if a stripped sequence contains a character outside the 20
    residues above.
    """
    AA = 'ACDEFGHIKLMNPQRSTVWY'
    AADict = {aa: pos for pos, aa in enumerate(AA)}

    encodings = []
    for entry in fastas:
        sequence = re.sub('-', '', entry[1])
        counts = [0] * 8000
        for j in range(len(sequence) - 3 + 1):
            slot = (AADict[sequence[j]] * 400
                    + AADict[sequence[j + 1]] * 20
                    + AADict[sequence[j + 2]])
            counts[slot] += 1
        # Sum once; recomputing it for each of the 8000 cells costs
        # 64 million additions per sequence.
        total = sum(counts)
        if total != 0:
            counts = [c / total for c in counts]
        encodings.append(counts)
    return np.array(encodings, dtype=float)

def reducedACID(seq):
    """Reduced-alphabet (acidic / basic / neutral) composition features.

    Each element of *seq* is indexed as ``[name, sequence]``.  Residues are
    mapped to three groups: ``a`` = ``DE``, ``k`` = ``KHR``,
    ``n`` = ``ACFGILMNPQSTVWY``.  Every sequence produces 32 features:

    * 3  - count of each group divided by the sequence length;
    * 20 - count of each residue (order ``ACDEFGHIKLMNPQRSTVWY``) divided by
           the count of its group (the divisor is at least 1);
    * 9  - overlapping count of each group pair (``aa, ak, an, ka, ...``)
           divided by the count of the pair's first group plus one.

    Returns a float ``numpy`` array of shape ``(len(seq), 32)``.  Empty input
    gives a ``(0, 32)`` array, and a zero-length sequence gives a row of
    zeros.
    """
    def _count_overlapping(text, pattern):
        """Count occurrences of *pattern* in *text*, allowing overlaps."""
        hits = 0
        start = text.find(pattern)
        while start != -1:
            hits += 1
            start = text.find(pattern, start + 1)
        return hits

    groups = "akn"
    group_pairs = [g1 + g2 for g1 in groups for g2 in groups]
    aalist = "ACDEFGHIKLMNPQRSTVWY"
    aasub = {"a": "DE", "k": "KHR", "n": "ACFGILMNPQSTVWY"}
    group_of = {aa: key for key, members in aasub.items() for aa in members}
    # Group keys are lower-case and residues upper-case, so a single
    # translation pass is equivalent to replacing residue by residue.
    to_reduced = str.maketrans(group_of)
    n_features = len(groups) + len(aalist) + len(group_pairs)

    rows = []
    for fasta in seq:
        full = fasta[1]
        reduced = full.translate(to_reduced)
        length = max(1, len(full))  # avoid dividing by zero on empty input

        group_freq = {g: reduced.count(g) for g in groups}
        pair_freq = {p: _count_overlapping(reduced, p) for p in group_pairs}

        feat = [group_freq[g] / length for g in groups]
        feat += [full.count(aa) / max(1, group_freq[group_of[aa]]) for aa in aalist]
        feat += [pair_freq[p] / (group_freq[p[0]] + 1) for p in group_pairs]
        rows.append(feat)

    if not rows:
        return np.empty((0, n_features))
    # Stack once at the end instead of growing the array row by row.
    return np.array(rows, dtype=float)


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored under /content/drive/MyDrive (such as the default AAindex table path
# used by AAINDEX) can be opened. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
