In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the project paths used later in the
# notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m133.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.linalg import eigvals
from Bio import SeqIO
from scipy.linalg import eig




# ME: Calculates the eigenvalue-based metric from matrix W
def ME(W):
    """Return the leading eigenvalue of a curve's distance-ratio matrix, scaled by its size.

    The rows of ``W`` are treated as consecutive points of a curve. The first
    row is discarded, and for the remaining ``x`` points a matrix ``L`` is built
    whose ``(i, j)`` entry is the straight-line (Euclidean) distance between
    points ``i`` and ``j`` divided by the path length travelled along the curve
    from ``i`` to ``j`` (sum of consecutive-point distances). The diagonal of
    ``L`` is zero.

    :param W: 2D array-like of points (one point per row), or a 1D array, which
        is interpreted as a column of scalar points.
    :return: Largest real eigenvalue of ``L`` divided by ``x``. Returns ``0.0``
        when fewer than two points remain after dropping the first row.
    """
    W = np.asarray(W, dtype=float)
    if W.ndim == 1:
        # If W is a 1D array, reshape it to 2D (one scalar point per row)
        W = W.reshape(-1, 1)

    W = W[1:, :]  # Remove the first row
    x = W.shape[0]
    if x < 2:
        # No pair of points exists, so every ratio is zero.
        return 0.0

    # Straight-line distance between every pair of points.
    E = squareform(pdist(W))

    # Path length along the curve between points i and j: difference of the
    # cumulative consecutive-point distances (E's first super-diagonal).
    steps = np.diagonal(E, offset=1)
    arc = np.concatenate(([0.0], np.cumsum(steps)))
    sdist = np.abs(arc[:, np.newaxis] - arc[np.newaxis, :])

    # Ones on the diagonal keep the self-ratio 0/1 well defined.
    sdd = sdist + np.eye(x)

    # Coincident points give 0/0; those entries are left at 0 instead of NaN,
    # which would make the eigenvalue solver reject the matrix.
    L = np.divide(E, sdd, out=np.zeros_like(E), where=sdd > 0)

    # Get the eigenvalues and find the largest (L is symmetric, so the
    # imaginary parts are numerical noise at most).
    eigenvalues = eig(L, right=False)
    largest_eigval = np.max(eigenvalues.real)

    return largest_eigval / x



# GRS: Calculates the geometric representation of the sequence in space
def GRS(seq, P, V, M):
    """Walk the sequence through 3D space once per row of ``M`` and return the end points.

    For every row of ``M`` the walk starts at the origin. Each residue of
    ``seq`` is matched against that row; the match mask selects a row of ``P``
    as the step, or ``[0, 0, 1]`` when the residue does not occur in the row.
    From the second residue on, a running offset ``d`` is rescaled by
    ``(i - 1) / i`` (``i`` is the 0-based position) and, when both the previous
    and the current residue matched, ``V[prev_index, cur_index] / i`` is added
    to it; ``d`` is then added to the step.

    :param seq: Sequence of residues (indexable, e.g. a string).
    :param P: Step vectors, one 3-vector per column of ``M``
        (assumed shape (20, 3) -- TODO confirm against caller).
    :param V: Pair vectors indexed as ``V[prev_index, cur_index]``
        (assumed shape (20, 20, 3) -- TODO confirm against caller).
    :param M: 2D array whose rows are residue orderings to match against.
    :return: Array with one row per row of ``M`` holding the final position of
        the corresponding walk (intermediate positions are not kept).
    """
    unit_z = np.array([0, 0, 1])
    endpoints = []

    for row in range(M.shape[0]):
        ordering = M[row, :]
        position = np.zeros(3)
        drift = np.zeros(3)
        prev_hit = np.zeros(20)

        for idx, residue in enumerate(seq):
            # Boolean mask marking where this residue sits in the current ordering.
            hit = (residue == ordering)

            if idx == 0:
                # First residue: plain step, the running offset stays untouched.
                position = position + hit.dot(P)
            else:
                # The offset is always damped first, whichever case applies.
                drift = drift * (idx - 1) / idx

                if not hit.any():
                    # Residue not present in this ordering: move straight up.
                    step = unit_z
                else:
                    step = hit.dot(P)
                    if prev_hit.any():
                        # Both neighbours matched: fold their pair vector in.
                        prev_index = np.nonzero(prev_hit)[0][0]
                        cur_index = np.nonzero(hit)[0][0]
                        drift = drift + V[prev_index, cur_index] / idx

                position = position + step + drift

            prev_hit = hit

        endpoints.append(position)

    return np.array(endpoints)


# Coordinate: Generates 3D coordinates for 20 amino acids
def coordinate():
    """Build the point table ``P`` and the pair table ``V``.

    ``P[i]`` is the point ``(cos(2*pi*i/20), sin(2*pi*i/20), 1)``, i.e. 20
    points evenly spaced on the unit circle in the plane z = 1.
    ``V[i, j]`` is the point a quarter of the way from ``P[i]`` to ``P[j]``.

    :return: Tuple ``(P, V)`` with shapes (20, 3) and (20, 20, 3).
    """
    n_points = 20

    P = np.zeros((n_points, 3))
    for idx in range(n_points):
        angle = idx * 2 * np.pi / 20
        P[idx] = [np.cos(angle), np.sin(angle), 1]

    # Broadcast every start point against every end point instead of looping.
    starts = P[:, np.newaxis, :]
    ends = P[np.newaxis, :, :]
    V = starts + 0.25 * (ends - starts)

    return P, V


# FEGS: Extract features from sequences in a FASTA file
def FEGS(fasta_file):
    """Return the feature matrix for the sequences in a FASTA file.

    The result has one row per FASTA record and 158 + 20 + 400 = 578 columns,
    laid out as three consecutive blocks (``EL``, ``FA``, ``FD``).

    NOTE(review): as written, none of the three blocks is ever filled in --
    the coordinates, the parsed sequences, ``g_p`` and ``char`` are set up but
    not used -- so the returned matrix is all zeros. The per-sequence feature
    computation appears to be missing and must be restored before the output
    is meaningful.

    :param fasta_file: Path (or handle) accepted by ``SeqIO.parse``.
    :return: ``numpy`` array of shape (number of records, 578).
    """
    # Coordinate tables for the graphical representation (currently unused).
    P, V = coordinate()

    # Read every record's sequence as a plain string.
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(record.seq))
    n_sequences = len(sequences)

    # Placeholders for the per-sequence computation (currently unused).
    g_p = []
    char = "ARNDCQEGHILKMFPSTWYV"

    # One zero-initialised block per feature family: EL, FA, FD.
    block_widths = (158, 20, 400)
    feature_blocks = [np.zeros((n_sequences, width)) for width in block_widths]

    # Combine all features
    return np.hstack(feature_blocks)

# Load sequences from a specific column in a CSV file
def load_sequences_from_csv(file_path, column_name):
    """
    Read one column of a CSV file and return its non-missing entries as strings.
    :param file_path: Path to the CSV file.
    :param column_name: Name of the column containing the sequences.
    :return: List of the column's values converted to ``str``, with missing
        values dropped and the original row order kept.
    :raises ValueError: If ``column_name`` is not a column of the CSV file.
    """
    table = pd.read_csv(file_path)

    if column_name in table.columns:
        present = table[column_name].dropna()
        return present.astype(str).tolist()

    raise ValueError(f"Column '{column_name}' not found in the CSV file.")


# Project root on the mounted Google Drive.
path = '/content/drive/MyDrive/Watashara_Projects/neurotoxic/'

# Input FASTA and output CSV, both relative to the project root.
fasta_path = path + "Features_extraction/datasets/combined/independent_dataset_combined.fasta"
output_csv = path + 'features/combined/IND_FEGS.csv'

# Alternative input (disabled): load sequences from a CSV column instead.
# sequences = load_sequences_from_csv(csv_file, column_name)

# Extract the feature matrix and store it without the index column.
features = FEGS(fasta_path)
pd.DataFrame(features).to_csv(output_csv, index=False)

