In [None]:
"""
Unified IDP Analysis: Nardini+ features, CIDER parameters, and SPARROW parameters
All in one environment, all in one output file
Uses the proven Nardini calculation method with added CIDER and SPARROW features

FEATURES INCLUDED:
- Nardini: 54 features (raw + z-score) - compositional, physicochemical, patches
- CIDER: 6 features - kappa, omega, delta, uversky_hydropathy, fraction_neutral, length
- SPARROW: 9 features - SCD, SHD, complexity, fraction_proline,
           scaled_rg, scaled_re, prefactor, scaling_exponent, asphericity
- SPARROW predictors: 6 features - per-sequence mean helix, beta, coil,
           mitochondrial-targeting, NES, and NIS scores

Z-SCORE HANDLING:
- Z-scores set to NaN when raw value is 0 (division by zero protection)
- Z-scores set to NaN when reference std dev is near-zero

"""

import numpy as np
import pandas as pd
import re
from localcider.sequenceParameters import SequenceParameters as CIDER_SP
from sparrow import Protein


In [None]:
import pandas as pd
import numpy as np
import re
import os
import torch
from localcider.sequenceParameters import SequenceParameters as CIDER_SP
from sparrow import Protein
import sparrow
from parrot import brnn_architecture, encode_sequence
from sparrow.sparrow_exceptions import SparrowException
from sparrow.predictors.dssp.dssp_predictor import DSSPPredictor
from sparrow.predictors.mitochondrial_targeting.mitochondrial_targeting_predictor import MitochondrialTargetingPredictor

# ============================================================================
# USER INPUTS
# ============================================================================

# Path of the FASTA file that is opened and parsed in the "LOAD FASTA FILE" section.
input_fasta_file = 'test_seqs.fasta'
# Base name of the output file; ".csv" or ".xlsx" is appended when saving.
output_filename = 'test_idp_features'
# Only the exact value 'csv' writes a CSV file; any other value writes an Excel file.
filetype = "csv"  # "csv" or "excel"

# When True, per-feature reference means/stds are downloaded and z-score columns are added.
calculate_zscores = True
# The second word of this name (split on a single space) is used as the worksheet name
# when the reference data is requested.
reference_species = "Saccharomyces cerevisiae"
# Options: "Homo sapiens", "Mus musculus", "Rattus norvegicus", "Xenopus tropicalis",
# "Drosophila melanogaster", "Danio rerio", "Saccharomyces cerevisiae",
# "Caenorhabditis elegans", "Arabidopsis thaliana"

# ============================================================================
# PREDICTOR CLASSES (NLS and NES)
# ============================================================================

def softmax(v):
    """Return the softmax of ``v``, normalised over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps large logits from overflowing to
    ``inf`` (which would otherwise turn the output into ``nan``).

    Parameters
    ----------
    v : array-like
        Logits. The floating-point dtype of an array input is preserved.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``v`` that sum to 1.
        An empty input yields an empty array.
    """
    v = np.asarray(v)
    if v.size == 0:
        # np.max raises on empty input; there is nothing to normalise.
        return np.exp(v)
    shifted = np.exp(v - np.max(v))
    return shifted / np.sum(shifted)

class NLSPredictor():
    """Per-residue nuclear import signal (NIS/NLS) scores from a saved network.

    The network shape (layer count, hidden size, number of classes) is read
    from the saved state dict rather than hard-coded.
    """

    def __init__(self, version="1"):
        """Locate the weights for ``version``, rebuild the network and load them.

        Raises
        ------
        SparrowException
            If the weights file does not exist on disk.
        """
        weights_path = sparrow.get_data(
            f'networks/nuclear_import_signal/nls_predictor_network_v{version}.pt'
        )
        if not os.path.isfile(weights_path):
            raise SparrowException(f'Error: could not find weights file {weights_path}')

        state = torch.load(weights_path, map_location=torch.device('cpu'))

        # Count consecutive 'lstm.weight_ih_l<k>' entries, starting from k = 0.
        layer_count = 0
        while True:
            if f'lstm.weight_ih_l{layer_count}' not in state:
                break
            layer_count += 1

        self.input_size = 20
        self.number_of_layers = layer_count
        self.number_of_classes = np.shape(state['fc.bias'])[0]
        # The first dimension of the input-hidden weights is divided by 4 to get
        # the hidden size (presumably the four stacked LSTM gates — TODO confirm).
        self.hidden_vector_size = int(np.shape(state['lstm.weight_ih_l0'])[0] / 4)

        self.network = brnn_architecture.BRNN_MtM(
            self.input_size,
            self.hidden_vector_size,
            layer_count,
            self.number_of_classes,
            'cpu',
        )
        self.network.load_state_dict(state)

    def predict_nuclear_import_signal(self, seq):
        """Return a list with the class-1 softmax score per residue, rounded to 5 places."""
        residues = seq.upper()
        encoded = encode_sequence.one_hot(residues).view(1, len(residues), -1)
        flat_output = self.network(encoded.float()).detach().numpy().flatten()
        per_residue = flat_output.reshape(-1, self.number_of_classes)
        probabilities = np.array([softmax(row) for row in per_residue])
        return [round(row[1], 5) for row in probabilities]

class NESPredictor():
    """Per-residue nuclear export signal (NES) scores from a saved network.

    The network shape (layer count, hidden size, number of classes) is read
    from the saved state dict rather than hard-coded.
    """

    def __init__(self, version="1"):
        """Locate the weights for ``version``, rebuild the network and load them.

        Raises
        ------
        SparrowException
            If the weights file does not exist on disk.
        """
        weights_path = sparrow.get_data(
            f'networks/nuclear_export_signal/nes_predictor_network_v{version}.pt'
        )
        if not os.path.isfile(weights_path):
            raise SparrowException(f'Error: could not find weights file {weights_path}')

        state = torch.load(weights_path, map_location=torch.device('cpu'))

        # Count consecutive 'lstm.weight_ih_l<k>' entries, starting from k = 0.
        layer_count = 0
        while True:
            if f'lstm.weight_ih_l{layer_count}' not in state:
                break
            layer_count += 1

        self.input_size = 20
        self.number_of_layers = layer_count
        self.number_of_classes = np.shape(state['fc.bias'])[0]
        # The first dimension of the input-hidden weights is divided by 4 to get
        # the hidden size (presumably the four stacked LSTM gates — TODO confirm).
        self.hidden_vector_size = int(np.shape(state['lstm.weight_ih_l0'])[0] / 4)

        self.network = brnn_architecture.BRNN_MtM(
            self.input_size,
            self.hidden_vector_size,
            layer_count,
            self.number_of_classes,
            'cpu',
        )
        self.network.load_state_dict(state)

    def predict_nes(self, seq):
        """Return a NumPy array with the (unrounded) class-1 softmax score per residue."""
        residues = seq.upper()
        encoded = encode_sequence.one_hot(residues).view(1, len(residues), -1)
        flat_output = self.network(encoded.float()).detach().numpy().flatten()
        per_residue = flat_output.reshape(-1, self.number_of_classes)
        probabilities = np.array([softmax(row) for row in per_residue])
        return np.array([row[1] for row in probabilities])

# ============================================================================
# INITIALIZE SPARROW PREDICTORS
# ============================================================================

# Build each predictor once, up front, so the per-sequence loop further down
# can reuse the same instances. The NES/NLS constructors load their network
# weights from disk and raise SparrowException if the weights file is missing.
print("Initializing SPARROW predictors...")
dssp_predictor = DSSPPredictor()
mito_predictor = MitochondrialTargetingPredictor()
nes_predictor = NESPredictor()
nls_predictor = NLSPredictor()
print("Predictors initialized.\n")

# ============================================================================
# LOAD REFERENCE DATA FOR Z-SCORES
# ============================================================================

# Decide where the ordered Nardini feature names come from: a built-in list
# when z-scores are disabled, otherwise the 'Feature' column of the reference
# sheet (which also supplies the 'Mean' and 'Std' columns used for z-scoring).
if not calculate_zscores:
    print("Z-score calculation disabled.\n")
    aas = 'ACDEFGHIKLMNPQRSTVY'
    # 20 single-residue fractions, then 14 summary features ...
    mycompfeats_all = [f'frac{residue}' for residue in 'ACDEFGHIKLMNPQRSTVWY']
    mycompfeats_all += ['fracpos', 'fracneg', 'fracpol', 'fracali', 'fracaro',
                        'fracRtoK', 'fracEtoD', 'fracexp', 'fcr', 'ncpr',
                        'mhydro', 'dispro', 'isopoi', 'ppii']
    # ... then one patch feature per residue in `aas` (no W), then the RG patch.
    mycompfeats_all.extend(f'patch{residue}' for residue in aas)
    mycompfeats_all.append('patchRG')
else:
    banner = "=" * 70
    print(banner)
    print(f"Loading reference data for: {reference_species}")
    print(banner)

    sheetID = '1yxt0R1G0gdI2bGpjYXgk_h7-1qA6EY6J'
    # The worksheet is named after the second word of the species name.
    worksheetName = reference_species.split(" ")[1]
    currurl = (
        f'https://docs.google.com/spreadsheets/d/{sheetID}'
        f'/gviz/tq?tqx=out:csv&sheet={worksheetName}'
    )

    speciesdf = pd.read_csv(currurl)
    mycompfeats_all = speciesdf['Feature'].tolist()
    meanvals_species = speciesdf['Mean'].values
    stdvals_species = speciesdf['Std'].values
    print(f"Loaded {len(mycompfeats_all)} Nardini features.\n")

# ============================================================================
# LOAD FASTA FILE
# ============================================================================

# Parse the FASTA file into two parallel lists: `subnames` (header text without
# the leading '>') and `subseqs` (upper-cased sequences, with multi-line
# sequences joined together).
#
# Every header always produces exactly one sequence entry, even when the record
# has no sequence lines (the entry is then ''), so names and sequences can
# never drift out of step. Downstream sections treat '' as invalid and emit NaN.
print(f"Loading FASTA file: {input_fasta_file}")

subseqs, subnames = [], []
current_chunks = None  # None until the first record has started

with open(input_fasta_file, 'r') as myfile:
    for line in myfile:
        cleanline = line.strip()
        if not cleanline:
            continue  # blank lines carry no information
        if cleanline.startswith('>'):
            # Close the previous record (if any) before opening a new one.
            if current_chunks is not None:
                subseqs.append(''.join(current_chunks).upper())
            subnames.append(cleanline[1:])
            current_chunks = []
        else:
            if current_chunks is None:
                # Sequence text before any header: keep it under a placeholder
                # name instead of letting it shift every later name by one.
                subnames.append('unnamed_sequence')
                current_chunks = []
            current_chunks.append(cleanline)

if current_chunks is not None:
    subseqs.append(''.join(current_chunks).upper())

if not subseqs:
    raise ValueError(f"No sequences found in FASTA file: {input_fasta_file}")

print(f"Loaded {len(subseqs)} sequences\n")

# ============================================================================
# CALCULATE NARDINI+ COMPOSITIONAL FEATURES
# ============================================================================

def _patch_fraction(seq, residues, qualifies, max_interrupt):
    """Return the fraction of ``seq`` covered by qualifying patches of ``residues``.

    Positions holding any character of ``residues`` are marked. A gap between
    two consecutive marked positions is also marked when it holds at most
    ``max_interrupt`` other residues. Each maximal run of marked positions is
    a candidate patch; it counts only if ``qualifies(segment)`` is true.

    Parameters
    ----------
    seq : str
        Non-empty sequence.
    residues : str
        Characters that seed a patch (e.g. ``'K'`` or ``'RG'``).
    qualifies : callable
        Takes the candidate segment (str) and returns True to count it.
    max_interrupt : int
        Largest number of interrupting residues bridged between two seeds.

    Returns
    -------
    float
        Total length of qualifying segments divided by ``len(seq)``.
    """
    seeds = [i for i, ltr in enumerate(seq) if ltr in residues]
    mask = ['0'] * len(seq)
    for i in seeds:
        mask[i] = '1'
    for left, right in zip(seeds, seeds[1:]):
        if 1 < right - left <= max_interrupt + 1:
            for i in range(left + 1, right):
                mask[i] = '1'

    covered = 0
    for m in re.finditer(r"1+", ''.join(mask)):
        segment = seq[m.start():m.end()]
        if qualifies(segment):
            covered += len(segment)
    return covered / len(seq)


# numInt: max interruptions bridged inside a patch; minBlockLen: min residue count per patch.
numInt, minBlockLen = 2, 4
# Residues that get a single-residue patch feature (W is not in this list).
aas = 'ACDEFGHIKLMNPQRSTVY'

_all_residues = 'ACDEFGHIKLMNPQRSTVWY'
_summary_feats = ['fracpos', 'fracneg', 'fracpol', 'fracali', 'fracaro', 'fracRtoK',
                  'fracEtoD', 'fracexp', 'fcr', 'ncpr', 'mhydro', 'dispro', 'isopoi', 'ppii']

# One list per feature; each list receives one value per input sequence.
feat_lists = {name: [] for name in [f'frac{aa}' for aa in _all_residues] + _summary_feats}
fracpatch = [[] for _ in range(len(aas))]
rgpatch = []

print("Calculating Nardini+ compositional features...")

for currseq in subseqs:
    # Empty sequences and sequences containing X/U/Z/J/B/O get NaN for every feature.
    if len(currseq) < 1 or any(x in currseq for x in "XUZJBO"):
        for values in feat_lists.values():
            values.append(np.nan)
        for patch_values in fracpatch:
            patch_values.append(np.nan)
        rgpatch.append(np.nan)
        continue

    SeqOb = CIDER_SP(currseq)
    slen = SeqOb.get_length()
    aafrac = SeqOb.get_amino_acid_fractions()

    row = {
        'fracexp': SeqOb.get_fraction_expanding(),
        'fcr': SeqOb.get_FCR(),
        'ncpr': SeqOb.get_NCPR(),
        'mhydro': SeqOb.get_mean_hydropathy(),
        'dispro': SeqOb.get_fraction_disorder_promoting(),
        'isopoi': SeqOb.get_isoelectric_point(),
        'ppii': SeqOb.get_PPII_propensity(mode='hilser'),
    }
    for aa in _all_residues:
        row[f'frac{aa}'] = aafrac[aa]
    row['fracpos'] = aafrac['K'] + aafrac['R']
    row['fracneg'] = aafrac['D'] + aafrac['E']
    row['fracpol'] = sum(aafrac[a] for a in 'QNSTGCH')
    row['fracali'] = sum(aafrac[a] for a in 'ALMIV')
    row['fracaro'] = sum(aafrac[a] for a in 'FWY')
    # Log ratios use counts + 1 so that a missing residue does not give log(0).
    row['fracRtoK'] = np.log10((slen*aafrac['R']+1)/(slen*aafrac['K']+1))
    row['fracEtoD'] = np.log10((slen*aafrac['E']+1)/(slen*aafrac['D']+1))

    for key, value in row.items():
        feat_lists[key].append(value)

    # Single-residue patches: a run counts when it holds >= minBlockLen copies of the residue.
    for idx, aa in enumerate(aas):
        fracpatch[idx].append(
            _patch_fraction(currseq, aa, lambda seg, aa=aa: seg.count(aa) >= minBlockLen, numInt)
        )

    # RG patch: seeded by R or G; a run counts when it holds >= 2 'RG' dipeptides.
    rgpatch.append(_patch_fraction(currseq, 'RG', lambda seg: seg.count('RG') >= 2, numInt))

# Combine Nardini features in the column order used for the feature names:
# 20 fractions, 14 summary features, per-residue patches, RG patch.
compfeatvals_all = [feat_lists[f'frac{aa}'] for aa in _all_residues]
compfeatvals_all += [feat_lists[k] for k in _summary_feats]
compfeatvals_all += fracpatch + [rgpatch]

print("Nardini+ features calculated.\n")

# ============================================================================
# CREATE NARDINI DATAFRAME
# ============================================================================

# Transpose the per-feature lists into a (sequences x features) array and wrap
# it in a DataFrame; optionally append z-scores against the reference species.
num_seqs, num_feats = len(subseqs), len(mycompfeats_all)
feature_columns = [compfeatvals_all[f] for f in range(num_feats)]
subrawcomp = np.array([[column[s] for column in feature_columns] for s in range(num_seqs)])
raw_df = pd.DataFrame(
    data=subrawcomp,
    columns=[f"nardini_{feat}_raw" for feat in mycompfeats_all],
)

if not calculate_zscores:
    nardini_df = raw_df
else:
    print("Calculating z-scores...")
    min_std = 1e-10

    # Compute every z-score at once; divisions by a near-zero std are masked
    # out right below, so the floating-point warnings they raise are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        all_zscores = (subrawcomp - meanvals_species) / stdvals_species
    # A z-score is reported as NaN when the raw value is exactly 0 or when the
    # reference standard deviation is below min_std.
    undefined = (subrawcomp == 0) | (stdvals_species < min_std)
    subzveccomp = np.where(undefined, np.nan, all_zscores)

    zscore_df = pd.DataFrame(
        subzveccomp,
        columns=[f"nardini_{feat}_zscore" for feat in mycompfeats_all],
    )
    nardini_df = pd.concat([raw_df, zscore_df], axis=1)

# ============================================================================
# CALCULATE CIDER AND SPARROW PARAMETERS
# ============================================================================

# Compute one row of CIDER parameters and one row of SPARROW parameters per
# sequence. Any value that cannot be computed is stored as NaN so that the
# resulting frames always have one row per input sequence.
print("Calculating CIDER and SPARROW parameters...")

cider_keys = ['cider_kappa', 'cider_omega', 'cider_delta',
              'cider_uversky_hydropathy', 'cider_fraction_neutral', 'cider_length']
sparrow_keys = ['sparrow_SCD', 'sparrow_SHD', 'sparrow_complexity',
                'sparrow_fraction_proline', 'sparrow_scaled_rg', 'sparrow_scaled_re',
                'sparrow_prefactor', 'sparrow_scaling_exponent', 'sparrow_asphericity']

# SPARROW values obtained through `prot.predictor`; each is attempted
# independently so one failing predictor does not blank out the others.
sparrow_predicted = [
    ('sparrow_scaled_rg', lambda p: p.predictor.radius_of_gyration(use_scaled=True)),
    ('sparrow_scaled_re', lambda p: p.predictor.end_to_end_distance(use_scaled=True)),
    ('sparrow_prefactor', lambda p: p.predictor.prefactor()),
    ('sparrow_scaling_exponent', lambda p: p.predictor.scaling_exponent()),
    ('sparrow_asphericity', lambda p: p.predictor.asphericity()),
]

cider_data, sparrow_data = [], []
cider_failures = sparrow_failures = 0

for currseq in subseqs:
    # Empty sequences and sequences containing X/U/Z/J/B/O are not analysed.
    if len(currseq) < 1 or any(x in currseq for x in "XUZJBO"):
        cider_data.append(dict.fromkeys(cider_keys, np.nan))
        sparrow_data.append(dict.fromkeys(sparrow_keys, np.nan))
        continue

    # CIDER
    # `except Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
    try:
        SeqOb = CIDER_SP(currseq)
        cider_row = {
            'cider_kappa': SeqOb.get_kappa(),
            'cider_omega': SeqOb.get_Omega(),
            'cider_delta': SeqOb.get_delta(),
            'cider_uversky_hydropathy': SeqOb.get_uversky_hydropathy(),
            'cider_fraction_neutral': SeqOb.get_countNeut() / SeqOb.get_length(),
            'cider_length': SeqOb.get_length(),
        }
    except Exception:
        cider_failures += 1
        cider_row = dict.fromkeys(cider_keys, np.nan)
    cider_data.append(cider_row)

    # SPARROW
    try:
        prot = Protein(currseq)
        sparrow_row = {
            'sparrow_SCD': prot.SCD,
            'sparrow_SHD': prot.SHD,
            'sparrow_complexity': prot.complexity,
            'sparrow_fraction_proline': prot.fraction_proline,
        }
    except Exception:
        sparrow_failures += 1
        sparrow_row = dict.fromkeys(sparrow_keys, np.nan)
    else:
        for key, compute in sparrow_predicted:
            try:
                sparrow_row[key] = compute(prot)
            except Exception:
                sparrow_row[key] = np.nan
    sparrow_data.append(sparrow_row)

# Report failures instead of hiding them behind NaN rows.
if cider_failures:
    print(f"  Note: CIDER calculation failed for {cider_failures} sequence(s); values set to NaN.")
if sparrow_failures:
    print(f"  Note: SPARROW calculation failed for {sparrow_failures} sequence(s); values set to NaN.")

cider_df = pd.DataFrame(cider_data)
sparrow_df = pd.DataFrame(sparrow_data)
print("CIDER and SPARROW calculation complete.\n")

# ============================================================================
# CALCULATE SPARROW PREDICTOR FEATURES (DSSP, Mito, NES, NIS)
# ============================================================================

# Average each per-residue predictor output over the sequence. Every predictor
# is attempted independently; a failure stores NaN for that predictor only.
print("Calculating SPARROW predictor features (DSSP, Mito, NES, NIS)...")


def _dssp_means(seq):
    """Return the means of columns 0, 1 and 2 of the DSSP probability array for ``seq``."""
    dssp_probs = dssp_predictor.predict_dssp_probabilities(seq)
    return [dssp_probs[:, col].mean() for col in range(3)]


# (label used in messages, output column names, function returning one value per column)
predictor_specs = [
    ('DSSP',
     ['sparrow_avg_helix_prob', 'sparrow_avg_beta_prob', 'sparrow_avg_coil_prob'],
     _dssp_means),
    ('Mito targeting',
     ['sparrow_avg_mito_targeting'],
     lambda seq: [np.array(mito_predictor.predict_mitochondrial_targeting(seq)).mean()]),
    ('NES',
     ['sparrow_avg_nes'],
     lambda seq: [nes_predictor.predict_nes(seq).mean()]),
    ('NIS',
     ['sparrow_avg_nis'],
     lambda seq: [np.array(nls_predictor.predict_nuclear_import_signal(seq)).mean()]),
]
predictor_keys = [key for _, keys, _ in predictor_specs for key in keys]
failure_counts = {label: 0 for label, _, _ in predictor_specs}

predictor_data = []

for idx, currseq in enumerate(subseqs):
    if (idx + 1) % 100 == 0:
        print(f"  Processing sequence {idx + 1}/{len(subseqs)}...")

    # Empty sequences and sequences containing X/U/Z/J/B/O are not analysed.
    if len(currseq) < 1 or any(x in currseq for x in "XUZJBO"):
        predictor_data.append(dict.fromkeys(predictor_keys, np.nan))
        continue

    pred_dict = {}
    for label, keys, compute in predictor_specs:
        try:
            values = compute(currseq)
        except Exception as e:
            # Show the first failure of each predictor wherever it happens
            # (not only for the first sequence); later ones are only counted.
            if failure_counts[label] == 0:
                print(f"  Note: {label} prediction failed: {e}")
            failure_counts[label] += 1
            values = [np.nan] * len(keys)
        pred_dict.update(zip(keys, values))

    predictor_data.append(pred_dict)

for label, count in failure_counts.items():
    if count:
        print(f"  {label} prediction failed for {count} sequence(s); values set to NaN.")

predictor_df = pd.DataFrame(predictor_data)
print("SPARROW predictor features calculated.\n")

# ============================================================================
# COMBINE ALL FEATURES
# ============================================================================

# Place names, sequences and every feature frame side by side (column-wise),
# then print how many columns each feature family contributed.
name_df = pd.DataFrame({'Name': subnames})
seq_df = pd.DataFrame({'Sequence': subseqs})

final_df = pd.concat(
    [name_df, seq_df, nardini_df, cider_df, sparrow_df, predictor_df],
    axis=1,
)

all_columns = list(final_df.columns)
rule = "=" * 70

n_nardini_raw = sum(1 for c in all_columns if 'nardini_' in c and '_raw' in c)
n_cider = sum(1 for c in all_columns if c.startswith('cider_'))
n_sparrow_basic = sum(1 for c in all_columns if c.startswith('sparrow_') and 'avg_' not in c)
n_sparrow_pred = sum(1 for c in all_columns if 'sparrow_avg_' in c)

print(rule)
print("UNIFIED DATAFRAME SUMMARY")
print(rule)
print(f"Total sequences: {len(final_df)}")
print(f"Total columns: {len(all_columns)}")
print(f"  - Nardini raw: {n_nardini_raw}")
if calculate_zscores:
    n_nardini_z = sum(1 for c in all_columns if 'nardini_' in c and '_zscore' in c)
    print(f"  - Nardini z-scores: {n_nardini_z}")
print(f"  - CIDER: {n_cider}")
print(f"  - SPARROW basic: {n_sparrow_basic}")
print(f"  - SPARROW predictors: {n_sparrow_pred}")
print(rule + "\n")

# ============================================================================
# SAVE OUTPUT FILE
# ============================================================================

# Write the combined table without the index column. Only the exact value
# 'csv' selects CSV output; every other `filetype` value writes an .xlsx file.
write_csv = filetype == 'csv'
extension = 'csv' if write_csv else 'xlsx'
output_file = f'{output_filename}.{extension}'

save = final_df.to_csv if write_csv else final_df.to_excel
save(output_file, index=False)

print(f'Saved to: {output_file}')

Initializing SPARROW predictors...
Predictors initialized.

Loading reference data for: Saccharomyces cerevisiae
Loaded 54 Nardini features.

Loading FASTA file: test_seqs.fasta
Loaded 4 sequences

Calculating Nardini+ compositional features...
Nardini+ features calculated.

Calculating z-scores...
Calculating CIDER and SPARROW parameters...
CIDER and SPARROW calculation complete.

Calculating SPARROW predictor features (DSSP, Mito, NES, NIS)...
SPARROW predictor features calculated.

UNIFIED DATAFRAME SUMMARY
Total sequences: 4
Total columns: 131
  - Nardini raw: 54
  - Nardini z-scores: 54
  - CIDER: 6
  - SPARROW basic: 9
  - SPARROW predictors: 6

Saved to: test_idp_features.csv
Analysis complete!
