In [None]:
## Generate mutated BRCA2 protein sequences from the raw ClinVar variant files

import re

def load_sequence(filepath):
    """Read a sequence file and return its content as one continuous string.

    The file is read as UTF-8, every newline character is removed, and
    leading/trailing whitespace is trimmed from the joined result.

    Args:
        filepath: Path of the text file holding the sequence.

    Returns:
        The file content with newlines removed and outer whitespace stripped.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    joined = raw_text.replace('\n', '')
    return joined.strip()

def parse_variant_line(line):
    """Parse one tab-separated variant row into ``(variant_id, protein_change)``.

    The first column is the variant name and the third column is the protein
    change. The protein change is prefixed with ``p.`` when it does not
    already start with it.

    The variant identifier is the coding-DNA description found in the name:
    everything from ``c.`` up to the next whitespace or parenthesis. Taking
    the whole token (instead of only digits, ``+``, ``_``, ``>`` and bases)
    keeps descriptions such as ``c.68-7T>A`` or ``c.53_54delinsAT`` intact
    rather than truncating them to ``c.68`` / ``c.53_54``, which would make
    distinct variants share the same identifier.

    Args:
        line: One raw line of the tab-separated variant file.

    Returns:
        A ``(variant_id, protein_change)`` tuple, or ``None`` when the row has
        fewer than three columns. When no ``c.`` description is found, the
        full variant name is used as the identifier.
    """
    fields = line.strip().split('\t')
    if len(fields) < 3:
        return None
    variant_name = fields[0].strip()  # e.g. NM_000059.4(BRCA2):c.53G>A (p.Arg18His)
    protein_change = fields[2].strip()  # e.g. R18H
    if not protein_change.startswith("p."):
        protein_change = "p." + protein_change

    # Extract the complete c. description (e.g. c.53G>A, c.68-7T>A)
    c_variant_match = re.search(r'c\.[^\s()]+', variant_name)
    variant_id = c_variant_match.group() if c_variant_match else variant_name
    return variant_id, protein_change

def apply_mutation(sequence, protein_change):
    """Apply a single amino-acid substitution to a protein sequence.

    ``protein_change`` must start with ``p.`` followed by the original
    residue, a 1-based position and the new residue, e.g. ``p.R18H`` or
    ``p.Arg18His``. Residues may be given as one-letter or three-letter codes.
    Anything after the substitution that is not a letter (for example
    ``", R20H"``) is ignored.

    Args:
        sequence: Reference protein sequence (one-letter codes).
        protein_change: Protein change description beginning with ``p.``.

    Returns:
        The mutated sequence, or ``None`` when the description cannot be
        parsed as a substitution, a residue code is unknown, or the position
        lies outside the sequence (including position 0).

    Note:
        When the reference residue at the position differs from the one named
        in ``protein_change`` a warning is printed and the substitution is
        applied anyway.
    """
    three_to_one = {
        'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C',
        'Gln':'Q','Glu':'E','Gly':'G','His':'H','Ile':'I',
        'Leu':'L','Lys':'K','Met':'M','Phe':'F','Pro':'P',
        'Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V',
        'Sec':'U','Pyl':'O','Asx':'B','Glx':'Z','Xle':'J','Ter':'*'
    }

    def convert(aa):
        """Return the one-letter code for a one- or three-letter residue name."""
        if len(aa) == 1:
            return aa.upper()
        return three_to_one.get(aa.capitalize(), None)

    # The negative lookahead rejects descriptions where more letters follow the
    # new residue (e.g. "p.Lys2ArgfsTer5"): those are frameshifts/extensions,
    # not substitutions, and must not be applied as a single-residue swap.
    match = re.match(
        r'p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})(?![A-Za-z])',
        protein_change,
    )
    if not match:
        return None

    aa_orig, pos_str, aa_new = match.groups()
    pos = int(pos_str) - 1  # convert the 1-based position to a 0-based index
    orig_aa = convert(aa_orig)
    new_aa = convert(aa_new)

    if orig_aa is None or new_aa is None:
        return None
    # Position 0 would give index -1, which Python accepts as "last element"
    # and which would corrupt the slicing below, so reject it explicitly.
    if pos < 0 or pos >= len(sequence):
        return None
    if sequence[pos] != orig_aa:
        # Warn but still try to apply the mutation
        print(f"Warning: expected {orig_aa} at position {pos+1}, found {sequence[pos]}")

    return sequence[:pos] + new_aa + sequence[pos+1:]

def process_variants(sequence, file_path, label):
    """Build the mutated sequences for every usable row of a variant file.

    The first line of the file is treated as a header and discarded. Blank
    lines, rows that ``parse_variant_line`` rejects and rows for which
    ``apply_mutation`` returns nothing are skipped.

    Args:
        sequence: Reference protein sequence the mutations are applied to.
        file_path: Path of the tab-separated variant file (read as UTF-8).
        label: Value attached to every variant coming from this file.

    Returns:
        A list of ``(variant_id, mutated_sequence, label)`` tuples in file
        order.
    """
    collected = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        handle.readline()  # discard the header row
        for raw_line in handle:
            if raw_line.strip():
                parsed = parse_variant_line(raw_line)
                if parsed is not None:
                    variant_id, protein_change = parsed
                    mutated = apply_mutation(sequence, protein_change)
                    if mutated:
                        collected.append((variant_id, mutated, label))
    return collected

def save_variants(variants, output_file):
    """Write the variants to ``output_file`` as a pipe-delimited table.

    The file is written as UTF-8 and starts with a header row and a dashed
    separator row, followed by one row per variant.

    Args:
        variants: Iterable of ``(variant_id, mutated_sequence, label)`` tuples.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines([
            "| Variant | Séquence Mutée | Label |\n",
            "|---------|----------------|-------|\n",
        ])
        sink.writelines(
            f"| {var_id} | {mut_seq} | {label} |\n"
            for var_id, mut_seq, label in variants
        )

def main():
    """Generate the labelled table of mutated BRCA2 sequences.

    Loads the reference sequence, applies the variants of the benign file
    (label 0) and then of the pathogenic file (label 1), writes all resulting
    sequences to the output table and prints how many were written.
    """
    seq_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/raw/brca2_sequence.txt"
    benign_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/raw/clinvar_benign.txt"
    pathogenic_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/raw/clinvar_pathogenic.txt"
    output_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/mutation/brca2_mutated_sequences.txt"

    reference_seq = load_sequence(seq_file)

    # Benign rows come first, pathogenic rows second, as in the output table.
    all_variants = []
    for source_path, class_label in ((benign_file, 0), (pathogenic_file, 1)):
        all_variants.extend(process_variants(reference_seq, source_path, class_label))

    save_variants(all_variants, output_file)
    print(f"{len(all_variants)} variants written to {output_file}")


if __name__ == "__main__":
    main()



In [None]:
# EMBEDDINGS

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# === Load the ProtBERT model and its tokenizer ===
# eval() puts the module in evaluation mode and returns the module itself,
# so the call can be chained onto the load.
model = BertModel.from_pretrained("Rostlab/prot_bert").eval()
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# === Embedding helper ===
def get_embedding(sequence):
    """Return a fixed-size embedding of ``sequence`` as a list of floats.

    The stripped sequence is split into single characters joined by spaces,
    tokenized, and passed through the model without gradient tracking. The
    last hidden state is averaged over dim 1 (presumably the token axis —
    confirm against the model output shape) and returned as a plain list.
    """
    residues = sequence.strip()
    model_inputs = tokenizer(' '.join(residues), return_tensors='pt')
    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().tolist()

# === Files ===
input_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/mutation/brca2_mutated_sequences.txt"
output_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/embeddings/brca2_embeddings.txt"

# === Parse the pipe-delimited table of mutated sequences ===
variants, sequences, labels = [], [], []

with open(input_file, 'r', encoding='utf-8') as f:
    for raw_row in f:
        row = raw_row.strip()
        # Skip blank lines, the header row and the dashed separator row
        if not row or row.startswith(("| Variant", "|---------")):
            continue

        cells = [cell.strip() for cell in row.strip('|').split('|')]
        if len(cells) != 3:
            continue  # incomplete row

        name, seq, tag = cells
        if not (seq and tag):
            continue
        variants.append(name)
        sequences.append(seq)
        labels.append(int(tag))

# === Compute the embeddings and write one record per variant ===
with open(output_file, 'w', encoding='utf-8') as out_f:
    for name, seq, tag in zip(variants, sequences, labels):
        vector_text = ' '.join(f"{x:.6f}" for x in get_embedding(seq))
        out_f.writelines([
            f"Variant: {name}\n",
            f"Label: {tag}\n",
            f"Embedding: {vector_text}\n",
            "=" * 80 + "\n\n",
        ])

print(f"✅ Embeddings saved to: {output_file}")


# HYDROPATHY

In [None]:
# === Kyte-Doolittle hydropathy scale (one-letter residue code -> score) ===
hydropathy_dict = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5,
    'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4,
    'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9,
    'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8,
    'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
}


# === Sequence-to-profile conversion ===
def encode_hydropathy(sequence, max_len=1024):
    """Convert a sequence into a hydropathy profile of exactly ``max_len`` scores.

    Each character of the stripped sequence is upper-cased and looked up in
    ``hydropathy_dict``; characters without an entry score 0.0. Profiles
    shorter than ``max_len`` are right-padded with 0.0, longer ones are cut
    off after ``max_len`` residues.

    Args:
        sequence: Protein sequence in one-letter codes.
        max_len: Length of the returned profile.

    Returns:
        A list of floats.
    """
    profile = [hydropathy_dict.get(residue.upper(), 0.0) for residue in sequence.strip()]
    shortfall = max_len - len(profile)
    if shortfall > 0:
        return profile + [0.0] * shortfall
    return profile[:max_len]

# === Paths ===
input_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/mutation/brca2_mutated_sequences.txt"
output_file = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/hydrophobicity/brca2_hydrophobicity.txt"

# === Read and parse the pipe-delimited table ===
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

variants = []
sequences = []
labels = []

for line in lines:
    row = line.strip()
    # Skip blank lines, the header row and the dashed separator row by content.
    # Previously the separator row was parsed as a record and then hidden by
    # starting the write loop at index 1, which silently dropped the first
    # real variant whenever the separator row was absent.
    if not row or row.startswith("| Variant") or row.startswith("|---"):
        continue
    parts = row.split('|')
    # A row "| a | b | c |" splits into ['', a, b, c, ''], hence indices 1..3.
    if len(parts) >= 4:
        variants.append(parts[1].strip())
        sequences.append(parts[2].strip())
        labels.append(parts[3].strip())

# === Write one hydropathy record per variant ===
with open(output_file, 'w', encoding='utf-8') as out:
    for variant, sequence, label in zip(variants, sequences, labels):
        hydro = encode_hydropathy(sequence)
        hydro_str = ' '.join([f"{x:.2f}" for x in hydro])
        out.write(f"Variant: {variant}\n")
        out.write(f"Label: {label}\n")
        out.write(f"Hydropathy: {hydro_str}\n")
        out.write("="*80 + "\n\n")

print(f"✅ Hydropathic profiles saved to: {output_file}")


# EMBEDDINGS + HYDROPATHY

In [None]:
import pandas as pd
import numpy as np

def parse_file(file_path):
    """Parse either the embeddings or hydropathy file"""
    data = []
    current_variant = None
    current_label = None
    current_values = []
    
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith("Variant:"):
                if current_variant is not None:
                    data.append({
                        'Variant': current_variant,
                        'Label': current_label,
                        'Values': np.array(current_values, dtype=float)
                    })
                current_variant = line.split()[-1]
                current_values = []
            elif line.startswith("Label:"):
                current_label = int(line.split()[-1])
            elif line.startswith("Embedding:") or line.startswith("Hydropathy:"):
                values = line.split()[1:]
                current_values.extend([float(x) for x in values])
            elif line and not line.startswith("="):
                # Continuation of values
                current_values.extend([float(x) for x in line.split()])
    
    # Add the last entry
    if current_variant is not None:
        data.append({
            'Variant': current_variant,
            'Label': current_label,
            'Values': np.array(current_values, dtype=float)
        })
    
    return pd.DataFrame(data)

import os

# File paths (using raw strings to handle Windows paths)
embedding_path = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/embeddings/brca2_embeddings.txt"
hydropathy_path = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/hydrophobicity/brca2_hydrophobicity.txt"

# Load the data
print("Loading embedding data...")
embeddings_df = parse_file(embedding_path)
print(f"Loaded {len(embeddings_df)} embedding records")

print("Loading hydropathy data...")
hydropathy_df = parse_file(hydropathy_path)
print(f"Loaded {len(hydropathy_df)} hydropathy records")

# Verify we have matching variants.
# Comparing plain lists (after a length check) instead of the two Series with
# `==` avoids the ValueError pandas raises for Series of different length,
# which previously made the merge fallback unreachable exactly when the two
# files held a different number of records.
same_order = (
    len(embeddings_df) == len(hydropathy_df)
    and embeddings_df['Variant'].tolist() == hydropathy_df['Variant'].tolist()
)
if not same_order:
    print("Warning: Variant order doesn't match between files!")
    # If order doesn't match, we'll merge on variant name
    combined_df = pd.merge(
        embeddings_df,
        hydropathy_df,
        on=['Variant', 'Label'],
        suffixes=('_embedding', '_hydropathy')
    )
else:
    combined_df = pd.DataFrame({
        'Variant': embeddings_df['Variant'],
        'Label': embeddings_df['Label'],
        'Values_embedding': embeddings_df['Values'],
        'Values_hydropathy': hydropathy_df['Values']
    })

print(f"Combined {len(combined_df)} records")

# Combine the features by concatenation
print("Combining features...")
combined_df['Combined_Features'] = combined_df.apply(
    lambda row: np.concatenate([row['Values_embedding'], row['Values_hydropathy']]),
    axis=1
)

# Create output directory if it doesn't exist
output_dir = r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/combined"
os.makedirs(output_dir, exist_ok=True)

# Save to CSV - compact version.
# Each feature vector is written as one space-separated string. Writing the
# array object itself would store NumPy's printed form, which abbreviates
# arrays above the print threshold (1000 elements by default) with "...",
# so the saved vectors would be incomplete.
compact_path = os.path.join(output_dir, "brca2_combined_features.csv")
pd.DataFrame({
    'Variant': combined_df['Variant'],
    'Label': combined_df['Label'],
    'Combined_Features': combined_df['Combined_Features'].map(
        lambda features: ' '.join(repr(value) for value in features.tolist())
    )
}).to_csv(compact_path, index=False)
print(f"Saved compact version to {compact_path}")

# Save expanded version with each feature as separate column
expanded_path = os.path.join(output_dir, "brca2_combined_features_expanded.csv")
features_list = [np.concatenate([e, h]) for e, h in zip(
    combined_df['Values_embedding'],
    combined_df['Values_hydropathy']
)]

expanded_df = pd.DataFrame(features_list)
expanded_df.insert(0, 'Variant', combined_df['Variant'])
expanded_df.insert(1, 'Label', combined_df['Label'])
expanded_df.to_csv(expanded_path, index=False)
print(f"Saved expanded version to {expanded_path}")

print("Done!")