In [1]:
import csv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import os
import pickle

# Input CSV read by Wild_Type_Vectors().
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# without editing it; consider deriving it from a configurable data directory.
file_path = '/Users/sakshmenon/Downloads/cacna1a_wt_2597v_gpu_residue.rounded.csv'
# Number of wild-type rows Wild_Type_Vectors() collects before it returns.
MAX_WT_LEN = 2506

# os.chdir('Cosine Branch')
!ls

Cosine_Index_All.ipynb         Local_Comp.ipynb
Cosine_Scores_Saksh_Menon.csv  Window_Vectors_Saksh_Menon.csv
Cosine_index_corr.ipynb        cosine_pipeline.ipynb


In [2]:
def parse_protein_and_position(row):
    """Pull the protein name, residue letter and position out of a CSV row.

    Only the two trailing fields of ``row`` are inspected: the second-to-last
    is returned untouched as the protein name, and the last is read as a
    single residue character followed by an integer position (e.g. ``'M1'``).

    Args:
        row: Sequence of string fields for one CSV record.

    Returns:
        tuple: ``(protein_name, amino_acid, position)`` with ``position`` as int.
    """
    protein_name, residue_token = row[-2], row[-1]
    # Leading character is the residue; everything after it is the position.
    residue_letter = residue_token[0]
    residue_index = int(residue_token[1:])
    return protein_name, residue_letter, residue_index

# Function to extract the mutation status from the protein name
def extract_mutation_status(protein_name):
    """Derive a 0/1 status flag from the text after the last ``|`` in the name.

    Args:
        protein_name: Protein identifier string, optionally containing ``|``.

    Returns:
        int: 0 when the name has no ``|`` or its final ``|``-separated piece is
        exactly ``'nan'``; 1 for any other final piece.
    """
    # Names without a separator carry no status field at all.
    if "|" not in protein_name:
        return 0
    trailing_field = protein_name.split('|')[-1]
    return int(trailing_field != 'nan')

def compute_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors as a scalar.

    Args:
        vec1: First vector (sequence or 1-D array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The single entry of the 1x1 matrix produced by sklearn's
        ``cosine_similarity``.
    """
    # cosine_similarity operates on 2-D batches, so each vector is wrapped as
    # a one-row batch and the lone cell of the result is extracted.
    similarity_matrix = cosine_similarity([vec1], [vec2])
    return similarity_matrix[0][0]


In [3]:
def Wild_Type_Vectors():
    """Collect the first ``MAX_WT_LEN`` wild-type rows of the CSV at ``file_path``.

    The header line is skipped, then rows are read in file order. Every row
    whose protein name starts with ``'>WT|'`` contributes one record holding
    its residue letter, position, numeric vector (all fields except the last
    two) and mutation status. Rows that are not wild-type are ignored.

    Returns:
        tuple: ``(reader, wt_df)``.
            * ``reader`` is a fresh ``csv.reader`` over the same, still-open
              file handle, so iterating it yields the rows that follow the
              last collected wild-type row.
            * ``wt_df`` is a DataFrame with columns ``AA``, ``Pos``,
              ``Vector`` and ``Status``, one row per collected record.

    Raises:
        ValueError: If the file ends before ``MAX_WT_LEN`` wild-type rows were
            found. Previously the function fell off the end and returned
            ``None``, which surfaced as an unrelated unpacking error in the
            caller.
    """
    wt_rows = []

    # The handle is deliberately not managed by `with`: on success it has to
    # stay open because the returned reader keeps streaming from it.
    # newline='' is what the csv module asks for on files it reads.
    file = open(file_path, 'r', newline='')
    try:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (an empty file is tolerated)

        for row in reader:
            protein_name, amino_acid, position = parse_protein_and_position(row)

            if not protein_name.startswith('>WT|'):
                continue

            # Only wild-type rows are kept, so the float conversion is done
            # here rather than for every row.
            wt_rows.append({
                'AA': amino_acid,
                'Pos': position,
                'Vector': np.array(row[:-2], dtype=float),
                'Status': extract_mutation_status(protein_name),
            })
            if len(wt_rows) == MAX_WT_LEN:
                return csv.reader(file), pd.DataFrame(wt_rows)
    except BaseException:
        # Nothing will consume the handle after a failure; release it.
        file.close()
        raise

    file.close()
    raise ValueError(
        "Expected " + str(MAX_WT_LEN) + " wild-type rows in " + str(file_path)
        + " but found only " + str(len(wt_rows))
    )

def Mutated_Type_Comp(reader, wt_df):
    """Write the cosine similarity of each mutant residue to its wild-type residue.

    Every remaining row of ``reader`` is parsed. Rows whose protein name
    contains a mutation tag of the form ``>X123Y|`` are compared with the
    wild-type record for the same position, and one line per comparison is
    written to ``Cosine_Scores_Saksh_Menon.csv`` in the current working
    directory (the file is overwritten).

    Rows without a mutation tag, and rows whose position has no wild-type
    record, are skipped. The latter used to raise ``KeyError`` because the
    old ``wt_row.empty`` guard was only reached after the lookup succeeded.

    Args:
        reader: Iterable of CSV rows (lists of strings) whose last two fields
            are the protein name and the residue/position token.
        wt_df: DataFrame with ``AA`` and ``Vector`` columns in which index
            label ``p - 1`` holds the wild-type record for position ``p``.

    Returns:
        None. The CSV file is the only output.
    """
    mutation_tag = re.compile(r'>\w\d+\w\|')

    # One-off lookup table so the hot loop does not build a pandas Series
    # (DataFrame.loc) for each of the many mutant rows.
    wt_lookup = {
        label: (wt_aa, wt_vec)
        for label, wt_aa, wt_vec in zip(wt_df.index, wt_df['AA'], wt_df['Vector'])
    }

    with open('Cosine_Scores_Saksh_Menon.csv', 'w') as csv_file:
        csv_file.write('AA-Position-AA,Cosine Score,Label,Origin Mutation\n')
        for row in reader:
            protein_name, amino_acid, position = parse_protein_and_position(row)

            tag = mutation_tag.search(protein_name)
            if tag is None:
                continue
            # Drop the leading '>' and trailing '|' to get e.g. 'A12V'.
            origin = tag.group(0)[1:-1]

            wt_entry = wt_lookup.get(position - 1)
            if wt_entry is None:
                continue
            wt_amino_acid, wt_vector = wt_entry

            # All fields except the last two form the numeric vector.
            mutated_vector = np.array(row[:-2], dtype=float)
            status = extract_mutation_status(protein_name)
            similarity = compute_cosine_similarity(wt_vector, mutated_vector)

            csv_file.write(
                str(wt_amino_acid) + str(position) + str(amino_acid) + ","
                + str(similarity) + "," + str(status) + "," + str(origin) + "\n"
            )


In [4]:
# Collect the wild-type records; `reader` continues over the same open file
# and yields the rows that come after them.
reader, wt_df = Wild_Type_Vectors()
# Mutated_Type_Comp has no return statement, so `res` is always None — its
# output is the Cosine_Scores_Saksh_Menon.csv file it writes.
res = Mutated_Type_Comp(reader, wt_df)

In [93]:
# Load the cosine scores used to build the window vectors.
# NOTE(review): Mutated_Type_Comp writes 'Cosine_Scores_Saksh_Menon.csv' relative to
# the working directory, while this reads an absolute Desktop path — confirm both
# refer to the same, up-to-date file.
cosine_df = pd.read_csv("/Users/sakshmenon/Desktop/PLM CSV/Cosine_Scores_Saksh_Menon.csv")

In [94]:
import numpy as np
import pandas as pd



def generate_flank_vectors(df, window_size, flank=3, output_path=None):
    """
    Build a zero-padded window of scores around every row of ``df`` and write it to CSV.

    For the row at ordinal position ``p`` the window covers
    ``flank + window_size // 2`` values before ``p``, the value at ``p`` and
    ``flank + (window_size - 1) // 2`` values after it, i.e. exactly
    ``2 * flank + window_size`` values. For odd ``window_size`` the window is
    symmetric; for even ``window_size`` the extra value sits on the left.
    (Previously an even ``window_size`` produced one value more than the
    header declared and left some edge rows unpadded.) Slots that fall
    outside the sequence are filled with 0.

    Each output line holds the window values, then the row's ``Label``,
    ``AA-Position-AA`` and ``Origin Mutation`` fields, separated by ``", "``.

    Args:
        df (pd.DataFrame): Frame whose second column (``df.iloc[:, 1]``) holds
            the score sequence and which has ``Label``, ``AA-Position-AA`` and
            ``Origin Mutation`` columns. Rows are addressed by their ordinal
            position, so a non-default index is fine.
        window_size (int): Size of the central window around each position.
        flank (int): Number of extra values on each side of the window (default=3).
        output_path (str | None): Destination CSV. When None, the original
            hard-coded location under ``/Users/sakshmenon/Desktop/PLM CSV/``
            (suffixed with ``window_size``) is used.

    Returns:
        list: One numpy array of length ``2 * flank + window_size`` per row of ``df``.
    """
    if output_path is None:
        output_path = '/Users/sakshmenon/Desktop/PLM CSV/Window_Vectors_Saksh_Menon_' + str(window_size) + '_.csv'

    total_size = 2 * flank + window_size
    reach_left = flank + window_size // 2
    reach_right = flank + (window_size - 1) // 2

    # The score column never changes inside the loop, so extract it once.
    scores = df.iloc[:, 1].values
    n_scores = len(scores)

    flanked_vectors = []
    with open(output_path, 'w') as window_csv:
        header = ''.join('v' + str(i + 1) + ',' for i in range(total_size))
        window_csv.write(header + 'Label,AA-Position-AA,Origin Mutation\n')

        # enumerate() supplies the ordinal position; the index label is not
        # used as an offset because it need not be 0..n-1.
        for position, (_, row) in enumerate(df.iterrows()):
            lo = position - reach_left          # first wanted slot (may be < 0)
            hi = position + reach_right + 1     # one past the last wanted slot
            start = max(0, lo)
            end = min(n_scores, hi)

            flank_vector = scores[start:end]

            # Whatever was clipped at either edge is restored as zero padding.
            pad_left = start - lo
            pad_right = hi - end
            if pad_left or pad_right:
                flank_vector = np.pad(flank_vector, (pad_left, pad_right), 'constant', constant_values=0)

            flanked_vectors.append(flank_vector)

            fields = np.array(flank_vector).tolist()
            fields.append(row['Label'])

            # str(list)[1:-1] yields the ", "-separated values without brackets.
            csv_line = str(fields)[1:-1]
            csv_line += ", " + row['AA-Position-AA'] + ", " + row['Origin Mutation'] + "\n"
            window_csv.write(csv_line)

    return flanked_vectors

# Output the result
# for vec in flank_vectors:
#     print(vec)
# flank_vectors[0]

In [98]:
# Window parameters: each output row holds 2 * flank_size + window_size = 13 values.
window_size = 7
flank_size = 3
# Writes the window CSV as a side effect and keeps the vectors in memory.
flank_vectors = generate_flank_vectors(cosine_df, window_size, flank_size)