In [25]:
import csv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import os
import pickle

# CSV file that is passed to process_csv_file() further down; the path is
# relative, so it is resolved against the kernel's current working directory.
file_path = 'cacna1a_wt_2597v_gpu_residue.rounded.csv'
# os.chdir('Cosine Branch')
!ls

3200.53s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


cacna1a_wt_2597v_gpu_residue.rounded.csv  setup.sh  test


In [27]:
# Function to compute cosine similarity between two vectors
def compute_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors as a single scalar.

    Each vector is wrapped in a one-element list so that it is handed to
    ``cosine_similarity`` as a one-row batch; the call then yields a 1x1
    result whose only entry is returned.

    Args:
        vec1: First vector (sequence or array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The scalar at position ``[0][0]`` of the pairwise similarity result.
    """
    first_batch = [vec1]
    second_batch = [vec2]
    pairwise = cosine_similarity(first_batch, second_batch)
    return pairwise[0][0]

# Function to parse the protein name and position from the row
def parse_protein_and_position(row):
    """Split the two trailing fields of a row into name, residue letter and position.

    Args:
        row: Sequence of strings. The second-to-last item is taken as the
            protein name; the last item is a residue label made of one
            character followed by an integer, e.g. ``"A2"``.

    Returns:
        Tuple ``(protein_name, amino_acid, position)`` where ``amino_acid`` is
        the first character of the label and ``position`` is the remainder
        converted to ``int``.

    Raises:
        IndexError: If ``row`` has fewer than two items or the label is empty.
        ValueError: If the text after the first character is not an integer.
    """
    name_field, residue_label = row[-2], row[-1]
    residue_letter = residue_label[0]
    residue_number = int(residue_label[1:])
    return name_field, residue_letter, residue_number

# Function to extract the mutation status from the protein name
def extract_mutation_status(protein_name):
    """Derive a 0/1 status flag from the last ``|``-separated field of a name.

    Args:
        protein_name: Name string, optionally containing ``|`` separators.

    Returns:
        ``0`` when the name has no ``|`` or when its last field is exactly
        ``'nan'``; ``1`` for any other last field.
    """
    # Names without a separator carry no status field at all.
    if "|" not in protein_name:
        return 0

    *_, trailing_field = protein_name.split('|')
    return 1 if trailing_field != 'nan' else 0

# Function to write mutated_rows to a binary file
def write_to_file(mutated_rows, file_index):
    """Pickle ``mutated_rows`` into ``mutated_rows_<file_index>.pkl``.

    The file name is relative, so the file is created in the current working
    directory; opening in ``'wb'`` mode overwrites any file of the same name.

    Args:
        mutated_rows: Object to serialise (any picklable value).
        file_index: Value interpolated into the file name.
    """
    target_name = f'mutated_rows_{file_index}.pkl'
    with open(target_name, 'wb') as sink:
        pickle.dump(mutated_rows, sink)

# Function to read mutated_rows from a binary file
def read_from_file(file_index):
    """Load and return the object pickled in ``mutated_rows_<file_index>.pkl``.

    The file name is relative and therefore resolved against the current
    working directory.

    Args:
        file_index: Value interpolated into the file name.

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        FileNotFoundError: If no file exists for ``file_index``.
    """
    source_name = f'mutated_rows_{file_index}.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself.
    with open(source_name, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

# Process the CSV file line by line to avoid memory issues
def process_csv_file(file_path):
    """Compute the cosine similarity of each mutated residue vector to its WT vector.

    The CSV is streamed row by row. For every data row the last two columns
    are the protein name and a residue label (letter + position); all other
    columns are parsed as floats and form the row's vector.

    * Rows whose protein name starts with ``'>WT|'`` are kept in memory,
      indexed by position (the first row seen for a position wins).
    * For other rows, the name is searched for ``>X<pos><Y>|``. The row is
      kept only when ``<Y><pos>`` equals the row's residue label, i.e. the
      row sits at the mutated position of that protein.
    * Kept mutated rows are spilled to ``mutated_rows_<n>.pkl`` files in the
      working directory whenever the buffer reaches 600000 entries, and then
      read back chunk by chunk for the comparison. These files are left on
      disk after the function returns.

    Args:
        file_path: Path of the CSV file; its first line is treated as a
            header and skipped.

    Returns:
        ``pandas.DataFrame`` with columns ``Mutant`` (WT letter, position,
        mutated letter), ``CosineSim`` and ``Status``; one row per kept
        mutated row that has a WT row at the same position.

    Raises:
        ValueError: If a vector column is not numeric or a residue label has
            no integer position.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # WT rows indexed by position so the comparison phase is a dict lookup
    # instead of a scan over every WT row for every mutated row.
    wt_by_position = {}
    mutated_rows = []
    mutated_file_count = 0
    max_mutated_rows_size = 600000  # Threshold to save to a file

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for a blank line (for example a
            # trailing one); there is nothing to parse in it.
            if not row:
                continue

            # Extract the protein name and position
            protein_name, amino_acid, position = parse_protein_and_position(row)

            # Convert the fractional columns to a numpy array (all columns except the last two)
            vector = np.array(row[:-2], dtype=float)

            # Extract mutation status
            mutation_status = extract_mutation_status(protein_name)

            record = {'AA': amino_acid, 'Pos': position, 'Vector': vector, 'Status': mutation_status}

            if protein_name.startswith('>WT|'):
                # Keep only the first WT row per position, which is the row a
                # front-to-back scan of all WT rows would have returned.
                wt_by_position.setdefault(position, record)
                continue

            # group(1) is the mutated position, group(2) the new amino acid.
            match = re.search(r'>\w(\d+)(\w)\|', protein_name)
            if match:
                mutant_name = f"{match.group(2)}{match.group(1)}"

                # Keep the row only if it is the residue at the mutated position.
                if mutant_name == row[-1]:
                    mutated_rows.append(record)

            # Spill the buffer to disk once it reaches the threshold.
            if len(mutated_rows) >= max_mutated_rows_size:
                write_to_file(mutated_rows, mutated_file_count)
                mutated_file_count += 1
                mutated_rows = []

    # After the loop, save any remaining mutated_rows to a file
    if mutated_rows:
        write_to_file(mutated_rows, mutated_file_count)
        mutated_file_count += 1

    # List to store results with cosine similarity
    results = []

    # Now process each file of mutated rows
    for i in range(mutated_file_count):
        for mutated_row in read_from_file(i):
            position = mutated_row['Pos']

            # Mutated rows without a WT row at the same position are dropped.
            wt_row = wt_by_position.get(position)
            if wt_row is None:
                continue

            similarity = compute_cosine_similarity(wt_row['Vector'], mutated_row['Vector'])
            # Label is <WT amino acid><position><mutated amino acid>.
            results.append([
                f"{wt_row['AA']}{position}{mutated_row['AA']}",
                similarity,
                mutated_row['Status'],
            ])

    # Create a DataFrame to store the results
    result_df = pd.DataFrame(results, columns=['Mutant', 'CosineSim', 'Status'])

    return result_df

# Process the embedding file: process_csv_file() returns a DataFrame with the
# columns Mutant, CosineSim and Status.
result_df = process_csv_file(file_path)

# Display the result (pandas abbreviates long frames with "..." when printed)
print(result_df)


      Mutant  CosineSim  Status
0        A2S   0.830556       1
1        A2V   0.791399       1
2        R3C   0.625089       1
3        R3G   0.705138       1
4        G5V   0.630909       0
...      ...        ...     ...
2588  D2503H   0.629078       0
2589  D2503N   0.674396       0
2590  D2504G   0.743554       0
2591  C2506S   0.703664       0
2592  C2506Y   0.771566       0

[2593 rows x 3 columns]
