In [2]:
import csv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import os
import pickle

file_path = 'cacna1a_wt_2597v_gpu_residue.rounded.csv'
os.chdir('Cosine Branch')
!ls

cacna1a_wt_2597v_gpu_residue.rounded.csv  remote script.py  test
mutated_rows_0.pkl			  setup.sh


In [3]:
# Function to compute cosine similarity between two vectors
def compute_cosine_similarity(vec1, vec2):
    return cosine_similarity([vec1], [vec2])[0][0]

# Function to parse the protein name and position from the row
def parse_protein_and_position(row):
    protein_name = row[-2]
    position_info = row[-1]
    amino_acid = position_info[0]  # First character is the amino acid
    position = int(position_info[1:])  # Rest of the string is the position
    return protein_name, amino_acid, position

# Function to extract the mutation status from the protein name
def extract_mutation_status(protein_name):
    if "|" in protein_name:
        status = protein_name.split('|')[-1]
        return 0 if status == 'nan' else 1
    return 0

# Function to write mutated_rows to a binary file
def write_to_file(mutated_rows, file_index):
    with open(f'mutated_rows_{file_index}.pkl', 'wb') as file:
        pickle.dump(mutated_rows, file)

def write_results_to_file(results, file_index):
    with open(f'result_chunk_{file_index}.pkl', 'wb') as file:
        pickle.dump(results, file)

# Function to read mutated_rows from a binary file
def read_from_file(file_index):
    with open(f'mutated_rows_{file_index}.pkl', 'rb') as file:
        return pickle.load(file)

# Process the CSV file line by line to avoid memory issues
def process_csv_file(file_path):
    wt_rows = []
    mutated_rows = []
    mutated_file_count = 0
    max_mutated_rows_size = 600000  # Threshold to save to a file

    # # Open the file and read line by line
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Skip the header row

        # Iterate over each row in the CSV file
        for row in reader:
            # Extract the protein name and position
            protein_name, amino_acid, position = parse_protein_and_position(row)

            # Convert the fractional columns to a numpy array (all columns except the last two)
            vector = np.array(row[:-2], dtype=float)

            # Extract mutation status
            mutation_status = extract_mutation_status(protein_name)

            # If it's a WT (wild-type) protein
            if protein_name.startswith('>WT|'):
                # Store the WT vector by position
                wt_rows.append({'AA': amino_acid, 'Pos': position, 'Vector': vector, 'Status': mutation_status})
            else:
                
                # Store the mutated row along with position and vector if it corresponds to the mutation
                match = re.search(r'>\w(\d+)(\w)\|', protein_name)
                if match:
                    mutant_name = f"{match.group(2)}{match.group(1)}"  # Extract the position and the new amino acid
                    
                    if mutant_name == row[-1]:
                        break
                        # mutated_rows.append({'AA': amino_acid, 'Pos': position, 'Vector': vector, 'Status': mutation_status})
                
    #             # Check if the length of mutated_rows exceeds the threshold
    #             if len(mutated_rows) >= max_mutated_rows_size:
    #                 # Write to a binary file and reset the mutated_rows list
    #                 write_to_file(mutated_rows, mutated_file_count)
    #                 mutated_file_count += 1
    #                 mutated_rows = []

    # # After the loop, save any remaining mutated_rows to a file
    # if mutated_rows:
    #     write_to_file(mutated_rows, mutated_file_count)
    #     mutated_file_count += 1

    # List to store results with cosine similarity
    results = []
    results_per_file = 500000
    results_file_count = 0
    result_counter = 0

    # Now process each WT row and compare it to all mutated rows
    for wt_row in wt_rows:
        wt_vector = wt_row['Vector']
        wt_position = wt_row['Pos']
        wt_amino_acid = wt_row['AA']
        wt_status = wt_row['Status']

        # Process each file of mutated rows
        for i in range(1):
            # Read mutated rows from file
            mutated_rows = read_from_file(i)

            # Compute cosine similarity for each WT row with each mutated row
            for mutated_row in mutated_rows:
                mutated_vector = mutated_row['Vector']
                mutated_amino_acid = mutated_row['AA']
                mutated_position = mutated_row['Pos']
                mutated_status = mutated_row['Status']

                # Compute cosine similarity between WT and mutated vector
                similarity = compute_cosine_similarity(wt_vector, mutated_vector)
                # Store results with WT position, similarity, WT amino acid, mutation amino acid, and status
                results.append([f"{wt_amino_acid}{wt_position}{mutated_amino_acid}", similarity, wt_status])
                result_counter += 1

                # Once we reach 500,000 results, save them to a file and reset
                if result_counter >= results_per_file:
                    write_results_to_file(results, results_file_count)
                    results_file_count += 1
                    results = []  # Clear results for the next batch
                    result_counter = 0

    # Save any remaining results after the loop
    if results:
        write_results_to_file(results, results_file_count)


    # Create a DataFrame to store the results
    result_df = pd.DataFrame(results, columns=['Mutant', 'CosineSim', 'Status'])

    return result_df

# process the embedding file
result_df = process_csv_file(file_path)

# Display the result
print(result_df)


        Mutant  CosineSim  Status
0       Q2314Y   0.088290       0
1       Q2314G   0.201110       0
2       Q2314Q   0.314634       0
3       Q2314R   0.069434       0
4       Q2314S   0.046752       0
...        ...        ...     ...
498053  C2506H   0.437661       0
498054  C2506N   0.422216       0
498055  C2506G   0.479091       0
498056  C2506S   0.703664       0
498057  C2506Y   0.771566       0

[498058 rows x 3 columns]


In [None]:
def read_from_file(file_index):
    with open(f'mutated_rows_{file_index}.pkl', 'rb') as file:
        return pickle.load(file)

In [4]:
mt_r = read_from_file(0)

[{'AA': 'S',
  'Pos': 2,
  'Vector': array([ 0.262, -0.026,  0.286, ...,  0.032,  0.141, -0.013]),
  'Status': 1},
 {'AA': 'V',
  'Pos': 2,
  'Vector': array([ 0.026,  0.104,  0.07 , ...,  0.057,  0.094, -0.154]),
  'Status': 1},
 {'AA': 'C',
  'Pos': 3,
  'Vector': array([ 0.135, -0.125,  0.127, ...,  0.093,  0.085, -0.165]),
  'Status': 1},
 {'AA': 'G',
  'Pos': 3,
  'Vector': array([ 0.1  , -0.071,  0.073, ...,  0.185,  0.119,  0.082]),
  'Status': 1},
 {'AA': 'V',
  'Pos': 5,
  'Vector': array([-0.014,  0.028,  0.108, ...,  0.023, -0.043, -0.063]),
  'Status': 0},
 {'AA': 'I',
  'Pos': 8,
  'Vector': array([0.068, 0.11 , 0.053, ..., 0.077, 0.037, 0.079]),
  'Status': 0},
 {'AA': 'L',
  'Pos': 9,
  'Vector': array([-0.182,  0.106, -0.036, ..., -0.037,  0.129,  0.072]),
  'Status': 1},
 {'AA': 'S',
  'Pos': 9,
  'Vector': array([ 0.022,  0.103,  0.201, ...,  0.059,  0.057, -0.002]),
  'Status': 0},
 {'AA': 'S',
  'Pos': 10,
  'Vector': array([ 0.114,  0.091,  0.199, ...,  0.007, -0.1

In [6]:
mt_r

[{'AA': 'S',
  'Pos': 2,
  'Vector': array([ 0.262, -0.026,  0.286, ...,  0.032,  0.141, -0.013]),
  'Status': 1},
 {'AA': 'V',
  'Pos': 2,
  'Vector': array([ 0.026,  0.104,  0.07 , ...,  0.057,  0.094, -0.154]),
  'Status': 1},
 {'AA': 'C',
  'Pos': 3,
  'Vector': array([ 0.135, -0.125,  0.127, ...,  0.093,  0.085, -0.165]),
  'Status': 1},
 {'AA': 'G',
  'Pos': 3,
  'Vector': array([ 0.1  , -0.071,  0.073, ...,  0.185,  0.119,  0.082]),
  'Status': 1},
 {'AA': 'V',
  'Pos': 5,
  'Vector': array([-0.014,  0.028,  0.108, ...,  0.023, -0.043, -0.063]),
  'Status': 0},
 {'AA': 'I',
  'Pos': 8,
  'Vector': array([0.068, 0.11 , 0.053, ..., 0.077, 0.037, 0.079]),
  'Status': 0},
 {'AA': 'L',
  'Pos': 9,
  'Vector': array([-0.182,  0.106, -0.036, ..., -0.037,  0.129,  0.072]),
  'Status': 1},
 {'AA': 'S',
  'Pos': 9,
  'Vector': array([ 0.022,  0.103,  0.201, ...,  0.059,  0.057, -0.002]),
  'Status': 0},
 {'AA': 'S',
  'Pos': 10,
  'Vector': array([ 0.114,  0.091,  0.199, ...,  0.007, -0.1