In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np

from pathlib import Path
import csv

# Base directory for the data files read and written by the cells below.
data_path = Path('data')

# One-letter aminoacid codes accepted on either side of a mutation string
# (20 entries); `modify_sequence` rejects anything outside this set.
valid_aminoacids = {
    "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
    "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
}

In [14]:
def modify_sequence(sequence, mutation):
    """
    Apply a single point mutation to a sequence.

    The mutation format is `XiY` where `X` and `Y` correspond to the original
    and the mutated aminoacid respectively, and `i` is the location of `X`
    in the original sequence (1-based indexing is expected for `i`).

    Example:
       sequence     mutation       mutated sequence
    'MASZGSADMAST'   'M1A'    ->    'AASZGSADMAST'

    Parameters
    ----------
    sequence : str
        The original sequence.
    mutation : str
        The mutation in `XiY` format.

    Returns
    -------
    str
        A copy of `sequence`, with the same length, where the residue at
        position `i` has been replaced by `Y`.

    Raises
    ------
    ValueError
        If `mutation` is empty, if `X` or `Y` is not in `valid_aminoacids`,
        if `i` is not an integer, or if `i` falls outside the sequence.

    Notes
    -----
    `X` is only validated as an aminoacid code; it is not compared against
    the residue actually present at position `i` of `sequence`.
    """
    if len(mutation) == 0:
        # Indexing an empty mutation would raise IndexError; report it with
        # the same exception type as every other malformed mutation.
        raise ValueError("Empty mutation, expected the `XiY` format")

    original, idx, variation = mutation[0], ''.join(mutation[1:-1]), mutation[-1]

    if original not in valid_aminoacids or variation not in valid_aminoacids:
        raise ValueError(f"One of {original} or {variation} is not a valid aminoacid")

    position = int(idx)  # raises ValueError when `i` is missing or not numeric
    if not 1 <= position <= len(sequence):
        raise ValueError(
            f"Position {position} is outside the sequence (length {len(sequence)})"
        )

    # Everything before the mutated residue + new residue + everything after it.
    return sequence[:position - 1] + variation + sequence[position:]


def parse_lines(fname, sequence_line=5, header_line=27):
    """
    Parser for preprocessed data from:
    ```Predicting and interpreting large scale mutagenesis data using
       analyses of protein stability and conservation```

    The file is read as whitespace-separated text. The first field of every
    data row is a mutation, which is replaced in the output by the full
    mutated sequence produced by `modify_sequence`.

    Parameters
    ----------
    fname : str or path-like
        File to parse.
    sequence_line : int, optional
        0-based index of the line holding the reference sequence, expected
        to look like `<key>: <sequence>`. Defaults to 5.
    header_line : int, optional
        0-based index of the line holding the column names; data rows are
        every line after it. Defaults to 27.

    Returns
    -------
    list of list
        The first entry is the list of column names. Each following entry is
        `[mutated_sequence, value, ...]`, where values are kept as strings
        except for 'NA', which becomes `np.nan`.

    Raises
    ------
    IndexError
        If the file is too short to contain `sequence_line` / `header_line`,
        or the sequence line has no ':' separator.

    Notes
    -----
    Rows whose mutation is rejected by `modify_sequence` with a `ValueError`
    are skipped, as are blank lines.
    """
    with open(fname) as f:
        lines = f.readlines()

    sequence = lines[sequence_line].split(':')[1].strip()

    # `split()` with no argument collapses runs of whitespace, so repeated
    # spaces cannot produce empty column names or empty values.
    clean_lines = [lines[header_line].split()]

    for row in lines[header_line + 1:]:
        fields = row.split()
        if not fields:
            # Blank / whitespace-only line (e.g. trailing newline at EOF).
            continue

        try:
            new_seq = modify_sequence(sequence, fields[0])
        except ValueError:
            # Mutation not expressible as a single valid substitution.
            continue

        values = [np.nan if val == 'NA' else val for val in fields[1:]]
        clean_lines.append([new_seq] + values)

    return clean_lines

In [15]:
# Parse a single file (PTEN phosphatase activity) on its own; `lines` holds
# the column-name row followed by the parsed data rows.
lines = parse_lines(
    data_path / 'preprocessed/prism_merged_002_PTEN_phosphatase_activity.txt'
)

def write_csv(out_path, stem, lines):
    """
    Writes the given list of lines (a list of lists) to
    a .csv with the given stem in the out directory.

    Parameters
    ----------
    out_path : pathlib.Path
        Existing directory the file is written into.
    stem : str
        File name without extension; '.csv' is appended.
    lines : iterable of iterable
        Rows to write, one output line per inner iterable.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' would emit '\r\r\n' row terminators.
    with open(out_path / (stem + '.csv'), "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerows(lines)
    
def process_to_csv(to_process_path, out_dir = 'Protera'):
    """
    Processes the preprocessed data (.txt) from:
    ```Predicting and interpreting large scale mutagenesis data using
       analyses of protein stability and conservation```

    Every regular file directly inside `to_process_path` is parsed with
    `parse_lines` and written with `write_csv` as `<stem>.csv`.

    Parameters
    ----------
    to_process_path : pathlib.Path
        Directory containing the files to convert.
    out_dir : str, optional
        Name of the output directory, created next to `to_process_path`
        (i.e. inside its parent) if it does not exist yet.
    """
    out_path = to_process_path.parent / out_dir
    out_path.mkdir(exist_ok=True)

    # Keep regular files only: sub-directories (e.g. notebook checkpoint
    # folders) cannot be parsed. Sorting makes the processing order
    # deterministic, and a sized list lets tqdm display the total.
    files = sorted(entry for entry in to_process_path.iterdir() if entry.is_file())

    for file in tqdm(files):
        write_csv(out_path, file.stem, parse_lines(file))

In [16]:
# Convert everything under data/preprocessed; with the default `out_dir`
# the CSVs are written to data/Protera.
process_to_csv(to_process_path=data_path / "preprocessed")

39it [00:02, 16.90it/s]
