In [1]:
### This notebook create a pandas dataframe for mutational consequences
import pandas as pd
import os
from os.path import join

In [2]:
def get_detail_mutation_df(BE, categ, data_path='../../data', sg_finder_path='../../data/sg_Finder'):
    """
    Extracts and groups mutation details based on a specified base editor (BE) and mutation category.

    Parameters:
    - BE (str): Base editor name (e.g., 'ABE', 'CBE').
    - categ (str): Mutation category to filter (e.g., 'synonymous', 'missense').
    - data_path (str): Path to the mutation data CSV files.
    - sg_finder_path (str): Path to the sgRNA Finder output.

    Returns:
    - pd.DataFrame: Grouped mutation details per sgRNA.
    """
    be = BE.lower()
    
    # Load mutation data
    df = pd.read_csv(join(data_path, f'complete_{BE}_df.csv'))
    
    # Filter based on mutation category
    df_filtered = df[df['categ'].isin(categ)]

    # Process sgRNAs (remove leading 'g' if present)
    df_filtered['sgRNA'] = df_filtered['sgRNA'].apply(lambda x: x[1:] if x.startswith('g') else x)
    
    # Initialize list to store data
    list_consequences = []
    # Iterate over genes and sgRNAs
    for gene, sgRNA in zip(df_filtered['gene'], df_filtered['sgRNA']):
        file_path = join(sg_finder_path, gene, 'ess_15', f'df_{be}_detail.csv')
        
        if not os.path.exists(file_path):
            print(f"Warning: File not found for gene {gene}, sgRNA {sgRNA}")
            continue  # Skip missing files
        
        # Load detailed mutation data
        mut_detail = pd.read_csv(file_path)
        mut_detail_sg = mut_detail[mut_detail['sgRNA'] == sgRNA]
        
        if not mut_detail_sg.empty:
            list_consequences.append(mut_detail_sg)

    # Concatenate all mutation data
    if not list_consequences:
        print("No valid mutation details found.")
        return pd.DataFrame()  # Return empty DataFrame if no valid data

    df_combined = pd.concat(list_consequences, axis=0)
    return(df_combined)

In [3]:
def save_group_detail(df_combined, data_path = '../../data', BE = None, categ = None):
    df_grouped = df_combined.groupby('sgRNA').agg({
    'gene': 'first',  # Assume gene is unique per sgRNA
    'transcript': 'first',
    'chrom': 'first',
    'edit_genome_pos': list,  # Store all edit positions
    'Synonymous': list,
    'Codon_Change': list,
    'AA_change': list,
    'AA_pos': list
}).reset_index()
    df_grouped.to_csv(join(data_path, f'{BE}_{categ}_detail.csv'))
    print(f'File saved to {BE}_{categ}_detail.csv')
    
def get_sg_detail(df_combined, 
                  BE = None, categ = None):
    df_grouped = df_combined.groupby('sgRNA').agg({
    'gene': 'first',  # Assume gene is unique per sgRNA
    'transcript': 'first',    
    'chrom': 'first',
    'edit_genome_pos': list,  # Store all edit positions
    'Synonymous': list,
    'sg_strand':'first',
    'gene_strand':'first'}).reset_index()
    return(df_grouped)

In [9]:
categ_ = ['SYN','MIS']
BE = 'ABE'

df_combined_ = get_detail_mutation_df(BE,categ_)
# save_group_detail(df_combined_, BE='ABE', categ = 'MIS')

df_grouped = get_sg_detail(df_combined_, BE = 'ABE', categ = categ_)
df = pd.read_csv(join(f'../../data/complete_{BE}_df.csv'))

# Filter based on mutation category
df_filtered = df[df['categ'].isin(categ_)]
df_filtered['sgRNA_'] = df_filtered['sgRNA'].apply(lambda x: x[1:] if x.startswith('g') else x)

crispresso_path = '../../data/sequencing/singleSg'
df_ = pd.concat([df_filtered[['name', 'sgRNA_','sgRNA','categ']].set_index('sgRNA_'), df_grouped.set_index('sgRNA')], axis = 1).reset_index()
df_[['sgRNA', 'gene', 'categ','name', 'edit_genome_pos', 'chrom', 'gene_strand','sg_strand']].reset_index().to_csv(join(crispresso_path, f'{BE}_detail.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":


In [10]:
categ_ = ['SYN','MIS']
BE = 'CBE'

df_combined_ = get_detail_mutation_df(BE,categ_)
# save_group_detail(df_combined_, BE='ABE', categ = 'MIS')

df_grouped = get_sg_detail(df_combined_, BE = BE, categ = categ_)
df = pd.read_csv(join(f'../../data/complete_{BE}_df.csv'))

# Filter based on mutation category
df_filtered = df[df['categ'].isin(categ_)]
df_filtered['sgRNA_'] = df_filtered['sgRNA'].apply(lambda x: x[1:] if x.startswith('g') else x)

crispresso_path = '../../data/sequencing/singleSg'
df_ = pd.concat([df_filtered[['name', 'sgRNA_','sgRNA','categ']].set_index('sgRNA_'), df_grouped.set_index('sgRNA')], axis = 1).reset_index()
df_[['sgRNA', 'gene', 'categ','name', 'edit_genome_pos', 'chrom', 'gene_strand','sg_strand']].reset_index().to_csv(join(crispresso_path, f'{BE}_detail.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":
