<a href="https://colab.research.google.com/github/rkarthikeyan06/scripps_internship_alab/blob/main/Issue_90_Test_Copy_Rithika.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Condensed Code

In [44]:
import pandas as pd
import re

gene_mapping = {
    "orf1a": "ORF1a",
    "orf1b": "ORF1b",
    "s": "S",
    "orf3a": "ORF3a",
    "e": "E",
    "m": "M",
    "orf6": "ORF6",
    "orf7a": "ORF7a",
    "orf7b": "ORF7b",
    "orf8": "ORF8",
    "n": "N",
    "orf10": "ORF10",
}

simulated_ES_data = pd.read_csv('scripps.csv.gz')

def lineage_mutations_handler_sim_modified(query_lineages, frequency, gene=None):
    if gene:
        genes = gene.lower().split(",")
    else:
        genes = []
    rows = []
    columns = set()
    mutation_info = {}

    for query_lineage in query_lineages:
        flattened_response = simulated_ES_data[simulated_ES_data['lineage'] == query_lineage]
        if len(flattened_response) > 0:
            df_response = flattened_response.assign(
                gene=lambda x: x["mutation"].apply(
                    lambda k: gene_mapping[k.split(":")[0]]
                    if k.split(":")[0] in gene_mapping
                    else k.split(":")[0]
                ),
                ref_aa=lambda x: x["mutation"]
                .apply(
                    lambda k: re.findall("[A-Za-z*]+", k.split(":")[1])[0]
                    if "DEL" not in k and "del" not in k and "_" not in k
                    else k
                )
                .str.upper(),
                alt_aa=lambda x: x["mutation"]
                .apply(
                    lambda k: re.findall("[A-Za-z*]+", k.split(":")[1])[1]
                    if "DEL" not in k and "del" not in k and "_" not in k
                    else k.split(":")[1]
                )
                .str.upper(),
                codon_num=lambda x: x["mutation"].apply(
                    lambda k: int(re.findall("[0-9]+", k.split(":")[1])[0])
                ),
                codon_end=lambda x: x["mutation"].apply(
                    lambda k: int(re.findall("[0-9]+", k.split(":")[1])[1])
                    if "/" in k and ("DEL" in k or "del" in k)
                    else None
                ),
                type=lambda x: x["mutation"].apply(
                    lambda k: "deletion" if "DEL" in k or "del" in k else "substitution"
                ),
            )
            df_response = df_response[df_response["ref_aa"] != df_response["alt_aa"]]
            df_response.loc[:, "prevalence"] = (
                df_response["mutation_count"] / df_response["lineage_count"]
            )
            df_response.loc[~df_response["codon_end"].isna(), "change_length_nt"] = (
                (df_response["codon_end"] - df_response["codon_num"]) + 1
            ) * 3
            df_response = df_response[df_response["prevalence"] >= frequency].fillna("None")
            if genes:
                df_response = df_response[df_response["gene"].str.lower().isin(genes)]

            for _, row in df_response.iterrows():
                lineage = row["lineage"]
                mutation = row["mutation"]
                mutation_count = row["mutation_count"]
                columns.add(mutation)
                if lineage not in mutation_info:
                    mutation_info[lineage] = {}
                mutation_info[lineage][mutation] = {
                    "ref_aa": row["ref_aa"],
                    "alt_aa": row["alt_aa"],
                    "codon_num": row["codon_num"],
                    "codon_end": row["codon_end"],
                    "type": row["type"],
                    "prevalence": row["prevalence"],
                    "change_length_nt": row["change_length_nt"],
                }

    column_names = ["lineage"] + list(columns)
    df_restructured = pd.DataFrame(rows, columns=column_names).fillna(0)
    resp = {"success": True, "results": df_restructured.to_dict(orient="records"), "mutation_info": mutation_info}
    return resp

#filter 75%
query_lineages = ['XBB.15', 'XBB.1.16', 'XBB.1.22.2']
frequency = 0.75
gene = None
result = lineage_mutations_handler_sim_modified(query_lineages, frequency, gene)
print(result)


{'success': True, 'results': [], 'mutation_info': {'XBB.1.16': {'E:T11A': {'ref_aa': 'T', 'alt_aa': 'A', 'codon_num': 11, 'codon_end': 'None', 'type': 'substitution', 'prevalence': 0.9992396079756675, 'change_length_nt': 'None'}, 'E:T9I': {'ref_aa': 'T', 'alt_aa': 'I', 'codon_num': 9, 'codon_end': 'None', 'type': 'substitution', 'prevalence': 0.9979722879351132, 'change_length_nt': 'None'}, 'M:A63T': {'ref_aa': 'A', 'alt_aa': 'T', 'codon_num': 63, 'codon_end': 'None', 'type': 'substitution', 'prevalence': 0.9741466711726935, 'change_length_nt': 'None'}, 'M:Q19E': {'ref_aa': 'Q', 'alt_aa': 'E', 'codon_num': 19, 'codon_end': 'None', 'type': 'substitution', 'prevalence': 0.933338965866847, 'change_length_nt': 'None'}, 'N:DEL31/33': {'ref_aa': 'N:DEL31/33', 'alt_aa': 'DEL31/33', 'codon_num': 31, 'codon_end': 33.0, 'type': 'deletion', 'prevalence': 0.7653768164920581, 'change_length_nt': 9.0}, 'N:G204R': {'ref_aa': 'G', 'alt_aa': 'R', 'codon_num': 204, 'codon_end': 'None', 'type': 'substitu

In [46]:
import pandas as pd

def lineage_mutations_handler_sim_modified(query_lineages, frequency, gene=None):
    simulated_ES_data = pd.read_csv('scripps.csv.gz')

    filtered_data = simulated_ES_data[simulated_ES_data['lineage'].isin(query_lineages)].copy()
    filtered_data = filtered_data[filtered_data['mutation_count'] / filtered_data['lineage_count'] >= frequency]

    if gene:
        genes = gene.lower().split(",")
        filtered_data = filtered_data[filtered_data['gene'].str.lower().isin(genes)]

    result = {"success": True, "results": []}

    return pd.DataFrame([{"lineage": lineage, **{row["mutation"]: row["mutation_count"] for _, row in filtered_data[filtered_data['lineage'] == lineage].iterrows()}} for lineage in query_lineages if lineage is not None]).set_index("lineage")


query_lineages = ['XBB.1.15', 'XBB.1.16', 'XBB.1.22.2']
frequency = 0.75
gene = None
result = lineage_mutations_handler_sim_modified(query_lineages, frequency, gene)
print(result)


            E:T11A  E:T9I  M:A63T  M:F112F  M:Q19E  N:DEL31/33  N:G204R  \
lineage                                                                   
XBB.1.15      2685   2692    2673     2668    2603        2618     2688   
XBB.1.16     11827  11812   11530    11519   11047        9059    11681   
XBB.1.22.2     379    379     379      375     371         373      376   

            N:G30G  N:P13L  N:R203K  ...  ORF1a:L3829F  ORF1b:D1746Y  \
lineage                              ...                               
XBB.1.15      7857    2676     5368  ...           NaN           NaN   
XBB.1.16     27271   11585    23489  ...       11727.0       11741.0   
XBB.1.22.2    1119     372      752  ...         376.0           NaN   

            ORF1b:V463V  S:E180V  S:F486P  S:T478R  ORF1a:R4017R  ORF1b:H590H  \
lineage                                                                         
XBB.1.15            NaN      NaN      NaN      NaN           NaN          NaN   
XBB.1.16        1170