In [1]:
import pandas as pd
import numpy as np

In [2]:
def generate_deltaprot_designs_data():

    no_disulfide_df =pd.read_csv("/home/tadas/code/single_chain_dp_bristol/selected_deltaprots/no_disulfide/no_disulfide_selected_deltaprots.csv")
    variable_linkers_df = pd.read_csv("/home/tadas/code/single_chain_dp_bristol/selected_deltaprots/variable_linkers/variable_linkers/variable_linkers_selected_deltaprots.csv")

    no_disulfide_df["name"]="no_disulfide_"+no_disulfide_df["orientation_code"]
    variable_linkers_df["name"]="variable_linkers_"+variable_linkers_df["orientation_code"]

    # merge the two piplene dataframes 
    df = pd.concat([no_disulfide_df, variable_linkers_df], axis=0)

    well_df = pd.read_csv("/home/tadas/code/single_chain_dp_bristol/order_optimised_codons_96_wp.csv")

    # merge well_df with df by Name and sort by Well Position
    df = pd.merge(well_df, df, how="left", left_on="Name", right_on="name")

    # MGSSHHHHHHSSGENLYFQSGS addition adds some mw
    df["mass_w_prefix"] = df["mass"] + 2460.54
    df.to_csv("/home/tadas/code/single_chain_dp_bristol/deltaprot_designs_data.csv", index=False)

def load_deltaprot_designs_data():
    return pd.read_csv("/home/tadas/code/single_chain_dp_bristol/deltaprot_designs_data.csv")

In [3]:
# generate_deltaprot_designs_data()
df = load_deltaprot_designs_data()


In [21]:

# populate received_from_idt column. Received everything except for C2,C4,D5,D6,F2,F3,F6
from networkx import volume


not_received = ['C2', 'C4', 'D5', 'D6', 'F2', 'F3', 'F6']
df['received_from_idt'] = ~df["Well Position"].isin(not_received)

# populate transformation_attempted for (A1-A12,B1-B12,C1,C3,F4,C5,C6)
transformation_attempted_ids = set([f"A{i}" for i in range(1, 13)] +
                    [f"B{i}" for i in range(1, 13)] +
                    ['C1', 'C3', 'F4', 'C5', 'C6'])
df['transformation_attempted'] = df["Well Position"].isin(transformation_attempted_ids)
df['transformation_successful'] = df["Well Position"].apply(
    lambda x: (False if x == 'C6' else True) if x in transformation_attempted_ids else np.nan
)

# populate tranformation_successful for transformation_attempted only (otherwise none), but false for C6
df['transformation_successful'] = df["Well Position"].apply(
    lambda x: (False if x == 'C6' else True) if x in transformation_attempted_ids else np.nan
)

# populate expression_levels: (None, "low","medium","high") low: A1,A3,A6,A7,A8,A10,B1. Medium: A5,A11,B2,B3. High: A9,A12
expr_map = {
    **dict.fromkeys(['A1','A3', 'A6', 'A8', 'B1','B4', 'B10','C3'], 'low'),
    **dict.fromkeys(['A7', 'A10','B12','C1','F4'], 'medium'),
    **dict.fromkeys(['A11', 'B2', 'B3','A5'], 'high'),
    **dict.fromkeys(['A9', 'A12', 'B8'], 'super_high')
}
df['expression_levels'] = df["Well Position"].map(expr_map)

# populate sds_page_size_appearance for every visible band?
# TODO

# populate large_scale_expressed for A12,A9,B8,A1,B12,F4
scaled_expressed_ids = ['A12','A9','B8','A1','B12','F4']
df['large_scale_expressed'] = df["Well Position"].isin(scaled_expressed_ids)

# populate scaled_purified for A12,A9,B8 TODO add ,'A1','B12','F4'
# scaled_purified_ids = ['A12','A9','B8'] \

A280_after_NI_NTA_IMAC = {
    "A12": 13.41,
    "A9": 6.2,
    "B8": 16.11
}
volume_after_NI_NTA_IMAC = 3.0

df['A280_after_NI_NTA_IMAC'] = df["Well Position"].map(A280_after_NI_NTA_IMAC)
# calculate molar conc asuming sample is entirely my protein df.sequence_molar_extinction_280

df['molar_conc_after_NI_NTA_IMAC'] = df['A280_after_NI_NTA_IMAC'] / df['sequence_molar_extinction_280']
df['mg_per_ml_after_NI_NTA_IMAC'] = df['molar_conc_after_NI_NTA_IMAC'] * df['mass_w_prefix']









In [22]:
# save df to csv
df.to_csv("/home/tadas/code/single_chain_dp_bristol/experimental_results/deltaprot_designs_data_with_results.csv", index=False)

In [6]:
df[["Well Position","Name","mass","mass_w_prefix","charge"]]

Unnamed: 0,Well Position,Name,mass,mass_w_prefix,charge
0,A1,no_disulfide_b3iii,7218.91718,9679.45718,-7.195539
1,A2,no_disulfide_b3nnn,6293.89118,8754.43118,-3.196935
2,A3,no_disulfide_b4iiiix,9399.48898,11860.02898,-8.306272
3,A4,no_disulfide_b4iiiiy,7503.21198,9963.75198,-7.429897
4,A5,no_disulfide_b4iiin,6905.64008,9366.18008,-9.196622
5,A6,no_disulfide_b4inin,9720.64698,12181.18698,-20.303951
6,A7,no_disulfide_b4innn,8302.30618,10762.84618,-8.256534
7,A8,no_disulfide_b4nnnnx,8860.65328,11321.19328,-12.148349
8,A9,no_disulfide_b4nnnny,8651.77758,11112.31758,-7.202815
9,A10,no_disulfide_b5iiiin,9471.93758,11932.47758,-4.200798


In [35]:
# mass correct: A9, A11, B2, C1, F4, B12
# mass questionable: A12 (has 2 bands), B3 (2 bands)
# mass wrong: 

In [36]:
# unlimit pandas column display number
pd.set_option('display.max_columns', None)

In [37]:
df[df["expression_levels"].isin(["low","medium","high","super_high"])].sort_values("expression_levels")[["Well Position","Name","mass_w_prefix","expression_levels"]]

Unnamed: 0,Well Position,Name,mass_w_prefix,expression_levels
4,A5,no_disulfide_b4iiin,9366.18008,high
10,A11,no_disulfide_b5iinin,14258.82088,high
13,B2,no_disulfide_b6iiniin,15382.08778,high
14,B3,no_disulfide_b6ininin,15709.22178,high
0,A1,no_disulfide_b3iii,9679.45718,low
2,A3,no_disulfide_b4iiiix,11860.02898,low
5,A6,no_disulfide_b4inin,12181.18698,low
7,A8,no_disulfide_b4nnnnx,11321.19328,low
21,B10,no_disulfide_l4iin,10978.09618,low
26,C3,no_disulfide_l5niin,12425.67008,low


In [None]:
# Choices for large scale expression

# A12	no_disulfide_b5ininn super_high
# A9	no_disulfide_b4nnnny
# B8	no_disulfide_h6i_i_i

# A1 no_disulfide_b3iii (low-medium)
# B12	no_disulfide_l5iiin (medium)
# F4	variable_linkers_l6innni	(medium)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'df' is your DataFrame and it includes a categorical 'expression_levels'
# Identify all numeric columns in the dataframe
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# 1. Box Plots for each numeric feature
for col in numeric_columns:
    print(col)
    plt.figure(figsize=(4,3))
    sns.boxplot(x='expression_levels', y=col, data=df)
    sns.swarmplot(x='expression_levels', y=col, data=df)
    plt.title(f'Box Plot of {col} by Expression Levels')
    plt.show()

# # 2. Violin Plots for each numeric feature
# for col in numeric_columns:
#     plt.figure(figsize=(6,4))
#     sns.violinplot(x='expression_levels', y=col, data=df)
#     plt.title(f'Violin Plot of {col} by Expression Levels')
#     plt.show()

# # 3. Scatter Plot Matrix (Pairplot)
# sns.pairplot(df, hue='expression_levels', vars=numeric_columns)
# plt.show()

# # 4. Faceted Histograms using seaborn's FacetGrid for one numeric feature example:
# g = sns.FacetGrid(df, col="expression_levels", col_wrap=4, height=3)
# g.map(plt.hist, numeric_columns[0], bins=20)
# plt.show()
