In [8]:
# !sudo apt-get install python3.7
# !sudo apt-get update -y
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
# !sudo update-alternatives --config python3

In [9]:
!pip install numpy
!pip install pyopenms
!pip install biopython

[0mCollecting pyopenms
  Downloading pyopenms-2.7.0-cp37-cp37m-manylinux2014_x86_64.whl (49.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.0/49.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyopenms
Successfully installed pyopenms-2.7.0
[0mCollecting biopython
  Downloading biopython-1.81-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
[0m

In [303]:
import csv
import re

EXPERIMENT_NAME = "N4-1A"

# Rows of interest start with a letter followed by a non-zero digit
# (the `where` values shown further down look like "A7").
OLIGO_ROW_PATTERN = re.compile("^[a-zA-Z][1-9]")

# reading file; the context manager closes the handle even if filtering fails
with open(f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}.txt', 'r') as file1:
    oligo_list = [line for line in file1 if OLIGO_ROW_PATTERN.search(line)]

# The header is a single pre-joined, tab-separated field, so the default
# (comma) writer emits it verbatim as the first line of the file.
header = ['where\tcode\tseq\tDMT\tcomment\tlength\t']
data = [row.strip() for row in oligo_list]

# newline='' is what the csv module expects for files it writes to; without it
# the writer's '\r\n' terminator is translated a second time on Windows.
with open(f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_oligos.csv', 'w', encoding='UTF8', newline='') as f:
    # write the header
    writer = csv.writer(f)
    writer.writerow(header)

    # write the data: every stripped input line becomes one "field", and the
    # newline delimiter puts each of them on its own line, unquoted.
    writer = csv.writer(f, quotechar='\"', delimiter='\n', quoting=csv.QUOTE_NONE, escapechar='\\')
    writer.writerow(data)

In [304]:
import pandas as pd

# Load the tab-separated oligo table written by the previous cell.
oligos_csv_path = f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_oligos.csv'
df = pd.read_csv(oligos_csv_path, sep='\t')

# Keep only the rows that actually carry a sequence.
has_sequence = df['seq'].notna()
df = df.loc[has_sequence]

oligo_list = df['seq'].tolist()

In [305]:
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight

# Bio.SeqUtils.molecular_weight(seq, seq_type='DNA', double_stranded=False, circular=False, monoisotopic=False)

from pyopenms import *  # NOTE(review): no pyopenms name is used in this cell; import kept in case later cells rely on it

# Constant subtracted from every mass returned by Biopython.
# NOTE(review): presumably removes the 5' phosphate (HPO3, ~79.98 Da) that
# molecular_weight assumes to be present - confirm.
MASS_OFFSET = 79.97

# NOTE(review): molecular_weight is called with its default monoisotopic=False,
# so the values stored under 'monoisotopic_mass' are average masses. The column
# name is left unchanged because later cells read it.
mass_list = []
# https://biopython.org/docs/1.75/api/Bio.SeqUtils.html
for oligo_seq in oligo_list:
    label = str(oligo_seq).strip()
    try:
        # Build the Seq and compute the mass in a single guarded step, so a
        # failure can never fall through to a Seq left over from the previous
        # iteration.
        mass = molecular_weight(Seq(oligo_seq)) - MASS_OFFSET
    except (TypeError, ValueError) as err:
        # ValueError: sequence contains a letter molecular_weight cannot weigh
        # (e.g. ambiguous 'N'); TypeError: the value is not sequence-like.
        print(f"{label}\t{err}")
        mass_list.append("NaN")
        continue
    formatted_mass = "%0.2f" % mass
    print(f"{label}\t{formatted_mass}")
    mass_list.append(formatted_mass)

# assign() returns a new frame instead of writing into the filtered one.
df = df.assign(monoisotopic_mass=mass_list)
export_dataframe = df[['where', 'code', 'length', 'seq', 'monoisotopic_mass']]
export_dataframe.to_csv(f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_calculated.csv', sep='\t', index=False)


ATC TGT ATC TTA AAG ACT ATC ATC AAG ACT CAC ATC AAG ACT ATC ATC AAG TCA TAC	17390.25
TAT ATC TTG ATG ATT ACC TAA AGA GGA GAT GAT TGA TGT ATG ACT TGA TGA TAG TCT TGA TG	19269.41
TCC TCT TTA GGT AAT CAT CAA GAT ATA CGG TAA GAC TAT CAT CAA GAC TAA CAT CAA GTC AAG ACC	20240.07
GCT TAT TGA TTG GCT TAT GGT CTA TCC TTT AGA TAG GTC TTG ACT TGA TGT TAG TCT	17572.26
GAC CAT AAG CCA ATC AAT AAG CCC ACA CTA TAA GGA GGA GAC AAA CAG AGA	15738.21
GAC TAT CTC TTA TAG TTC TCC TCT TAT AGT CTC TGT TTG TCT CCT CCT T	14812.50
ATA AGA GGA GAA CTA TAA GAG ATA GTC ACT AAC AAA CAA TTA TAA TAA GAA TAA T	17064.11
AAG ATA GTC TTT AAG TCT ATG ATT GTA ATA GTA TTA TTC TTA TTA TAA TTG TTT GTT AGT	18531.96
ATT ACA ATC ATA GAC TTA AAG ACT ATC TTA AAG ATT ATC TCA AAA TAC CTA TTG ACT AT	18060.71
CCG TCT CGA TGG AAT GAA TAA TAC ATA GAC CAG ACA CCA TAG TCA ATA GGT ATT TTG AGA TAA TCT	20351.13
TTA TTC ATT CCA TCG AGA CGG GAC ATA GAG ACC TTA CCC GAA AGG AAG T	15097.74
TGC TAC TTA ACG TTA TCA GAC TTA GTG ACT TCC TTT CGG GTA A

In [298]:
def _strip_text_columns(frame):
    """Return a copy of ``frame`` with surrounding whitespace removed from every object-dtype column."""
    text_cols = frame.select_dtypes(object).columns
    cleaned = frame.copy()
    if len(text_cols):
        cleaned[text_cols] = cleaned[text_cols].apply(lambda col: col.str.strip())
    return cleaned


# reading file
df_calculated = _strip_text_columns(
    pd.read_csv(f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_calculated.csv', delimiter='\t', decimal=".")
)

# read concentrations
df = _strip_text_columns(
    pd.read_csv(f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_concentrations.csv', delimiter='\t', decimal=".")
)

# now merge cleaned dataframes; columns present in both keep the left-hand
# version, the right-hand duplicates are tagged with the '_drop' suffix
merge = pd.merge(df, df_calculated, on='where', how='inner', suffixes=('', '_drop'))

# Drop the duplicate columns. Match the suffix only: a plain substring test
# would also remove unrelated columns whose name merely contains "drop".
merge = merge.drop(columns=[col for col in merge.columns if col.endswith('_drop')])

# reassign
df = merge

In [299]:
import sys
from Bio.Seq import Seq

# calculation of oligo extinction coefficients (mM^–1*cm^–1);
def e260_func(sequence):
    """Return the extinction coefficient at 260 nm (mM^-1 cm^-1) of an oligo.

    The value is computed as twice the sum of the coefficients of all
    adjacent base pairs minus the sum of the coefficients of the internal
    (non-terminal) bases.

    Args:
        sequence: Oligo sequence; anything whose ``str()`` is the sequence.
            Whitespace is ignored and letter case does not matter.

    Returns:
        The coefficient as a number. An empty sequence yields 0 and a single
        base yields that base's own coefficient.

    Raises:
        KeyError: if the sequence contains a letter other than A, C, G or T.
    """
    coefficients = {
        "A": 15.4,
        "C": 7.4,
        "G": 11.5,
        "T": 8.7,
        "AA": 13.7,
        "AC": 10.6,
        "AG": 12.5,
        "AT": 11.4,
        "CA": 10.6,
        "CC": 7.3,
        "CG": 9,
        "CT": 7.6,
        "GA": 12.6,
        "GC": 8.8,
        "GG": 10.8,
        "GT": 10,
        "TA": 11.7,
        "TC": 8.1,
        "TG": 9.5,
        "TT": 8.4,
    }

    # Normalise: drop all whitespace (the input is written in spaced triplets)
    # and upper-case so the lookup keys match.
    bases = "".join(str(sequence).split()).upper()

    if not bases:
        return 0
    if len(bases) == 1:
        # No neighbouring pair exists; fall back to the base's own coefficient
        # instead of returning 0, which would divide by zero downstream.
        return coefficients[bases]

    e260 = 0
    # Every adjacent pair contributes twice its coefficient ...
    for left, right in zip(bases, bases[1:]):
        e260 = e260 + 2 * coefficients[left + right]
    # ... and every base that is not at either end is subtracted once.
    for inner in bases[1:-1]:
        e260 = e260 - coefficients[inner]
    return e260

# CHANGE THIS LATER
# Provisional value; it enters the formulas below only through the factor (1 + 100 / Yield).
Yield = 100

e260_column = "e260_mM^-1cm^-1"
molar_column = "corrected_molar_concentration_pmol/mkl"
mass_column = "monoisotopic_mass"

# Extinction coefficient of every oligo.
df[e260_column] = df["seq"].map(e260_func)

# Denominator shared by the molar concentration and the nucleic acid factor.
yield_scaled_e260 = df[e260_column] * (1 + (100 / Yield))

df[molar_column] = 1000 * 2 * df["A260"] / yield_scaled_e260  ## (mmol/L == nmol/mkl)
df["corrected_weght_concentration"] = df[molar_column] * df[mass_column] / 1000  ## ng/mkl
df["corrected_Nucleic_Acid_Factor"] = (2 / yield_scaled_e260) * df[mass_column]

# Two decimals are enough for the exported table.
df = df.round(2)

# write final merged result
merged_output_path = f'./content/{EXPERIMENT_NAME}/{EXPERIMENT_NAME}_merged_input.csv'
df.to_csv(merged_output_path, encoding='UTF8')

In [300]:
# Show the merged table, including the computed columns, as this cell's output.
df

Unnamed: 0,where,Date,Sample Name,Nucleic Acid(ng/uL),A260/A280,A260/A230,A260,A280,Nucleic Acid Factor,Baseline Correction (nm),Baseline Absorbance,code,length,seq,monoisotopic_mass,e260_mM^-1cm^-1,corrected_molar_concentration_pmol/mkl,corrected_weght_concentration,corrected_Nucleic_Acid_Factor
0,A7,06.12.2023 9:57,Sample 1,400.46,2.01,1.52,12.14,6.03,33.0,340,0.03,VibriophageN4_F3-1,#= 50,CTG ATA AAC CAC AAG ATA TCT ATG GTA TCG TTG CT...,15496.0,505.7,24.0,371.85,30.64
1,B7,06.12.2023 9:57,Sample 2,415.57,1.82,1.1,12.59,6.94,33.0,340,0.02,VibriophageN4_F3-2,#= 58,CGT CTT CTG TAC CGC TAA TCA CAA GGT CAC GTA GT...,17631.34,542.0,23.23,409.65,32.53
2,C7,06.12.2023 9:58,Sample 3,382.94,2.13,1.13,11.6,5.45,33.0,340,0.04,VibriophageN4_F3-3,#= 44,GAT TAG CGG TAC AGA AGA CGA GAT GCA AAC TCT CG...,13664.85,459.1,25.28,345.39,29.76
3,D7,06.12.2023 9:59,Sample 4,318.82,1.7,1.47,9.66,5.67,33.0,340,0.0,VibriophageN4_F3-4,#= 45,AGA CGT TCA GTG ATT TCA CCA GTC TTC TTG TCT TC...,13798.86,419.7,23.02,317.63,32.88
4,E7,06.12.2023 9:59,Sample 5,386.47,1.74,1.5,11.71,6.72,33.0,340,0.03,VibriophageN4_F3-5,#= 58,GGT GAA ATC ACT GAA CGT CTT GTG CTT GGT ACT CG...,17885.45,540.4,21.67,387.6,33.1
5,F7,06.12.2023 10:00,Sample 6,257.7,1.93,1.31,7.81,4.05,33.0,340,0.0,VibriophageN4_F3-6,#= 45,AGT TAC TGA GCG AGT CAC ACC ATA TTC TAA CCA TT...,13797.9,441.4,17.69,244.1,31.26
6,G7,06.12.2023 10:01,Sample 7,413.15,1.82,1.41,12.52,6.86,33.0,340,-0.0,VibriophageN4_F3-7,#= 54,TGT GAC TCG CTC AGT AAC TAA ACG CTC AGT TAT GA...,16561.64,509.1,24.59,407.29,32.53
7,H7,06.12.2023 10:02,Sample 8,424.99,1.96,1.25,12.88,6.55,33.0,340,0.0,VibriophageN4_F3-8,#= 49,CAA ACA CTT GGT CAG CAA AGC CAT ACT CTT TAG AG...,15026.7,481.0,26.78,402.35,31.24
8,I7,06.12.2023 10:02,Sample 9,469.35,1.91,1.41,14.22,7.44,33.0,340,0.48,VibriophageN4_F3-9,#= 54,CTT TGC TGA CCA AGT GTT TGA AGA TAC TGT GAT GC...,16659.72,522.8,27.21,453.23,31.87
9,J7,06.12.2023 10:03,Sample 10,305.86,1.75,1.37,9.27,5.29,33.0,340,-0.03,VibriophageN4_F3-10,#= 38,CTC AGT GAA CAT TGC ACC TTT GCC ATT GTC GAT AG...,11619.48,352.3,26.31,305.71,32.98


# Dependence of the final concentration on yield:
![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)