In [1]:
import pandas as pd
import numpy as np

In [2]:
# Source data: df_ptrc.csv, downloaded from the preprint at
# doi: https://doi.org/10.1101/2020.06.25.170365
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='datasets/df_ptrc.csv')

In [4]:
# Helper column holding the character count of each sequence; it is used to
# filter rows by length. The vectorized .str accessor replaces the per-element
# lambda and yields NaN (instead of raising) for a missing sequence.
df['Length'] = df['Sequence'].str.len()

In [10]:
# Keep only promoters whose sequence is exactly 74 characters long:
# collect the index labels of every other row, then remove them in place.
wrong_length_rows = df.index[df['Length'] != 74]
df.drop(index=wrong_length_rows, inplace=True)

In [13]:
# The helper 'Length' column is removed from df in place.
del df['Length']

In [14]:
# Display df as the cell output (last expression of the cell).
df

Unnamed: 0,Name,Sequence,Fluorescence
0,ML2,TTGACAATTAATCATCCGGCTCGTATAATGTGCGGAATTGTAAGCG...,9790.19
1,ML3,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGTGGGCG...,1770.43
2,ML4,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGCGGGCG...,24277.73
3,ML5,TTGACAATTAATCACCCGGCTCGTATAATGTGTGGAGTTGTGAGCA...,12023.94
4,ML6,TTGACAATTAATCATCCGGCTCTTATAATGTGTGGAATTGTTATCT...,13260.34
...,...,...,...
3137,ML3139,TTGACAATTAATCATCCGGCTCGTATAATGCGTGGAATTGTGGGCG...,5476.28
3138,ML3140,TTGACAATTAATCATCCGGCTTGTATAGTGTGTGGAATTGTGGGCG...,4771.15
3139,ML3141,TTGACAATTAATCATCCGGCTCGTATAATGTTTGGAATTGTGAGCA...,8191.81
3140,ML3142,TTGACAATTAATCATCCGGCTCGTATAGTGTGTGGAATTGTGGGCG...,5752.45


# Scale fluorescence values

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
# Min-max scaler configured to map the fitted data onto the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

In [33]:
# Min-max scale the raw fluorescence readings and store the result in a new
# 'Expression scaled' column next to the original values.
# The double brackets select a one-column DataFrame (2-D), which is what gets
# passed to the scaler.
raw_fluorescence = df[['Fluorescence']]
# `fluorescence` holds the scaled values returned by the scaler; the name is
# kept in case other cells refer to it.
fluorescence = scaler.fit_transform(raw_fluorescence)
df['Expression scaled'] = fluorescence
# Last expression of the cell: render the updated table.
df

Unnamed: 0,Name,Sequence,Fluorescence,Expression scaled
0,ML2,TTGACAATTAATCATCCGGCTCGTATAATGTGCGGAATTGTAAGCG...,9790.19,0.236253
1,ML3,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGTGGGCG...,1770.43,0.040916
2,ML4,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGCGGGCG...,24277.73,0.589126
3,ML5,TTGACAATTAATCACCCGGCTCGTATAATGTGTGGAGTTGTGAGCA...,12023.94,0.290660
4,ML6,TTGACAATTAATCATCCGGCTCTTATAATGTGTGGAATTGTTATCT...,13260.34,0.320775
...,...,...,...,...
3137,ML3139,TTGACAATTAATCATCCGGCTCGTATAATGCGTGGAATTGTGGGCG...,5476.28,0.131179
3138,ML3140,TTGACAATTAATCATCCGGCTTGTATAGTGTGTGGAATTGTGGGCG...,4771.15,0.114004
3139,ML3141,TTGACAATTAATCATCCGGCTCGTATAATGTTTGGAATTGTGAGCA...,8191.81,0.197321
3140,ML3142,TTGACAATTAATCATCCGGCTCGTATAGTGTGTGGAATTGTGGGCG...,5752.45,0.137906


# Categorize Strength

In [34]:
# Class boundaries at one third and two thirds of the largest scaled value,
# i.e. three equal-width bands between 0 and that maximum.
max_scaled = df['Expression scaled'].max()
threshold_low = max_scaled / 3
threshold_high = max_scaled * 2 / 3

def categorize_expression(median, low, high):
    """Map a numeric expression value to one of three strength labels.

    Parameters
    ----------
    median : float
        Value to classify. The parameter name is kept for backward
        compatibility; any single numeric value can be passed.
    low : float
        Inclusive upper bound of the weakest class.
    high : float
        Inclusive upper bound of the middle class.

    Returns
    -------
    str or None
        '0' if ``median <= low``, '1' if ``low < median <= high``,
        '2' if ``median > high``, and ``None`` if ``median`` is NaN.
    """
    # NaN is the only value that differs from itself. Every ordering
    # comparison with NaN is False, so without this guard a missing value
    # would fall through to the last branch and be labelled '2'.
    if median != median:
        return None
    if median <= low:
        return '0'
    if median <= high:
        return '1'
    return '2'

# Label every row by passing its scaled expression value through
# categorize_expression; the two thresholds are forwarded by keyword.
df['Strength'] = df['Expression scaled'].apply(
    categorize_expression,
    low=threshold_low,
    high=threshold_high,
)

In [35]:
# Display df as the cell output (last expression of the cell).
df

Unnamed: 0,Name,Sequence,Fluorescence,Expression scaled,Strength
0,ML2,TTGACAATTAATCATCCGGCTCGTATAATGTGCGGAATTGTAAGCG...,9790.19,0.236253,0
1,ML3,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGTGGGCG...,1770.43,0.040916,0
2,ML4,TTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTGCGGGCG...,24277.73,0.589126,1
3,ML5,TTGACAATTAATCACCCGGCTCGTATAATGTGTGGAGTTGTGAGCA...,12023.94,0.290660,0
4,ML6,TTGACAATTAATCATCCGGCTCTTATAATGTGTGGAATTGTTATCT...,13260.34,0.320775,0
...,...,...,...,...,...
3137,ML3139,TTGACAATTAATCATCCGGCTCGTATAATGCGTGGAATTGTGGGCG...,5476.28,0.131179,0
3138,ML3140,TTGACAATTAATCATCCGGCTTGTATAGTGTGTGGAATTGTGGGCG...,4771.15,0.114004,0
3139,ML3141,TTGACAATTAATCATCCGGCTCGTATAATGTTTGGAATTGTGAGCA...,8191.81,0.197321,0
3140,ML3142,TTGACAATTAATCATCCGGCTCGTATAGTGTGTGGAATTGTGGGCG...,5752.45,0.137906,0


In [36]:
# Write the current contents of df to disk; index=False leaves the row index
# out of the file.
df.to_csv(path_or_buf='datasets/df_ptrc_ML.csv', index=False)