In [1]:
# Shell escape: print the hostname of the machine the kernel is running on.
!hostname


rubensolozabal-MSI


In [2]:
import os
# Run the notebook from the project root so that the relative paths used below
# ('data/...', 'datasets/...', the 'src' package) resolve.
# The root can be overridden with the TFBIND_ROOT environment variable; the
# original checkout location is kept as the default for backward compatibility.
os.chdir(os.environ.get('TFBIND_ROOT', '/home/solozabal/Documents/projects/TFBind'))

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the ETS1 table from CSV; row 0 of the file supplies the column names.
df = pd.read_csv(
    'data/ETS1/Mixed_MMvsPos.csv',
    header=0,
    sep=',',
)

In [5]:
# Rename the "Unnamed: 0" column to "Change", then display the frame.
df = df.rename({"Unnamed: 0": "Change"}, axis="columns")
df

Unnamed: 0,Change,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15
0,AA,7.320168,7.246614,7.260331,6.110860,5.445802,5.244965,5.516034,5.644994,5.747856,5.482331,6.830571,7.235255,7.353515
1,AT,7.245554,7.267127,7.510871,6.564748,6.661495,5.282475,5.217410,7.123130,7.123130,7.123130,6.322838,7.160409,7.241961
2,AC,7.301259,7.222632,7.605705,6.423103,6.314480,5.248425,5.266997,5.308990,5.621092,5.921250,6.066382,7.269268,7.288817
3,AG,7.132470,7.099841,7.248033,5.950233,5.688981,5.233870,5.219833,5.441993,5.408496,5.536062,6.426919,7.105429,7.262180
4,AX,7.018844,7.383503,8.535444,6.032799,5.788443,5.488724,5.254884,5.457271,5.806385,6.278308,5.950907,7.133228,7.068826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,MD,,,,,,,,,,,,,
117,MI,7.107766,7.196165,6.981429,5.772924,5.413027,5.192821,5.143062,5.453927,5.567918,5.382820,6.840487,7.153757,7.250999
118,Mg,6.986272,6.847157,6.652555,5.836638,5.442051,5.282475,5.164816,5.488788,5.226042,5.332840,5.757418,6.969087,7.249313
119,Ma,7.372220,7.037023,6.731179,5.761441,5.331933,5.118652,5.414219,5.555172,5.520548,5.554992,6.516235,7.280682,7.426153


In [6]:
# Reshape wide -> long: one row per (Change, Position) pair carrying its ln(I) value.
# Position columns are those named 'p<digits>' (here 'p3' ... 'p15').
pos_cols = [c for c in df.columns if c.startswith('p') and c[1:].isdigit()]

df_long = (
    df.melt(id_vars='Change', value_vars=pos_cols, var_name='Position', value_name='ln(I)')
      # expand=False makes str.extract return a Series instead of a one-column
      # DataFrame, so Position is assigned from a proper 1-D integer column.
      .assign(Position=lambda x: x['Position'].str.extract(r'(\d+)', expand=False).astype(int))
      # Drop (Change, Position) pairs that have no ln(I) value.
      .dropna(subset=['ln(I)'])
      .sort_values(['Change', 'Position'])
      .reset_index(drop=True)
)

df_long

Unnamed: 0,Change,Position,ln(I)
0,AA,3,7.320168
1,AA,4,7.246614
2,AA,5,7.260331
3,AA,6,6.110860
4,AA,7,5.445802
...,...,...,...
1256,gg,11,5.207028
1257,gg,12,5.394492
1258,gg,13,5.534682
1259,gg,14,6.930534


In [7]:
# Seed the Plus_strand / Minus_strand columns with the ETS1 strand constants
# (the same string on every row); a later cell overwrites them with the
# per-row mutated sequences.

# Explicit imports instead of `from src.constants import *`, so the origin of each
# name is visible. ETS1_LEN is imported here because a later cell asserts on it.
from src.constants import ETS1_LEN, ETS1_MINUS_STRAND, ETS1_PLUS_STRAND

df_long['Plus_strand'] = ETS1_PLUS_STRAND
df_long['Minus_strand'] = ETS1_MINUS_STRAND


In [8]:
# Display df_long to inspect the newly added Plus_strand / Minus_strand columns.
df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand
0,AA,3,7.320168,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
1,AA,4,7.246614,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
2,AA,5,7.260331,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
3,AA,6,6.110860,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
4,AA,7,5.445802,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
...,...,...,...,...,...
1256,gg,11,5.207028,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
1257,gg,12,5.394492,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
1258,gg,13,5.534682,TAGTGCCGGAAATGT,ACATTTCCGGCACTA
1259,gg,14,6.930534,TAGTGCCGGAAATGT,ACATTTCCGGCACTA


In [9]:
# Sanity check: the largest Position in the long table must equal ETS1_LEN.
assert df_long['Position'].max() == ETS1_LEN

In [10]:
# Explicit import instead of `from src.utils import *`, so the namespace is not
# polluted and the origin of the helper is visible.
# NOTE(review): assumes apply_strand_mutation is defined in src.utils — confirm.
from src.utils import apply_strand_mutation
from src.constants import ETS1_PLUS_STRAND, ETS1_MINUS_STRAND

# Requires df_long to have the columns Change and Position.
# Each row passes its Change and Position to apply_strand_mutation together with
# the unmodified strand constants (not the row's current strand values), so
# re-running this cell yields the same result. result_type="expand" spreads the
# value returned per row across the two target columns.
df_long[["Plus_strand", "Minus_strand"]] = df_long.apply(
    lambda row: apply_strand_mutation(
        change=row["Change"],
        position=int(row["Position"]),
        plus_strand=ETS1_PLUS_STRAND,
        minus_strand=ETS1_MINUS_STRAND,
    ),
    axis=1,
    result_type="expand",
)


In [11]:
# Display df_long to inspect the Plus_strand / Minus_strand values after the per-row update.
df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand
0,AA,3,7.320168,TAATGCCGGAAATGT,ACATTTCCGGCAATA
1,AA,4,7.246614,TAGAGCCGGAAATGT,ACATTTCCGGCACTA
2,AA,5,7.260331,TAGTACCGGAAATGT,ACATTTCCGGAACTA
3,AA,6,6.110860,TAGTGACGGAAATGT,ACATTTCCGACACTA
4,AA,7,5.445802,TAGTGCAGGAAATGT,ACATTTCCAGCACTA
...,...,...,...,...,...
1256,gg,11,5.207028,TAGTGCCGGAgATGT,ACATgTCCGGCACTA
1257,gg,12,5.394492,TAGTGCCGGAAgTGT,ACAgTTCCGGCACTA
1258,gg,13,5.534682,TAGTGCCGGAAAgGT,ACgTTTCCGGCACTA
1259,gg,14,6.930534,TAGTGCCGGAAATgT,AgATTTCCGGCACTA


In [12]:
from src.encode import groove_encoding_for_strands

# Encode each row's strand pair. Row-wise apply keeps this agnostic to the
# return type of groove_encoding_for_strands (tuple/list/dict/array).
_encoded = df_long.apply(
    lambda r: groove_encoding_for_strands(r["Plus_strand"], r["Minus_strand"]),
    axis=1
)

# Expand each per-row result into its own columns.
encoded_df = _encoded.apply(pd.Series)

# If the expansion produced non-string (positional) labels, assign the two
# column names explicitly.
if not all(isinstance(c, str) for c in encoded_df.columns):
    encoded_df.columns = ["Groove_major", "Groove_minor"]

# Drop any encoding columns left over from a previous run of this cell before
# concatenating; otherwise re-running the cell would append duplicate columns.
# On a first run nothing is dropped and the result is unchanged.
df_long = pd.concat(
    [df_long.drop(columns=encoded_df.columns, errors="ignore"), encoded_df],
    axis=1,
)

df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand,Groove_major,Groove_minor
0,AA,3,7.320168,TAATGCCGGAAATGT,ACATTTCCGGCAATA,"[[M, A, D, A], [A, D, A, M], [A, D, D, A], [M,...","[[x, A, n, A], [A, n, A, x], [A, n, n, A], [x,..."
1,AA,4,7.246614,TAGAGCCGGAAATGT,ACATTTCCGGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [A,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [A,..."
2,AA,5,7.260331,TAGTACCGGAAATGT,ACATTTCCGGAACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
3,AA,6,6.110860,TAGTGACGGAAATGT,ACATTTCCGACACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
4,AA,7,5.445802,TAGTGCAGGAAATGT,ACATTTCCAGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
...,...,...,...,...,...,...,...
1256,gg,11,5.207028,TAGTGCCGGAgATGT,ACATgTCCGGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
1257,gg,12,5.394492,TAGTGCCGGAAgTGT,ACAgTTCCGGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
1258,gg,13,5.534682,TAGTGCCGGAAAgGT,ACgTTTCCGGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."
1259,gg,14,6.930534,TAGTGCCGGAAATgT,AgATTTCCGGCACTA,"[[M, A, D, A], [A, D, A, M], [A, A, D, n], [M,...","[[x, A, n, A], [A, n, A, x], [A, D, A, x], [x,..."


In [13]:
# Save as CSV without the index column.
# Create the output directory first so the write does not fail when
# 'datasets/ETS1' does not exist yet (e.g. on a fresh checkout).
# NOTE(review): the Groove_* columns appear to hold nested sequences; CSV will
# store them as text, so they need parsing when the file is read back — confirm.
os.makedirs('datasets/ETS1', exist_ok=True)
df_long.to_csv('datasets/ETS1/dataset_ETS1_encoded.csv', index=False)