In [18]:
# Print the name of the machine this notebook is running on.
!hostname


rubensolozabal-MSI


In [19]:
# Dataset selector: picks the input CSV, the strand constants and the output
# folder used by the cells below. Options: 'ETS1', 'EGR1', 'MITF'
TF = "MITF"

In [20]:
import os
# Work from the project root so the relative paths used below ('data/...',
# 'datasets/...') resolve; presumably this is also what makes the local `src`
# package importable -- confirm.
# Set the TFBIND_ROOT environment variable to run on another machine; when it
# is unset the original location is used, so existing behavior is unchanged.
os.chdir(os.environ.get('TFBIND_ROOT', '/home/solozabal/Documents/projects/TFBind'))

In [21]:
import numpy as np
import pandas as pd

In [22]:

# Input CSV for each supported TF, relative to the working directory.
_CSV_BY_TF = {
    'ETS1': 'data/ETS1/_ETS1_sMM_vs_positions_mixedLCS340+312-wt.csv',
    'EGR1': 'data/EGR1/_EGR1_sMM_vs_positions_MIXED.csv',
    'MITF': 'data/MITF/_MITFCbox_sMM_vs_positions_mixedWithLCS342-wt.csv',
}

# Reject an unsupported selector before touching the filesystem.
if TF not in _CSV_BY_TF:
    raise ValueError(f'Unknown TF: {TF}')

# Comma-separated file whose first line holds the column names.
df = pd.read_csv(_CSV_BY_TF[TF], sep=',', header=0)

In [23]:
# The column that was loaded under the name "Unnamed: 0" is renamed to "Change".
df = df.rename({"Unnamed: 0": "Change"}, axis="columns")
df

Unnamed: 0,Change,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14
0,AA,-0.312518,-0.021256,-0.528420,-1.084303,-2.494835,-1.470603,-1.976624,-2.232924,-0.914522,-2.568939,-1.315451,-0.514203,,
1,AT,-0.131915,-0.066268,0.000000,-0.564390,-1.928063,0.000000,-2.410440,-0.909013,-1.064097,-2.369806,0.000000,-0.406671,0.000000,
2,AC,-0.103394,0.171252,-0.422799,-1.325604,-2.414722,-1.685120,-1.823903,-0.711652,-1.655549,-1.275404,-1.084032,-0.634665,,
3,AG,-0.481488,-0.126255,-0.536110,-1.221476,-1.468976,-1.087968,-0.639303,-1.325249,-1.540374,-2.421636,-0.864695,-0.288864,,
4,AI,-0.290630,-0.391536,-0.427689,-0.985705,-1.856286,-1.395494,-0.650922,-1.412666,-1.704069,-2.535771,-0.616478,-0.598082,-0.366882,-0.364517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,DC,-0.115807,-0.125853,-0.610985,-1.549218,-2.422508,-2.225497,-2.533324,-1.318340,-2.269435,-0.932607,-1.339115,-0.222989,,
93,DG,-0.459136,-0.471691,-1.119472,-1.272415,-1.176521,-1.684228,-1.266972,-2.500259,-2.027683,-2.147292,-0.865264,-0.195758,,
94,DX,-0.118734,-0.084657,-0.769156,-1.560458,-2.338843,-1.635526,-2.600460,-2.000536,-2.144960,-2.351759,-0.762346,-0.510166,,
95,DU,-0.064557,-0.161180,-0.808967,-1.451776,-2.070315,-1.650201,-2.390633,-2.335327,-2.540474,-2.090574,-0.857123,-0.705233,,


In [24]:
# Reshape wide -> long: one row per (Change, Position) pair that has a value.
# Position columns are the ones named 'p' followed only by digits (p1, p2, ...).
pos_cols = [name for name in df.columns if name.startswith('p') and name[1:].isdigit()]

_melted = df.melt(
    id_vars='Change',
    value_vars=pos_cols,
    var_name='Position',
    value_name='ln(I)',
)
# Turn the column label into its integer part ('p7' -> 7).
_melted = _melted.assign(
    Position=lambda frame: frame['Position'].str.extract(r'(\d+)').astype(int)
)
# Rows whose ln(I) is missing (NaN) are removed.
_with_values = _melted.dropna(subset=['ln(I)'])
# Order by Change, then Position, and renumber the index from 0.
df_long = _with_values.sort_values(['Change', 'Position']).reset_index(drop=True)

df_long

Unnamed: 0,Change,Position,ln(I)
0,AA,1,-0.312518
1,AA,2,-0.021256
2,AA,3,-0.528420
3,AA,4,-1.084303
4,AA,5,-2.494835
...,...,...,...
1257,gg,10,-1.605242
1258,gg,11,-1.391758
1259,gg,12,-0.879398
1260,gg,13,-0.252286


In [25]:
# Pick the strand constants and expected length for the selected TF, then copy
# the two reference strand sequences into every row of df_long. A later cell
# overwrites these two columns with the per-row mutated sequences.

from src.constants import *

if TF == 'ETS1':
    PLUS_STRAND = ETS1_PLUS_STRAND
    MINUS_STRAND = ETS1_MINUS_STRAND
    LEN = ETS1_LEN
elif TF == 'EGR1':
    PLUS_STRAND = EGR1_PLUS_STRAND
    MINUS_STRAND = EGR1_MINUS_STRAND
    LEN = EGR1_LEN
elif TF == 'MITF':
    PLUS_STRAND = MITF_PLUS_STRAND
    MINUS_STRAND = MITF_MINUS_STRAND
    LEN = MITF_LEN
else:
    # Same error as the CSV-loading cell. Without this branch an unsupported
    # TF would surface below as a NameError, or silently reuse values left in
    # the kernel by an earlier run.
    raise ValueError(f'Unknown TF: {TF}')

# Scalar assignment broadcasts the same string to every row.
df_long['Plus_strand'] = PLUS_STRAND
df_long['Minus_strand'] = MINUS_STRAND


In [26]:
df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand
0,AA,1,-0.312518,GTATCACGTGATAC,GTATCACGTGATAC
1,AA,2,-0.021256,GTATCACGTGATAC,GTATCACGTGATAC
2,AA,3,-0.528420,GTATCACGTGATAC,GTATCACGTGATAC
3,AA,4,-1.084303,GTATCACGTGATAC,GTATCACGTGATAC
4,AA,5,-2.494835,GTATCACGTGATAC,GTATCACGTGATAC
...,...,...,...,...,...
1257,gg,10,-1.605242,GTATCACGTGATAC,GTATCACGTGATAC
1258,gg,11,-1.391758,GTATCACGTGATAC,GTATCACGTGATAC
1259,gg,12,-0.879398,GTATCACGTGATAC,GTATCACGTGATAC
1260,gg,13,-0.252286,GTATCACGTGATAC,GTATCACGTGATAC


In [27]:
# Sanity check: the largest position present in the data must equal the
# length constant selected for this TF.
_max_position = df_long['Position'].max()
assert LEN == _max_position

In [30]:
from src.utils import *
from src.constants import *

# Requires the Change and Position columns on df_long.
def _mutate_row(row):
    """Call apply_strand_mutation for one row, starting from the reference strands."""
    return apply_strand_mutation(
        change=row["Change"],
        position=int(row["Position"]),
        plus_strand=PLUS_STRAND,
        minus_strand=MINUS_STRAND,
    )


# result_type="expand" spreads each row's returned values across columns,
# which are then stored as the two strand columns.
_mutated_strands = df_long.apply(_mutate_row, axis=1, result_type="expand")
df_long[["Plus_strand", "Minus_strand"]] = _mutated_strands


In [31]:
df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand
0,AA,1,-0.312518,ATATCACGTGATAC,GTATCACGTGATAA
1,AA,2,-0.021256,GAATCACGTGATAC,GTATCACGTGATAC
2,AA,3,-0.528420,GTATCACGTGATAC,GTATCACGTGAAAC
3,AA,4,-1.084303,GTAACACGTGATAC,GTATCACGTGATAC
4,AA,5,-2.494835,GTATAACGTGATAC,GTATCACGTAATAC
...,...,...,...,...,...
1257,gg,10,-1.605242,GTATCACGTgATAC,GTATgACGTGATAC
1258,gg,11,-1.391758,GTATCACGTGgTAC,GTAgCACGTGATAC
1259,gg,12,-0.879398,GTATCACGTGAgAC,GTgTCACGTGATAC
1260,gg,13,-0.252286,GTATCACGTGATgC,GgATCACGTGATAC


In [32]:
from src.encode import groove_encoding_for_strands

# Encode each row's pair of strands. Row-wise apply is kept so the result can
# be expanded whatever groove_encoding_for_strands returns
# (tuple/list/dict/array).
_encoded = df_long.apply(
    lambda r: groove_encoding_for_strands(r["Plus_strand"], r["Minus_strand"]),
    axis=1
)

# Expand results into columns
encoded_df = _encoded.apply(pd.Series)

# Non-string (positional) labels mean the encoder returned unnamed values.
# Name them, and fail with a clear message if there are not exactly two,
# instead of pandas' generic length-mismatch error.
if not all(isinstance(c, str) for c in encoded_df.columns):
    if encoded_df.shape[1] != 2:
        raise ValueError(
            f"Expected 2 encoded columns (Groove_major, Groove_minor), "
            f"got {encoded_df.shape[1]}"
        )
    encoded_df.columns = ["Groove_major", "Groove_minor"]

# Make the cell safe to re-run: remove encoded columns left on df_long by a
# previous execution so they are replaced rather than appended a second time
# under duplicate names. On a fresh run nothing is dropped.
# NOTE(review): if the encoder ever returns named columns that clash with an
# input column (e.g. "Change"), that input column would be replaced -- confirm
# this cannot happen.
_base = df_long.drop(columns=list(encoded_df.columns), errors="ignore")
df_long = pd.concat([_base, encoded_df], axis=1)

df_long

Unnamed: 0,Change,Position,ln(I),Plus_strand,Minus_strand,Groove_major,Groove_minor
0,AA,1,-0.312518,ATATCACGTGATAC,GTATCACGTGATAA,"[[A, D, D, A], [M, A, D, A], [A, D, A, M], [M,...","[[A, n, n, A], [x, A, n, A], [A, n, A, x], [x,..."
1,AA,2,-0.021256,GAATCACGTGATAC,GTATCACGTGATAC,"[[A, A, D, n], [A, D, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [A, n, n, A], [A, n, A, x], [x,..."
2,AA,3,-0.528420,GTATCACGTGATAC,GTATCACGTGAAAC,"[[A, A, D, n], [M, A, D, A], [A, D, D, A], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, n, A], [x,..."
3,AA,4,-1.084303,GTAACACGTGATAC,GTATCACGTGATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [A,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [A,..."
4,AA,5,-2.494835,GTATAACGTGATAC,GTATCACGTAATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [x,..."
...,...,...,...,...,...,...,...
1257,gg,10,-1.605242,GTATCACGTgATAC,GTATgACGTGATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [x,..."
1258,gg,11,-1.391758,GTATCACGTGgTAC,GTAgCACGTGATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [x,..."
1259,gg,12,-0.879398,GTATCACGTGAgAC,GTgTCACGTGATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [x,..."
1260,gg,13,-0.252286,GTATCACGTGATgC,GgATCACGTGATAC,"[[A, A, D, n], [M, A, D, A], [A, D, A, M], [M,...","[[A, D, A, x], [x, A, n, A], [A, n, A, x], [x,..."


In [33]:
# Write df_long to datasets/<TF>/dataset_<TF>_encoded.csv without the index,
# creating the folder first if it does not exist yet.
out_dir = f'datasets/{TF}'
os.makedirs(out_dir, exist_ok=True)

_csv_path = os.path.join(out_dir, f'dataset_{TF}_encoded.csv')
df_long.to_csv(_csv_path, index=False)