In [1]:
import pickle, gimmemotifs, pandas as pd, numpy as np, re, os
from gimmemotifs.motif import default_motifs
# Root directory of the project data; the cells below build their input paths from it.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
wd = '/ocean/projects/cis240075p/skeshari/igvf/bcell2/male_donor/'
# Output location for this analysis; the concatenated PWM file is written beneath it.
out_path = os.path.join(wd, 'out_data', 'cicero_lf_enrich')

In [10]:
# Run parameters, stored column-wise: position k of every list belongs to experiment k.
input_dict = {
    'experiment': ['PRDM1_KO', 'IRF4_KO', 'GC_PB', 'PB_ABC', 'GC_ABC'],
    'slide_starting_genes': [4472, 4500, 4725, 3420, 4603],
    'clusters_of_interest': [['1','2','3','7'], ['1','2','3','7'], ['7','3'], ['1','7'], ['1','3']],
    'order_fr_clust': [[1], [1], [2], [2], [2]],
    'order_fr_tfcomb': [[1], [1], [1,2], [2], [2]],
    'weight': ['strength', 'strength', 'strength', 'strength', 'strength'],
}
input_df = pd.DataFrame(input_dict)

#### Assign the input parameters
# `i` is the row of `input_df` whose settings are used for this run.
i = 2
experiment = input_df.at[i, 'experiment']
slide_starting_genes = input_df.at[i, 'slide_starting_genes']
clusters_of_interest = input_df.at[i, 'clusters_of_interest']
order_fr_clust = input_df.at[i, 'order_fr_clust']
order_fr_tfcomb = input_df.at[i, 'order_fr_tfcomb']
weight = input_df.at[i, 'weight']

#### Read TFs
# NOTE(review): hard-coded; it has the same elements as `clusters_of_interest` for i=2
# but does not follow `i` -- confirm whether it should be derived from the row above.
# The tuple is interpolated as-is into the file names below.
cluster_fusion = ('7','3')

# The TF names are the row index (first column) of each table; one table per prefix.
tf_names_by_source = {
    prefix: pd.read_csv(
        f"{wd}/out_data/lf_enrich/out_files/{prefix}_{cluster_fusion}_{experiment}.csv",
        header=0,
        index_col=0,
    ).index.values
    for prefix in ('SLIDE_LF', 'Net_match', 'Net_rnd')
}
slide_tot_TF = tf_names_by_source['SLIDE_LF']
net_match_TF = tf_names_by_source['Net_match']
net_rnd_TF = tf_names_by_source['Net_rnd']

# Distinct TF names across the three tables.
total_TFs = set().union(slide_tot_TF, net_match_TF, net_rnd_TF)

In [11]:
# Number of distinct TF names pooled into `total_TFs`.
len(total_TFs)

89

In [15]:
# Motif collection returned by gimmemotifs' `default_motifs()`.
motifs = default_motifs()
species = "Human"

# Keys of `motif.factors` whose TF lists are pooled for every motif.
factor_names = ['direct', 'indirect\nor predicted']

# motif id -> TF names listed under any of `factor_names`.
# Descriptive loop names are used on purpose: the previous `i`/`j` loop variables
# overwrote the experiment selector `i` defined in an earlier cell.
dict_motif2TFs = {}
for motif in motifs:
    pooled_factors = []
    for factor_name in factor_names:
        pooled_factors += motif.factors[factor_name]
    dict_motif2TFs[motif.id] = pooled_factors

# Letter-case convention applied to TF names, chosen by species.
# Species not listed (including "Drosophila" and "C.elegans") keep the names as they are.
if species in ["Mouse", "Rat"]:
    _normalise_case = str.capitalize
elif species in ["Human", "S.cerevisiae", "Arabidopsis", "Axolotl"]:
    _normalise_case = str.upper
elif species in ["Zebrafish", "Xenopus"]:
    _normalise_case = str.lower
else:
    _normalise_case = None

if _normalise_case is not None:
    dict_motif2TFs = {
        motif_id: [_normalise_case(tf_name) for tf_name in tf_names]
        for motif_id, tf_names in dict_motif2TFs.items()
    }

# Inverse mapping: TF name -> ids of every motif that lists it (in motif order).
dict_TFs2motif = {}
for motif_id, tf_names in dict_motif2TFs.items():
    for tf_name in tf_names:
        dict_TFs2motif.setdefault(tf_name, []).append(motif_id)

### Functions to process motif databases

In [16]:
def save_gimme_pwm_to_homer(output_dir):
    """Write every motif of the notebook-level `motifs` collection to its own file.

    Each motif goes to ``<output_dir>/<motif.id>.motif``. The text returned by
    ``motif.to_pwm()`` is written with a tab inserted after its first character
    (presumably the ``>`` header marker -- confirm against gimmemotifs) and a
    trailing newline appended.

    Parameters
    ----------
    output_dir : str
        Destination directory; created if it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)
    for motif in motifs:
        target_path = os.path.join(output_dir, f"{motif.id}.motif")
        with open(target_path, 'w') as outfile:
            pwm_text = motif.to_pwm()
            first_char, remainder = pwm_text[0], pwm_text[1:]
            outfile.write(first_char + '\t' + remainder + '\n')
    return None

def split_hocomoco_flat_motifs(input_file, output_dir):
    """Split a multi-motif file with ``>`` header lines into one file per motif.

    A motif starts at every line beginning with ``>``. Its name is the second
    whitespace-separated token of that header, cut at the first ``.``. The
    header and the lines that follow it (whitespace-stripped) are written to
    ``<output_dir>/<name>.motif``. When two headers yield the same name, the
    later motif overwrites the earlier file.

    Parameters
    ----------
    input_file : str
        Path of the flat motif file.
    output_dir : str
        Destination directory; created if it does not exist.

    Returns
    -------
    list of str
        Motif names in file order, one entry per header (duplicates kept).
    """
    os.makedirs(output_dir, exist_ok=True)

    def _flush(name, rows):
        # Nothing to write until a header with a non-empty name has been seen.
        if name and rows:
            with open(os.path.join(output_dir, f"{name}.motif"), 'w') as sink:
                sink.write("\n".join(rows) + "\n")

    seen_names = []
    current_name, current_rows = None, []
    with open(input_file, 'r') as source:
        for raw_line in source:
            stripped = raw_line.strip()

            if not stripped.startswith(">"):
                # Matrix row (or any other non-header line) of the current motif.
                current_rows.append(stripped)
                continue

            # A header closes the previous motif and opens a new one.
            _flush(current_name, current_rows)
            current_name = stripped.split()[1].split('.')[0]
            current_rows = [stripped]
            seen_names.append(current_name)

        # The final motif has no following header to trigger its flush.
        _flush(current_name, current_rows)

    print(f"Motifs saved in: {output_dir}")
    return seen_names

def split_meme_flat_motifs(input_file, output_dir):
    """Split a MEME-style multi-motif file into one ``.motif`` file per motif.

    A motif starts at a line beginning with ``MOTIF``; its name is the third
    whitespace-separated token of that line. Motifs whose name contains ``/``
    are skipped. The rows following the ``letter-probability matrix`` line are
    collected and written to ``<output_dir>/<name>.motif`` under a
    ``>\\t<name>`` header.

    A matrix is considered finished at the first blank line, at the next
    ``MOTIF`` header, or at end of file. Previously only a blank line ended a
    matrix, so the last motif of a file without a trailing blank line was never
    written although its name was returned, and rows of an unterminated matrix
    leaked into the next motif.

    Parameters
    ----------
    input_file : str
        Path of the MEME-style file.
    output_dir : str
        Destination directory; created if it does not exist.

    Returns
    -------
    list of str
        Names of the accepted motifs (no ``/`` in the name), in file order.

    Raises
    ------
    IndexError
        If a ``MOTIF`` line has fewer than three whitespace-separated tokens.
    """
    tf = []
    os.makedirs(output_dir, exist_ok=True)

    def _write_motif(name, rows):
        # One output file per motif: header line, then the matrix rows.
        with open(os.path.join(output_dir, f"{name}.motif"), 'w') as outfile:
            outfile.write(f">\t{name}\n")  # Motif header
            outfile.write("\n".join(rows) + "\n")

    with open(input_file, 'r') as infile:
        motif_name = None
        read_matrix = False
        motif_lines = []
        for line in infile:
            line = line.strip()
            # New motif detected
            if line.startswith("MOTIF"):
                # Flush a matrix that was not closed by a blank line, then reset
                # the state so nothing carries over into the new motif.
                if (motif_name is not None) and read_matrix and motif_lines:
                    _write_motif(motif_name, motif_lines)
                read_matrix = False
                motif_lines = []

                motif_name = line.split()[2]
                if '/' in motif_name:
                    motif_name = None
                else:
                    tf.append(motif_name)
            # Start reading the matrix
            elif (motif_name is not None) and line.startswith("letter-probability matrix"):
                read_matrix = True
            # While matrix reading is active, collect non-empty lines
            elif (motif_name is not None) and read_matrix and line:
                motif_lines.append(line)
            # A blank line ends the matrix
            elif (motif_name is not None) and read_matrix and line == "":
                _write_motif(motif_name, motif_lines)
                read_matrix = False
                motif_name = None
                motif_lines = []

        # End of file: write the last motif if no blank line closed its matrix.
        if (motif_name is not None) and read_matrix and motif_lines:
            _write_motif(motif_name, motif_lines)
    return tf

def cisbp_to_homer(cisbp_file, outfile, tf):
    """Append one tab-separated PWM table to an open file under a ``>`` header.

    Parameters
    ----------
    cisbp_file : str
        Path of a tab-separated table with a header row.
    outfile : file-like
        Writable text handle; a ``>\\t<tf>`` line and then the matrix rows are
        written to it.
    tf : str
        Name placed in the header line.

    Returns
    -------
    None
    """
    matrix = pd.read_csv(cisbp_file, sep="\t")

    # When a "Pos" column exists the leading column is dropped
    # (this assumes "Pos" is the first column -- confirm against the input files).
    probabilities = matrix.iloc[:, 1:] if "Pos" in matrix.columns else matrix

    outfile.write(f">\t{tf}\n")  # Motif header
    # Rows are space-separated, without index or column names, six decimals per float.
    probabilities.to_csv(outfile, sep=" ", index=False, header=False, float_format="%.6f")

    return None

### Load the motif databases and write the concatenated PWM file

In [20]:
# Switches for the (re)generation of per-motif files on disk.
flatten_hocomoco = False  # True: split the HOCOMOCO flat file into one file per motif
flatten_meme = True       # True: split the Lin-lab MEME file into one file per motif
write_gimme = True        # True: dump every gimmemotifs motif to its own file

# Common parent directory of the motif databases read below.
motif_db_dir = f"{wd}/out_data/out_other_methods/motifs"

# Reading motif files
motif_cluster = pd.read_csv(f'{motif_db_dir}/jaspar/jaspar2024_motif_cluster_annotations.csv')
# 'Motif' is split on '_' into (Motif, ID, Name); 'cluster_number' into (Cluster, cluster_number).
motif_cluster[['Motif','ID','Name']] = motif_cluster['Motif'].str.split('_', expand=True)
motif_cluster[['Cluster','cluster_number']] = motif_cluster['cluster_number'].str.split('_', expand=True)
motif_cluster = motif_cluster[['ID','Name','cluster_number']]

#'TF_Information.txt' contains, for each TF, all directly determined motifs (see below).
# If a TF does not have a directly determined motif, this file will also include its best inferred motif.
# 'Best' is defined as the motif(s) obtained from the most similar TF (based on the %ID in the amino acids of its TF) that has a directly determined motif.
TF_info_c = pd.read_csv(f"{motif_db_dir}/cisbp/Homo_sapiens_2025_03_16_3_27_pm/TF_Information.txt", sep='\t')
TF_info_c = TF_info_c[TF_info_c['TF_Status']!='N'].reset_index(drop=True) # No motif Available
TF_info_m = pd.read_csv(f"{motif_db_dir}/cisbp/Mus_musculus_2025_03_16_4_09_pm/TF_Information.txt", sep='\t')
TF_info_m = TF_info_m[TF_info_m['TF_Status']!='N'].reset_index(drop=True) # No motif Available

# The former read of CRC_lin_lab/MotifDictionary.txt into TF_info_l was removed:
# TF_info_l is reassigned unconditionally further down, so that table was never used.

if write_gimme:
    save_gimme_pwm_to_homer(f"{motif_db_dir}/gimme/pwm_all_motifs")

hocomoco_flat_file = f"{motif_db_dir}/hocomoco/H13CORE_homer_format_0.001.motif"
if flatten_hocomoco:
    TF_info_h = split_hocomoco_flat_motifs(
        hocomoco_flat_file,
        f"{motif_db_dir}/hocomoco/H13CORE_homer_format_0.001/pwm_all_motifs",
    )
else:
    # Only collect the names; the per-motif files are expected to exist from an earlier run.
    TF_info_h = []
    with open(hocomoco_flat_file, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                # First run of capitals/digits that is followed by a '.' on the header line.
                match = re.search(r'([A-Z0-9]+)\.', line)
                if match:
                    TF_info_h.append(match.group(1))

meme_flat_file = f"{motif_db_dir}/CRC_lin_lab/VertebratePWMs.txt"
if flatten_meme:
    TF_info_l = split_meme_flat_motifs(
        meme_flat_file,
        f"{motif_db_dir}/CRC_lin_lab/pwm_all_motifs",
    )
else:
    # Only collect the names, using the same rule as split_meme_flat_motifs:
    # third token of each MOTIF line, skipping names that contain '/'.
    # (The previous version had no loop over the file and kept only names WITH '/'.)
    TF_info_l = []
    with open(meme_flat_file, "r") as handle:
        for line in handle:
            if line.startswith("MOTIF"):
                motif_name = line.split()[2]
                if '/' not in motif_name:
                    TF_info_l.append(motif_name)

# with open(f"{wd}/out_data/co_analysis_1/intermediate_data/TF_to_TG_dictionary.pkl", 'rb') as f:
#     TF_to_TG_dict = pickle.load(f)

# TFs without a motif in any database / TF -> motif identifier(s) that were written.
no_tf_motif, tf_motif = [], {}
output_file = f"{out_path}/out_files/concatenated_pwm.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

pwm_root = f"{wd}/out_data/out_other_methods/motifs"


def _first_position_by_upper_name(names):
    """Map each upper-cased name to the position of its first occurrence; non-strings are skipped."""
    lookup = {}
    for position, name in enumerate(names):
        if isinstance(name, str):
            lookup.setdefault(name.upper(), position)
    return lookup


def _append_file(path, sink):
    """Copy the whole text of `path` onto the open handle `sink`."""
    with open(path, 'r') as infile:
        sink.write(infile.read())


# Case-insensitive name lookups, built once instead of once per TF.
jaspar_lookup = _first_position_by_upper_name(motif_cluster['Name'])
cisbp_hs_lookup = _first_position_by_upper_name(TF_info_c['TF_Name'])
hocomoco_lookup = _first_position_by_upper_name(TF_info_h)
linlab_lookup = _first_position_by_upper_name(TF_info_l)
cisbp_mm_lookup = _first_position_by_upper_name(TF_info_m['TF_Name'])

with open(output_file, 'w') as outfile:
    # Sorted so that the motif order in the output file is the same on every run.
    for tf in sorted(total_TFs): #list(TF_to_TG_dict.keys()):

        # Databases are tried in a fixed priority order; the first hit wins.
        # gimmemotifs: exact name first, then the capitalised spelling (Gimme_Mus).
        gimme_ids = dict_TFs2motif.get(tf) or dict_TFs2motif.get(tf.capitalize())

        if gimme_ids:
            for motif_id in gimme_ids:
                _append_file(f'{pwm_root}/gimme/pwm_all_motifs/{motif_id}.motif', outfile)
            tf_motif[tf] = gimme_ids

        elif tf in jaspar_lookup: #JASPAR-homer
            # .iloc because the lookup stores positions, not index labels.
            row = motif_cluster.iloc[jaspar_lookup[tf]]
            file_name_motif = row['Name']+'.'+row['ID']+'.motif'
            _append_file(f'{pwm_root}/jaspar/JASPAR2024_CORE_vertebrates_non-redundant_homer/{file_name_motif}', outfile)
            tf_motif[tf] = row['Name']

        elif tf in cisbp_hs_lookup: #CISBP
            row = TF_info_c.iloc[cisbp_hs_lookup[tf]]
            file_name_motif = row['Motif_ID']+'.txt'
            cisbp_file = f'{pwm_root}/cisbp/Homo_sapiens_2025_03_16_3_27_pm/pwms_all_motifs/{file_name_motif}'
            cisbp_to_homer(cisbp_file, outfile, row['TF_Name'])
            tf_motif[tf] = row['TF_Name']

        elif tf in hocomoco_lookup: #HOCOMOCO-homer
            hocomoco_name = TF_info_h[hocomoco_lookup[tf]]
            _append_file(f"{pwm_root}/hocomoco/H13CORE_homer_format_0.001/pwm_all_motifs/{hocomoco_name}.motif", outfile)
            tf_motif[tf] = hocomoco_name

        elif tf in linlab_lookup: #LinLab-meme
            linlab_name = TF_info_l[linlab_lookup[tf]]
            _append_file(f"{pwm_root}/CRC_lin_lab/pwm_all_motifs/{linlab_name}.motif", outfile)
            tf_motif[tf] = linlab_name

        elif tf in cisbp_mm_lookup: #CISBP_Mus
            row = TF_info_m.iloc[cisbp_mm_lookup[tf]]
            file_name_motif = row['Motif_ID']+'.txt'
            cisbp_file = f'{pwm_root}/cisbp/Mus_musculus_2025_03_16_4_09_pm/pwms_all_motifs/{file_name_motif}'
            cisbp_to_homer(cisbp_file, outfile, row['TF_Name'])
            tf_motif[tf] = row['TF_Name']

        else:
            no_tf_motif.append(tf)
# Counts: TFs with no motif found, TFs with at least one motif written.
print(len(no_tf_motif), len(tf_motif))
# with open(f"{out_path}/out_files/tf_motif.pkl", 'wb') as f:
#     pickle.dump(tf_motif, f)

11 78
