In [1]:
import os
import glob
import pandas as pd

In [15]:
def find_pathways(input_folder, search_df):
    """Find gene-set pathways whose name contains one of the search terms.

    Every ``*.txt`` file directly inside ``input_folder`` is read as a
    tab-separated gene-set library: the first field of a line is the pathway
    name, the second field is ignored and all remaining fields are genes.
    A pathway line is reported once for every search term that occurs in its
    name as a case-insensitive substring.

    Parameters
    ----------
    input_folder : str
        Directory containing the ``*.txt`` gene-set files.
    search_df : pandas.DataFrame
        Must provide a ``searchTerms`` column. The columns ``diseaseId``,
        ``name`` and ``therapeuticAreas`` are copied to the output when
        present and are ``None`` otherwise. Rows whose search term is
        missing (NaN/None) or blank are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per (pathway line, matching term) with the columns
        ``pathwayName``, ``searchTerms``, ``genes``, ``library``,
        ``diseaseId``, ``name`` and ``therapeuticAreas``. ``genes`` is a
        ``', '``-joined string and ``library`` is the file name without its
        extension. The frame is empty (but has these columns) when nothing
        matches.

    Raises
    ------
    KeyError
        If ``search_df`` has rows but no ``searchTerms`` column.
    """
    columns = [
        'pathwayName', 'searchTerms', 'genes', 'library',
        'diseaseId', 'name', 'therapeuticAreas'
    ]

    # Prepare the search terms once; they do not change from line to line,
    # so there is no need to walk search_df again for every pathway.
    terms = []
    for _, row in search_df.iterrows():
        raw_term = row['searchTerms']
        # str(NaN) would be 'nan', which is a substring of real pathway
        # names, and an empty term is a substring of everything.
        if pd.isna(raw_term):
            continue
        term = str(raw_term)
        if not term.strip():
            continue
        terms.append((
            term,
            term.lower(),
            row.get('diseaseId'),
            row.get('name'),
            row.get('therapeuticAreas'),
        ))

    records = []

    # Sorted so the row order of the result does not depend on the
    # file-system order returned by glob.
    for file_path in sorted(glob.glob(os.path.join(input_folder, '*.txt'))):
        library = os.path.splitext(os.path.basename(file_path))[0]

        with open(file_path, 'r', encoding='utf-8') as handle:
            for line in handle:
                fields = line.strip().split('\t')
                if len(fields) < 3:
                    continue  # blank or malformed line

                pathway = fields[0]
                pathway_lower = pathway.lower()
                matches = [entry for entry in terms if entry[1] in pathway_lower]
                if not matches:
                    continue

                # Only build the gene string for pathways that are kept.
                genes_str = ', '.join(g for g in fields[2:] if g.strip())

                for term, _, disease_id, disease_name, areas in matches:
                    records.append({
                        'pathwayName': pathway,
                        'searchTerms': term,
                        'genes': genes_str,
                        'library': library,
                        'diseaseId': disease_id,
                        'name': disease_name,
                        'therapeuticAreas': areas
                    })

    return pd.DataFrame(records, columns=columns)

In [16]:
# Load the curated table of disease search terms. find_pathways() reads its
# 'searchTerms' column and copies 'diseaseId', 'name' and 'therapeuticAreas'
# into the result when those columns are present.
# NOTE(review): absolute, user-specific path - this cell only runs on a
# machine with the same directory layout.
reactome_diseases = pd.read_csv('/home/polina/genesets2evidence/disease_list/reactome_dis_terms_curated.csv')

In [17]:
# Folder that find_pathways() scans for '*.txt' gene-set files.
# NOTE(review): absolute, user-specific path - this cell only runs on a
# machine with the same directory layout.
gene_sets = '/home/polina/genesets2evidence/gene_sets'

In [None]:
# Collect one row per (pathway line, matching search term) across all
# gene-set files in the folder.
# NOTE(review): the saved notebook shows no execution count for this cell
# while the following cells carry counts 8-11 - re-run the notebook top to
# bottom to confirm the downstream results come from this call.
disease_pathways = find_pathways(gene_sets, reactome_diseases)

In [8]:
# Collapse to one row per pathway: for every other column, join the distinct
# values found in the group (sorted) with ','.
# Missing metadata (NaN/None, e.g. a blank cell in the search-term CSV) is
# dropped and the remaining values are cast to str first, because ','.join()
# raises TypeError on non-string items.
disease_pathways_grouped = (
    disease_pathways
    .groupby('pathwayName')
    .agg(lambda values: ','.join(sorted(set(values.dropna().astype(str)))))
    .reset_index()
)

In [9]:
# disease_pathways_grouped.to_csv('disease_pathways_v1.csv', index=False, sep='\t')

In [10]:
# Turn the aggregated gene string into one row per (pathway, gene).
# Within one gene list the genes are separated by ', ', but when a pathway
# name matched several different gene lists the groupby cell glued those lists
# together with a bare ','. Splitting on ',' and stripping whitespace handles
# both separators; splitting on ', ' would leave fused tokens such as 'B,C'.
# The result is built with assign() instead of overwriting
# disease_pathways_grouped['genes'], so this cell can be re-run safely.
disease_pathways_exploded = (
    disease_pathways_grouped
    .assign(genes=lambda frame: frame['genes'].str.split(','))
    .explode('genes', ignore_index=True)
    .assign(genes=lambda frame: frame['genes'].str.strip())
)

In [11]:
# Write the per-gene table as a tab-separated file. The output folder is
# created first because to_csv() does not create missing parent directories,
# which would make this cell fail on a fresh checkout.
os.makedirs('AD_counts_vs_OT', exist_ok=True)
disease_pathways_exploded.to_csv('AD_counts_vs_OT/AD_v1.txt', index=False, sep='\t')