In [1]:
import os
import glob
import pandas as pd
import re

In [6]:
def find_pathways(input_folder, search_df):
    records = []

    for file_path in glob.glob(os.path.join(input_folder, '*.txt')):
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue

                parts = line.strip().split('\t')
                if len(parts) < 3:
                    continue

                pathway = parts[0]
                genes = parts[2:]
                genes_str = ', '.join([g for g in genes if g.strip()])

                for _, row in search_df.iterrows():
                    term = str(row['searchTerms']).strip().lower()
                    pattern = r'\b' + term
                    if re.search(pattern, pathway.lower()):
                        records.append({
                            'pathwayName': pathway,
                            'searchTerms': term,
                            'genes': genes_str,
                            'library': file_name,
                            'diseaseId': row.get('diseaseId'),
                            'name': row.get('name'),
                            'therapeuticAreas': row.get('therapeuticAreas')
                        })

    return pd.DataFrame(records, columns=[
        'pathwayName', 'searchTerms', 'genes', 'library',
        'diseaseId', 'name', 'therapeuticAreas'
    ])


In [7]:
reactome_diseases = pd.read_csv('/home/polina/genesets2evidence/disease_list/reactome_dis_terms_curated.csv')

In [14]:
gene_sets = '/home/polina/genesets2evidence/gene_sets/pathways_extended'

In [15]:
disease_pathways = find_pathways(gene_sets, reactome_diseases)

In [16]:
disease_pathways_grouped = disease_pathways.groupby('pathwayName').agg(lambda x: ','.join(sorted(set(x)))).reset_index()

In [17]:
# disease_pathways_grouped.to_csv('disease_pathways_v1.csv', index=False, sep='\t')

In [18]:
disease_pathways_grouped['genes'] = disease_pathways_grouped['genes'].str.split(', ')
disease_pathways_exploded = disease_pathways_grouped.explode('genes', ignore_index=True)

In [None]:
disease_pathways_exploded.to_csv('target_lists/v2_word_start/from_reactome_dis_v1.2_pathways_extended.csv', index=False)

### Parse genenames to filter out non-gene targets

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col, explode, split, collect_set, concat_ws, lit, filter, when, concat_ws
from pyspark.sql import DataFrame
import pandas as pd

In [22]:
spark = SparkSession.builder.getOrCreate()

In [23]:
# Target info from OT platform to parse gene names
target_path = "gs://open-targets-data-releases/25.03/output/target/"
target = spark.read.parquet(target_path)

                                                                                

In [24]:
def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Selecting specified columns from the second DataFrame, including its key column
    second_df_selected = second_df.select([second_key_column] + columns_to_join)

    second_columns_to_join_with_alias = ["b." + col for col in columns_to_join]

    return initial_df.alias("a")\
        .join(second_df_selected.alias("b"), 
            on = initial_df[initial_key_column] == second_df_selected[second_key_column], 
            how='left')\
        .select("a.*", *second_columns_to_join_with_alias)\
        .persist()

In [25]:
geneset_evidence1 = pd.read_csv('target_lists/v2_word_start/from_reactome_dis_v1.2_pathways_extended.csv', sep=',', header=0)
geneset_evidence1_spark = spark.createDataFrame(geneset_evidence1)

In [26]:
geneset_evidence2 = pd.read_csv('target_lists/v2_word_start/from_reactome_dis_v2.2_pathways_only.csv', sep=',', header=0)
geneset_evidence2_spark = spark.createDataFrame(geneset_evidence2)

In [27]:
target_list = ["id"]

geneset_evidence1_genenames = join_dataframes(geneset_evidence1_spark, target, "genes", "approvedSymbol", target_list).persist()

25/06/19 14:35:49 WARN CacheManager: Asked to cache already cached data.


In [28]:
geneset_evidence2_genenames = join_dataframes(geneset_evidence2_spark, target, "genes", "approvedSymbol", target_list).persist()

25/06/19 14:35:52 WARN CacheManager: Asked to cache already cached data.


In [29]:
geneset_evidence1_genenames.toPandas().to_csv('target_lists/v2_word_start/from_reactome_dis_v2.2_pathways_only_genenames.csv', index=False, sep=',')

                                                                                

In [30]:
geneset_evidence2_genenames.toPandas().to_csv('target_lists/v2_word_start/from_reactome_dis_v2.2_pathways_only_genenames.csv', index=False, sep=',')

                                                                                