# SCRIPT TO OBTAIN PHENOTYPIC BINARY TRAITS FOR OPIOIDS (FENTANYL AND MORPHINE)

Control group: people that are present in OMOP_drug_era

Phenotypes cases: 
- People taking only morphine 
- People taking only fentanyl
- People taking both morphine and fentanyl

The rest of participants: NA


## This script should only be run once

#### Initialization
##### Load packages

In [30]:
import dxdata
import dxpy
import pyspark
from pyspark.sql import functions as F

import pandas as pd

from pathlib import Path

##### Spark and dataset configuration 

In [2]:
# Reuse the kernel's SparkContext when one already exists. Calling
# pyspark.SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once", so the original cell could
# not be re-run without restarting the kernel.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# Find the database object whose name matches "app*" in the project root
# and make it the active Spark SQL database.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database",
    name="app*", folder="/", name_mode="glob",
    describe=True
)["describe"]["name"]
spark.sql("USE " + dispensed_database_name)

# Find the Dataset object whose name matches "app*.dataset" in the project
# root and load it through dxdata.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [31]:
# Only the participant identifier ('eid') is needed from each table.

# Every participant in the dataset.
participant = dataset["participant"]
df_participant = participant.retrieve_fields(
    names=["eid"],
    engine=dxdata.connect(),
)

# Participants with rows in omop_drug_era are used as the control group.
drug_era = dataset["omop_drug_era"]
df_era = drug_era.retrieve_fields(
    names=["eid"],
    engine=dxdata.connect(),
)

#### Data
##### Retrieve the binary phenotypes

In [32]:
# Build one binary phenotype column per opioid exposure group:
#   1  -> participant is listed in that phenotype's case file
#   0  -> participant is not a case but has an eid in omop_drug_era (control)
#   NA -> every other participant
PHENOTYPE_DIR = Path("/mnt/project/WGS_Lucia/Data/phenotypes")


def _load_case_eids(phenotype):
    """Return the 'eid' column of ``PHENOTYPE_DIR/<phenotype>.csv`` as a list of ints."""
    cases = pd.read_csv(PHENOTYPE_DIR / f"{phenotype}.csv")
    return cases["eid"].astype(int).tolist()


# 1. Case eids for each phenotype, read from the per-phenotype CSV files.
only_morphine_eids = _load_case_eids("only_morphine")
only_fentanyl_eids = _load_case_eids("only_fentanyl")
both_morphine_fentanyl_eids = _load_case_eids("both_morphine_fentanyl")

# Column name -> case eids; the insertion order is the output column order.
case_eids_by_phenotype = {
    "only_morphine": only_morphine_eids,
    "only_fentanyl": only_fentanyl_eids,
    "both_morphine_fentanyl": both_morphine_fentanyl_eids,
}

# 2. Control group: every distinct eid present in omop_drug_era.
control_group_df = df_era.select("eid").distinct().toPandas()
control_group_eids = control_group_df["eid"].astype(int).tolist()

# 3. One row per participant; eid is cast to int so it compares equal to
#    the integer eids collected above.
participant_df = df_participant.select("eid").toPandas()
participant_df["eid"] = participant_df["eid"].astype(int)

# 4. Fill the phenotype columns. A nullable integer dtype is used instead of
#    object columns initialised to None, so values are always exactly 0, 1 or NA.
# NOTE(review): a case of one phenotype that also appears in omop_drug_era is
# coded 0 (control) in the other two columns — confirm this is intended.
in_control = participant_df["eid"].isin(control_group_eids)
for phenotype, case_eids in case_eids_by_phenotype.items():
    status = pd.Series(pd.NA, index=participant_df.index, dtype="Int8")
    status.loc[in_control] = 0
    # Cases are assigned last so they override the control coding.
    status.loc[participant_df["eid"].isin(case_eids)] = 1
    participant_df[phenotype] = status

In [33]:
# Sanity check: was at least one participant coded 1 in 'only_morphine'?
has_one_in_only_morphine = (participant_df['only_morphine'] == 1).any()

# Report the outcome
print(
    "There is at least one '1' in the 'only_morphine' column."
    if has_one_in_only_morphine
    else "There are no '1's in the 'only_morphine' column."
)

There is at least one '1' in the 'only_morphine' column.


##### DataFrame formatting

In [34]:
# Preview the phenotype columns only; the 'eid' column is left out of the output.
display(participant_df.drop(columns=["eid"]))

Unnamed: 0,only_morphine,only_fentanyl,both_morphine_fentanyl
0,0,0,0
1,,,
2,0,0,0
3,,,
4,0,0,0
...,...,...,...
502231,,,
502232,0,0,0
502233,,,
502234,,,


In [35]:
# Set the 'eid' column as the index
participant_df.set_index('eid', inplace=True)

# Insert 'FID' and 'IID' as the first two columns using the index
participant_df.insert(0, "FID", participant_df.index)
participant_df.insert(1, "IID", participant_df.index)

##### Export and upload DataFrame

In [36]:
# Write the table as tab-separated text. The index is not written (the ids
# are already in the FID/IID columns) and missing values are written as "NA".
participant_df.to_csv(
    path_or_buf="phenotypes.opioids.tsv",
    sep="\t",
    na_rep="NA",
    index=False,
)

In [29]:
# Upload the exported TSV to the /WGS_Lucia/Data/phenotypes/ folder of the project.
# NOTE(review): re-running this cell presumably uploads another copy rather than
# replacing the existing file — confirm before running more than once.
! dx upload phenotypes.opioids.tsv --path /WGS_Lucia/Data/phenotypes/

ID                                file-GxK0VXjJb4J581fQ9YKkpKyx
Class                             file
Project                           project-GfVK998Jb4JJgVBjKXPyxJ9q
Folder                            /WGS_Lucia/Data/phenotypes
Name                              phenotypes.opioids.tsv
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Fri Dec 13 10:13:31 2024
Created by                        luciass6
 via the job                      job-GxJzy8QJb4J3Jb0G86G06719
Last modified                     Fri Dec 13 10:13:32 2024
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"
