# Exploring the DESeq2 data from the PACE cohort
### Matthew Muller
11/24/2022


## Setup

In [1]:
##########
# Library Imports
import pandas as pd
import numpy as np
import os
from pprint import pprint
from sklearn.preprocessing import label_binarize

##########
# Set Working directory

# %cd ..

##########
# Import Functions


## Data Cleaning

### Pace Data

In [2]:
## Training Data
# Median-of-ratios gene counts for the PACE cohort (the author's "press_1" source).
# Other locations the author has used for this table:
#   'data/hyper_feature_outtable.csv'                                   ("press")
#   '/Users/muller/Ruggles Lab Dropbox/Matthew Muller/projects/platelet-pace/output/hyper_geneset_creation/run15_hyper60_hypo40_AGRCONTROL/hyper_feature_outtable.csv'   ("press_2")
# NOTE(review): the saved output of this cell is a FileNotFoundError for the path
# below, so the file must exist relative to the working directory before running.
pace_features_csv = '../platelet-pace/output/hyper_geneset_creation/run15_hyper60_hypo40_AGRCONTROL/hyper_feature_outtable.csv'
pace_features = pd.read_csv(pace_features_csv, index_col=0)

# Patient-level metadata
pace_metadata = pd.read_csv('data/hypercohort_metatable.csv')

# Gene lists: the 3 key genes Jeffrey found separately (file has no header row),
# and the genes selected by the forest model (first row is the header).
key_3_genes = pd.read_csv('data/clean/key_3_genes.csv', header=None)
top_20_genes = pd.read_csv('data/clean/model_top20_genes.csv', header=0)

# Low-dose LTA values, kept as a one-column frame for regression targets
low_dose_lta_source = 'epi_04um_300s_n'
pace_lta = pace_features[[low_dose_lta_source]].rename(columns={low_dose_lta_source: 'Low-dose LTA'})

FileNotFoundError: [Errno 2] No such file or directory: '../platelet-pace/output/hyper_geneset_creation/run15_hyper60_hypo40_AGRCONTROL/hyper_feature_outtable.csv'

### Duke Data

In [None]:
## Validation Data
# Duke normalized counts; the code below treats the columns as gene symbols.
duke_features = pd.read_csv('data/duke_validation_run3/normcounttab_genesymbols.csv', index_col=0)

# subset the duke features to columns that also exist in the PACE table
# (the transpose round trip is left untouched so the resulting dtypes, and
#  therefore the exported values, stay exactly as before)
duke_features = duke_features.T[ duke_features.columns.isin(pace_features.columns) ].T

# Add a zero-filled column to Duke for every PACE column it lacks.
# Walk the PACE columns in their own order instead of iterating a set: the
# iteration order of a set of strings differs between interpreter sessions,
# which made the column order of `duke_features` (later saved as the gene list)
# change from one kernel restart to the next.
genes_missing_in_duke = [col for col in pace_features.columns if col not in duke_features.columns]
for gene in genes_missing_in_duke:
    duke_features[gene] = 0

# PACE training matrix: everything except the LTA value and the cohort label
pace_features_clean = pace_features.drop(['epi_04um_300s_n', 'hypercohort'], axis=1)

# pace_features_clean = pace_features.T[ pace_features.columns.isin(duke_features.columns) ].T
# pace_features_clean = pace_features.T[ pace_features.columns.isin(pace_all_genes['0']) ].T # Needed if using the forest model genes

# Binary labels built from the 'hypercohort' column with classes ordered
# ["nothyper", "hyper"]
pace_labels_clean = label_binarize(pace_features[['hypercohort']],
                                   classes=["nothyper", "hyper"])

# subset the duke features again; this also discards the zero-filled
# 'epi_04um_300s_n' / 'hypercohort' placeholders added by the loop above
duke_features = duke_features.T[ duke_features.columns.isin(pace_features_clean.columns) ].T

# The Duke labels are stored per cohort: group 1 and group 2.
# Author's note: both groups are either not given medication (1)
# or not on medication at the given time (2). EDIT: group 2 is garbage.
def _read_duke_labels(csv_path):
    """Load one cohort label table and keep only its 'compouttable[, 2]' column."""
    return pd.read_csv(csv_path, index_col=0)[['compouttable[, 2]']]


def _cohort_features(all_features, cohort_labels):
    """Return the rows of `all_features` whose index appears in `cohort_labels`, columns sorted by name."""
    member_rows = all_features[all_features.index.isin(cohort_labels.index)]
    return member_rows.reindex(columns=sorted(member_rows.columns))


duke_labels_group_1 = _read_duke_labels('data/duke_validation_run3/cohort_descriptions/group1plottable1.csv')  # group 1
duke_labels_group_2 = _read_duke_labels('data/duke_validation_run3/cohort_descriptions/group2plottable1.csv')  # group 2 (this group had NAs)

# Split the Duke feature table by cohort membership, then order the columns
duke_features_group_1 = _cohort_features(duke_features, duke_labels_group_1)
duke_features_group_2 = _cohort_features(duke_features, duke_labels_group_2)

## Regression Low-dose LTA values
duke_metadata = pd.read_csv('data/duke_validation_run3/dukemetatable_sel.csv')
duke_group_1_metadata = duke_metadata.loc[ duke_metadata['cohort']== 'group1' ]
# `columns=` is required here: a bare mapping passed to DataFrame.rename is
# applied to the row labels, so the column previously kept its original name
# instead of becoming 'Low-dose LTA' like the PACE counterpart.
duke_group_1_lta = duke_group_1_metadata[['characteristic__epi_max_05']].rename(columns={'characteristic__epi_max_05':'Low-dose LTA'})
# NOTE(review): this frame holds every 'group1' metadata row, while
# duke_features_group_1 is later narrowed to the longitudinal subjects and both
# are written to CSV without an index -- confirm the rows are meant to correspond.



##### Determine a duke cohort consistent between group 1 and 2 ######
# ie. a longitudinal duke cohort: subjects present in both groups whose
# 'characteristic__epi_max_05' value is on the same side of the cut-off in both.

HYPER_LTA_CUTOFF = 60    # strictly above this in BOTH groups -> duke_hyper
NORMAL_LTA_CUTOFF = 40   # strictly below this in BOTH groups -> duke_norm

# duke_intersection
duke_metadata_g1 = duke_metadata.loc[duke_metadata['cohort'] == 'group1'].set_index('characteristic__subject_id').sort_index()
duke_metadata_g2 = duke_metadata.loc[duke_metadata['cohort'] == 'group2'].set_index('characteristic__subject_id').sort_index()

# sorted(...) rather than list(set(...)): a set has no stable iteration order
# across interpreter sessions, and this order determines the row order of the
# feature/label tables that are later written to CSV without an index.
group_intersect = sorted(set(duke_metadata_g1.index).intersection(duke_metadata_g2.index))
duke_metadata_g1 = duke_metadata_g1.loc[group_intersect]
duke_metadata_g2 = duke_metadata_g2.loc[group_intersect]

# Both frames now share the same subject index, so the masks apply to either one.
g1_lta = duke_metadata_g1['characteristic__epi_max_05']
g2_lta = duke_metadata_g2['characteristic__epi_max_05']
hyper_in_both = (g1_lta > HYPER_LTA_CUTOFF) & (g2_lta > HYPER_LTA_CUTOFF)
normal_in_both = (g1_lta < NORMAL_LTA_CUTOFF) & (g2_lta < NORMAL_LTA_CUTOFF)

# Group 1: the 'Unnamed: 0' values are used as row labels of the feature/label tables
duke_hyper = duke_metadata_g1.loc[hyper_in_both]
duke_norm = duke_metadata_g1.loc[normal_in_both]
subjects = pd.concat([duke_hyper, duke_norm])['Unnamed: 0']

duke_features_group_1 = duke_features_group_1.loc[subjects]
duke_labels_group_1 = duke_labels_group_1.loc[subjects]

# Group 2: same masks, group-2 rows. duke_hyper / duke_norm are intentionally
# rebound to the group-2 rows, which is the state later cells read.
duke_hyper = duke_metadata_g2.loc[hyper_in_both]
duke_norm = duke_metadata_g2.loc[normal_in_both]
subjects2 = pd.concat([duke_hyper, duke_norm])['Unnamed: 0']

duke_features_group_2 = duke_features_group_2.loc[subjects2]
duke_labels_group_2 = duke_labels_group_2.loc[subjects2]

In [None]:
## Pull the Duke metadata for Tessa for a figure update.
# Probably not needed otherwise.
# duke_hyper = duke_metadata_g1.loc[(duke_metadata_g1['characteristic__epi_max_05'] > 60) & (duke_metadata_g2['characteristic__epi_max_05'] > 60)]
# duke_norm = duke_metadata_g1.loc[(duke_metadata_g1['characteristic__epi_max_05'] < 40) & (duke_metadata_g2['characteristic__epi_max_05'] < 40)]

# subjects = pd.concat([duke_hyper, duke_norm])
# subjects['label'] = list(duke_labels_group_1['compouttable[, 2]'].map({1:'hyper', 0:'normal'}))

# subjects.to_csv('/Users/muller/Desktop/duke_samples.csv')

### Duke and Pace Combined

In [None]:
## Combined Duke + PACE dataset, to test how well we can perform on both together.

# Duke group 2 was removed due to messy data; the earlier two-group version was:
# duke_labels = duke_labels_group_1.T.merge(duke_labels_group_2.T, left_index=True, right_index=True).T
# features = duke_features.T.merge(pace_features_clean.T, right_index=True, left_index=True).T
# labels = pd.concat([ duke_labels['compouttable[, 2]'], pace_features['hypercohort'].map({'nothyper':0, 'hyper':1}) ])

# Transpose both tables so their columns become the index, merge index-on-index,
# then transpose back.
duke_transposed = duke_features_group_1.T
pace_transposed = pace_features_clean.T
features = duke_transposed.merge(pace_transposed, left_index=True, right_index=True).T

# Duke labels first, then the PACE cohort column mapped to 0/1
duke_outcomes = duke_labels_group_1['compouttable[, 2]']
pace_outcomes = pace_features['hypercohort'].map({'nothyper': 0, 'hyper': 1})
labels = pd.concat([duke_outcomes, pace_outcomes])

### Look at subsets of PRESS dataset

In [None]:
## Look at the key_3_genes features
# right now I am missing one gene?
key_gene_names = key_3_genes.values.flatten()

# Select columns directly with .loc instead of transposing twice. `pace_features`
# still contains the string column 'hypercohort', so the transpose round trip
# passed every value through an object-dtype frame and returned the selected
# numeric columns as object dtype.
key_pace_features = pace_features.loc[:, pace_features.columns.isin(key_gene_names)]
key_duke_features = duke_features_group_1.loc[:, duke_features_group_1.columns.isin(key_pace_features.columns)]

## Export Data

In [None]:
## Write the cleaned tables to CSV for input into SKLearn.
# Each entry is (table, destination, keep_index). Only the PACE feature matrix
# is written with its index; every other table is written with index=None.
# Not exported (kept from the original for reference):
#   duke_features -> 'data/clean/duke/features.csv'
#   duke_labels   -> 'data/clean/duke/labels.csv'
csv_exports = [
    # Pace Data (Training)
    (pace_features_clean, 'data/clean/pace/features.csv', True),
    (pd.DataFrame(pace_labels_clean), 'data/clean/pace/labels.csv', False),
    # Duke Data (Validation)
    (duke_features_group_1, 'data/clean/duke/features_group1.csv', False),
    (duke_features_group_2, 'data/clean/duke/features_group2.csv', False),
    (duke_labels_group_1, 'data/clean/duke/labels_group1.csv', False),
    (duke_labels_group_2, 'data/clean/duke/labels_group2.csv', False),
    # Key 3 Gene Features
    (key_pace_features, 'data/clean/pace/key_features.csv', False),
    (key_duke_features, 'data/clean/duke/key_features.csv', False),
    # Combined Duke + PACE
    (features, 'data/clean/combined_features.csv', False),
    (labels, 'data/clean/combined_labels.csv', False),
    # Regression truth values
    (pace_lta, 'data/clean/pace/lta_values.csv', False),
    (duke_group_1_lta, 'data/clean/duke/lta_values.csv', False),
]

for table, destination, keep_index in csv_exports:
    if keep_index:
        table.to_csv(destination)
    else:
        table.to_csv(destination, index=None)

## Take a look

In [None]:
### Shape summary of everything built so far.
# Each row pairs two (label, object) entries that are printed together; the
# label padding reproduces the original column alignment.
shape_report = [
    ((' Duke 1 Features', duke_features_group_1), ('Duke 1 Labels  ', duke_labels_group_1)),
    ((' Duke 2 Features', duke_features_group_2), ('Duke 2 Labels  ', duke_labels_group_2)),
    ((' Pace Features', pace_features_clean), ('Pace Labels  ', pace_labels_clean)),
    ((' Pace Key Features  ', key_pace_features), ('Duke Key Features ', key_duke_features)),
]
for (first_label, first_obj), (second_label, second_obj) in shape_report:
    print(f'{first_label}: {first_obj.shape} \n',
          f'{second_label}: {second_obj.shape} \n')

# Save the gene names (columns of the Duke feature table) just in case
genes = duke_features.columns.to_numpy()
np.savetxt("data/clean/press_genes.csv", genes, delimiter=", ", fmt='% s')

# Save the group of consistent hyper dukers
# subjects.to_csv('data/clean/duke_longitudinal_group.csv')

# duke_hyper / duke_norm as they stand at this point, reduced to the sample id
# and the LTA column; the index is moved into a column before writing.
longitudinal_rows = pd.concat([duke_hyper, duke_norm])
longitudinal_table = longitudinal_rows[['Unnamed: 0', 'characteristic__epi_max_05']].reset_index()
longitudinal_table.to_csv('output/data-wrangling__2023-02-15/duke_group1_group2_longitudinal.csv')

duke_labels_group_2

 Duke 1 Features: (35, 451) 
 Duke 1 Labels  : (35, 1) 

 Duke 2 Features: (35, 451) 
 Duke 2 Labels  : (35, 1) 

 Pace Features: (84, 451) 
 Pace Labels  : (84, 1) 

 Pace Key Features  : (84, 2) 
 Duke Key Features : (35, 2) 



Unnamed: 0,"compouttable[, 2]"
DVS132,1.0
DV.S47R,1.0
DV.S62,1.0
DVS.85,1.0
DVS.114,1.0
DVS.120R,1.0
DVS.147R,1.0
DVS.159,1.0
DVS.182,1.0
DVS192,1.0
