## Novartis Demo: Distributed Differentially Private PCA in a Secure Environment 

## Imports

In [4]:
import os 
import numpy as np
import scipy
from scipy import stats
import pandas
import matplotlib.pyplot as plt

## Supporting functions

In [5]:
def dp_pca_ag ( A, N, epsilon = 1.0, delta = 0.1 ):
    """Return ``A`` perturbed by a symmetric matrix of Gaussian noise.

    Produces the differentially-private version of a second-moment matrix
    by adding zero-mean normal noise whose standard deviation is
    ``sqrt(2 * ln(1.25 / delta)) / (N * epsilon)``.

    Parameters
    ----------
    A : array
        Matrix to perturb. The noise is ``A.shape[0]`` x ``A.shape[0]``, so
        ``A`` is expected to be square (or broadcastable against that shape).
    N : number
        Divides the noise scale; ``localPCA`` passes the count it used to
        normalise ``A``. Must be non-zero.
    epsilon, delta : float
        Privacy parameters entering the noise scale. Both must be non-zero.

    Returns
    -------
    array
        ``A + E`` where ``E`` is symmetric; ``A`` itself is not modified.
    """
    dim = A.shape[0]
    noise_std = np.sqrt( 2.0 * np.log( 1.25 / delta ) ) * ( 1.0 / ( N * epsilon ) )

    # One i.i.d. draw for the full square; only the upper triangle
    # (diagonal included) is kept and then mirrored below the diagonal so
    # that the perturbation is symmetric.
    raw = np.random.normal( 0, noise_std, (dim, dim) )
    upper_with_diag = np.triu(raw)
    mirrored_lower = np.triu(raw, 1).transpose()
    symmetric_noise = upper_with_diag + mirrored_lower

    return A + symmetric_noise

def localPCA(data, epsilon = 0, delta = 0):
    """Compute the second-moment matrix of one site's local data.

    The result is ``(1 / N) * data @ data.T`` with ``N = data.shape[1]``,
    i.e. a square matrix of side ``data.shape[0]`` normalised by the number
    of columns of ``data``.
    NOTE(review): this presumably means one column per sample and one row
    per feature -- confirm the orientation against the caller.

    Parameters
    ----------
    data : 2-D array
        Local data matrix.
    epsilon : float
        When 0 (the default) the exact matrix is returned. Any other value
        routes the matrix through ``dp_pca_ag`` to add noise.
    delta : float
        Forwarded to ``dp_pca_ag``. It must be non-zero whenever ``epsilon``
        is non-zero, because ``dp_pca_ag`` divides by it.

    Returns
    -------
    array
        The (optionally noise-perturbed) second-moment matrix.
    """
    n_columns = data.shape[1]
    second_moment = (1/n_columns) * np.dot(data, data.T)

    # epsilon == 0 is the "no privacy" switch: hand back the exact matrix.
    if epsilon == 0:
        return second_moment

    return dp_pca_ag( second_moment, n_columns, epsilon, delta )

def globalPCA(Cs, K):
    """Average the local second-moment matrices and return a K-dim subspace.

    Parameters
    ----------
    Cs : sequence of arrays
        Local second-moment matrices, one per site; they are summed
        element-wise, so they must share a shape. Must be non-empty.
    K : int
        Number of leading columns to keep.

    Returns
    -------
    array
        The first ``K`` columns of the left singular-vector matrix that
        ``np.linalg.svd`` returns for the averaged matrix.
    """
    n_sites = len(Cs)

    # Start from the scalar 0 so the first addition allocates a fresh array
    # and the caller's matrices are never modified in place.
    pooled = 0
    for local_moment in Cs:
        pooled += local_moment
    pooled = (1/n_sites) * pooled

    left_vectors = np.linalg.svd(pooled)[0]
    return left_vectors[:, :K]

## Load data

In [7]:
# Base directory of the input tables; the read_csv calls below build their
# file paths from it.
datadir = "~/Novartis/201903_testcase"

In [8]:
def _read_tsv(path, index_col):
    """Read one tab-separated table, using ``index_col`` as the row index."""
    return pandas.read_csv(path, index_col=index_col, sep = "\t")


# The three input tables live under `datadir` as .tsv.gz files.
# The MSP table is indexed by its first two columns (a two-level row index);
# the metadata and norm tables are indexed by their first column only.
iHMPall_MSP_df = _read_tsv(f"{datadir}/iHMPall_MSP_df.tsv.gz", [0,1])
iHMPall_metadata_df = _read_tsv(f"{datadir}/iHMPall_metadata_df.tsv.gz", 0)
iHMPall_norm_df = _read_tsv(f"{datadir}/iHMPall_norm_df.tsv.gz", 0)

## Subset relevant samples per hospital and per diagnosis

In [37]:
def _first_sample_per_participant(diagnosis):
    """Return one sample ID per participant for the given diagnosis label.

    Rows of ``iHMPall_metadata_df`` whose "diagnosis" equals ``diagnosis``
    are selected, rows with any missing value are dropped, and for every
    "participantID" the first remaining row (in table order) is kept.
    The returned list holds the original row labels of those rows, so no
    participant contributes more than one sample.
    """
    wanted_columns = ["study|subset","participantID", "diagnosis", "visit_num"]
    has_diagnosis = iHMPall_metadata_df["diagnosis"] == diagnosis
    subset = iHMPall_metadata_df.loc[has_diagnosis, wanted_columns]

    # reset_index() moves the row labels into an "index" column so that they
    # survive the group-by and can be read back afterwards.
    complete_rows = subset.reset_index().dropna(axis="index", how="any")
    first_rows = complete_rows.groupby("participantID").first()
    return first_rows.loc[:,"index"].tolist()


# Sample IDs for each diagnosis, without duplicate participants.
i_nonIBD_baseline = _first_sample_per_participant("nonIBD")
i_UC_baseline = _first_sample_per_participant("UC")
i_CD_baseline = _first_sample_per_participant("CD")

In [63]:
# Per-hospital sample-ID lists for each diagnosis.
#
# NOTE: this cell rebinds i_CD_baseline, i_UC_baseline and i_nonIBD_baseline
# to metadata DataFrames (one per diagnosis); the previous cell bound the
# same names to lists of sample IDs.
# NOTE: unlike the previous cell there is no group-by on "participantID"
# here, so a participant may contribute several samples to a list.

def _diagnosis_metadata(diagnosis):
    """Return the metadata rows of ``iHMPall_metadata_df`` for one diagnosis.

    Only the "study|subset", "participantID", "diagnosis" and "visit_num"
    columns are kept.
    """
    wanted_columns = ["study|subset","participantID", "diagnosis", "visit_num"]
    return iHMPall_metadata_df\
        .loc[iHMPall_metadata_df["diagnosis"] == diagnosis, wanted_columns]


def _site_sample_ids(diagnosis_df, site):
    """Return the row labels of ``diagnosis_df`` rows recorded at ``site``.

    ``site`` is compared against the "study|subset" column, whose values
    carry the study prefix (e.g. "HMP2|Emory"). Rows with any missing value
    are dropped before the labels are collected.
    """
    at_site = diagnosis_df.loc[diagnosis_df["study|subset"] == site,:]
    # reset_index() exposes the row labels as an "index" column.
    return at_site.reset_index().dropna(axis="index", how="any")["index"].tolist()


# --- CD ---
i_CD_baseline = _diagnosis_metadata("CD")

i_CD_baseline_CedarsSinai = _site_sample_ids(i_CD_baseline, "HMP2|Cedars-Sinai")
i_CD_baseline_Emory = _site_sample_ids(i_CD_baseline, "HMP2|Emory")
i_CD_baseline_Cincinnati = _site_sample_ids(i_CD_baseline, "HMP2|Cincinnati")
i_CD_baseline_MGH = _site_sample_ids(i_CD_baseline, "HMP2|MGH")
i_CD_baseline_MGH_Pediatrics = _site_sample_ids(i_CD_baseline, "HMP2|MGH Pediatrics")

# --- UC ---
i_UC_baseline = _diagnosis_metadata("UC")

i_UC_baseline_CedarsSinai = _site_sample_ids(i_UC_baseline, "HMP2|Cedars-Sinai")
# Fixed: the Emory and MGH Pediatrics filters previously omitted the "HMP2|"
# prefix used by every other site filter, so they could not match the
# prefixed site labels and produced empty lists.
i_UC_baseline_Emory = _site_sample_ids(i_UC_baseline, "HMP2|Emory")
i_UC_baseline_Cincinnati = _site_sample_ids(i_UC_baseline, "HMP2|Cincinnati")
i_UC_baseline_MGH = _site_sample_ids(i_UC_baseline, "HMP2|MGH")
i_UC_baseline_MGH_Pediatrics = _site_sample_ids(i_UC_baseline, "HMP2|MGH Pediatrics")

# --- nonIBD ---
i_nonIBD_baseline = _diagnosis_metadata("nonIBD")

i_nonIBD_baseline_CedarsSinai = _site_sample_ids(i_nonIBD_baseline, "HMP2|Cedars-Sinai")
i_nonIBD_baseline_Emory = _site_sample_ids(i_nonIBD_baseline, "HMP2|Emory")
i_nonIBD_baseline_Cincinnati = _site_sample_ids(i_nonIBD_baseline, "HMP2|Cincinnati")
i_nonIBD_baseline_MGH = _site_sample_ids(i_nonIBD_baseline, "HMP2|MGH")
i_nonIBD_baseline_MGH_Pediatrics = _site_sample_ids(i_nonIBD_baseline, "HMP2|MGH Pediatrics")