Extract the raw counts of the EPO cohort for subsequent TMM normalization in R.

In [1]:
import pandas as pd
import scanpy as sc
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns


# font keyword dict; presumably meant to be unpacked into matplotlib text
# calls -- it is not referenced in the visible cells
hfont = {'fontname':'Helvetica'}
# seaborn defaults: font scale of 2, then the 'white' style
sns.set(font_scale = 2)
sns.set_style(style='white')


# seed numpy's global RNG
np.random.seed(31)

# 09222021_filterBadSamples_epoOnly.ipynb

In [2]:
"""
@returns passingSamps, counts df restricted to the samples that do not exceed
the fixed cutoffs used by sampleQC for any of the ribosomal read fraction,
3' bias, or intron/exon ratio
"""
def sampleQC(cts, intronPath, riboPath, prime3Path,
             riboMax = 0.2, degMax = 0.4, intronMax = 3):
    """
    Drop the samples that exceed any of three fixed QC cutoffs.

    Each QC table is read with its first column as the index (sample IDs)
    and its first remaining column as the metric. Rows containing NaN are
    dropped before thresholding, so a sample that is missing from a table
    is not flagged by that metric.

    @param cts         counts df with one column per sample
    @param intronPath  path to the tab-separated intron/exon ratio table
    @param riboPath    path to the comma-separated ribosomal fraction table
    @param prime3Path  path to the tab-separated 3' bias table
    @param riboMax     largest ribosomal read fraction that still passes
    @param degMax      largest 3' bias value that still passes
    @param intronMax   largest intron/exon ratio that still passes
    @returns passingSamps, cts restricted to the columns that no metric
             flagged; columns come back sorted and de-duplicated because
             np.setdiff1d is used to remove the flagged samples
    """
    intronExon = pd.read_csv(intronPath, sep = "\t", index_col = 0).dropna()
    riboFrac = pd.read_csv(riboPath, sep = ",", index_col = 0).dropna()
    threePrimeBias = pd.read_csv(prime3Path, sep = "\t", index_col = 0).dropna()

    # default cutoffs are from moufarrej et al frontiers in pediatrics;
    # the comparison is strict, so a sample exactly at a cutoff passes
    bad = set(intronExon[intronExon.iloc[:, 0] > intronMax].index.tolist())
    bad.update(riboFrac[riboFrac.iloc[:, 0] > riboMax].index.tolist())
    bad.update(threePrimeBias[threePrimeBias.iloc[:, 0] > degMax].index.tolist())

    # flagged IDs that are not columns of cts are simply ignored here
    goodSamps = np.setdiff1d(cts.columns.tolist(), list(bad))

    passingSamps = cts[goodSamps]
    return passingSamps

In [3]:
def dropZeroGenes(df):
    """
    @returns the rows of df that hold at least one entry not equal to 0
    (a NaN entry compares as not equal to 0, so it keeps its row)
    """
    hasNonZero = (df != 0).any(axis=1)
    return df.loc[hasNonZero]

In [4]:
# base directory holding the merged counts and the three per-sample QC tables
msBase = "/Users/kayaneh/Documents/deconvolution/molecstetho/remapped_unstranded/"
molecStethoCtsPath = msBase + "htseq_merged_unstrandedTS3_molestetho.csv"
# the first two CSV columns form a two-level row index; the remaining
# columns are treated as samples by the QC step
cts = pd.read_csv(molecStethoCtsPath, sep = ",", index_col = [0, 1])

# QC metric tables that are handed to sampleQC
intronPath = msBase + "intron_exon_ratios_unstrandedTS3_molestetho.txt"
riboPath = msBase + "molecStetho_ribo_frac_unstrandedTS3.txt"
prime3Path = msBase + "deg_3prime_bias_frac_1_unstrandedTS3_molestetho.txt"

In [5]:
# load the three QC tables at notebook level (rows with NaN removed);
# sampleQC repeats these same reads internally, and these globals are not
# referenced again in the visible cells
intronExon = pd.read_csv(intronPath, sep = "\t", index_col = 0).dropna()
riboFrac = pd.read_csv(riboPath, sep = ",", index_col = 0).dropna()
threePrimeBias = pd.read_csv(prime3Path, sep = "\t", index_col = 0).dropna()

In [6]:
# keep only the sample columns that pass all three QC metrics
# (the cutoffs are defined by sampleQC)
passingQCCounts = sampleQC(cts, intronPath, riboPath, prime3Path)

In [7]:
# column labels (sample IDs) that survived QC
goodSamps = passingQCCounts.columns.tolist()

In [8]:
# subset the counts to the good samples (columns follow the order of goodSamps)
cts = cts[goodSamps]

In [9]:
cts.shape

(60721, 195)

# now let's do this based on study

In [10]:
# sheet_name = None loads every worksheet into a dict keyed by sheet name;
# the path is relative to the current working directory
shuMeta = pd.read_excel("Ibarra et al NC Sample Code.xlsx",
                       sheet_name = None)

In [11]:
# SRA run table; the first CSV column becomes the index
srrMeta = pd.read_csv("SraRunTable.txt.csv",
                     sep = ",", index_col = 0)
# copy the index into a regular "Run" column so it can be selected by name
srrMeta["Run"] = srrMeta.index.tolist()

In [12]:
shuMeta.keys()

dict_keys(['Healthy cont (plasma vs buffy)', 'Healthy cont (transcript type)', 'AML-MM patient samples overview', 'MM longitudial', 'AML longitudial', 'GCSF', 'EPO'])

# EPO only

In [13]:
# these are the CKD samples
epo = shuMeta['EPO']
# drop the first sheet row, then use the next row as the column labels
epo = epo.iloc[1:,:]
epo.columns = epo.iloc[0,:]
# drop the first column and index the rows by the "Day" column
epo = epo.iloc[:,1:]
epo.set_index("Day", inplace = True)
# drop the row that was just promoted to column labels
epo = epo.iloc[1:,:]
# display: per the output, one row per patient/control, one column per day,
# and each filled cell holds a sample name
epo

1,0,1,2,3,4,8,9,10,11,13,14,15,16,17,18,23,25,30,31,32
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
EPO1,3408_004-EPO1,3417_004-EPO1,,3446_004-EPO1,,,,3533_004-EPO1,,,,,,,,,,,,
EPO2,3492_004-EPO2,3502_004-EPO2,,,3523_004-EPO2,3558_004-EPO2,,,,,,,,,,,,,,
EPO3,3541_004-EPO3,3549_004-EPO3,,,3570_004-EPO3,3684_004-EPO3,,,,,,,,,,,,,,
EPO4,3693_004-EPO4,3702_004-EPO4,,3795_004-EPO4,,,3810_004-EPO4,,,,,,,,,,,,,
EPO5,6032_004-EPO5,6040_004-EPO5,,6081_004-EPO5,,,6300_004-EPO5,,,6551_004-EPO5,,6574_004-EPO5,,,,,,6952_004-EPO5,,
EPO6,6215_004-EPO6,6293_004-EPO6,,6417_004-EPO6,,,6584_004-EPO6,,,6867_004-EPO6,,,6879_004-EPO6,,,6949_004-EPO6,,6985_004-EPO6,,
EPO7,11954_004-EPO7,12072_004-EPO7,,12098_004-EPO7,,,,,,,12517_004-EPO7,,12570_004-EPO7,,,13073_004-EPO7,,13221_004-EPO7,,
EPO8,13340_004-EPO8,13421_004-EPO8,,,13449_004-EPO8,,,,13712_004-EPO8,,13742_004-EPO8,,,,13759_004-EPO8,,14186_004-EPO8,,,14270_004-EPO8
EPO9,14277_004-EPO9,14285_004-EPO9,,,14302_004-EPO9,,,14332_004-EPO9,,,,14434_004-EPO9,,14447_004-EPO9,,14476_004-EPO9,,,14552_004-EPO9,
Control 1,9495_010-Stability,9633_010-Stability,10045_010-Stability,10086_010-Stability,,,,,,,,,,,,,,,,


In [14]:
# Flatten the patient-by-day grid into three parallel lists (patient label,
# sample name, day), skipping the empty cells of each row.
patient, sampName, day = [], [], []
for pat in epo.index:
    filled = epo.loc[pat].dropna()
    day.extend(filled.index.tolist())
    sampName.extend(filled.values.tolist())
    patient.extend([pat] * len(filled))

In [15]:
# long-format table with one row per sample: the three lists are laid out
# as rows and then transposed into the Patient / Sample Name / Day columns
epoDF = pd.DataFrame(index = ["Patient", "Sample Name", "Day"],
            data = [patient, sampName, day]).T

In [16]:
epoDF.head(3)

Unnamed: 0,Patient,Sample Name,Day
0,EPO1,3408_004-EPO1,0
1,EPO1,3417_004-EPO1,1
2,EPO1,3446_004-EPO1,3


In [17]:
# Look up the SRA run for every EPO sample, in epoDF row order.
# Each sample name must match exactly one srrMeta row: zero or several
# matches would shift srrID relative to epoDF's rows, and assigning the list
# as a column afterwards (which is positional) could then pair samples with
# the wrong run without any error.
srrID = []
for i in epoDF["Sample Name"]:
    runsThisSamp = srrMeta.loc[srrMeta["Sample Name"] == i, "Run"].tolist()
    if len(runsThisSamp) != 1:
        raise ValueError(
            "expected exactly one SRA run for sample {!r}, found {}".format(
                i, len(runsThisSamp)))
    srrID += runsThisSamp

In [18]:
# NOTE: a list is assigned by position, so srrID must follow epoDF's row order
epoDF["Run"] = srrID # add the SRA info to the data

Subset epoDF to the samples passing QC.

In [19]:
# keep only the EPO samples whose run is among the QC-passing columns
epoDF = epoDF[epoDF["Run"].isin(goodSamps)]

In [20]:
epoDF.head()

Unnamed: 0,Patient,Sample Name,Day,Run
0,EPO1,3408_004-EPO1,0,SRR8492548
1,EPO1,3417_004-EPO1,1,SRR8492550
2,EPO1,3446_004-EPO1,3,SRR8492552
3,EPO1,3533_004-EPO1,10,SRR8492555
4,EPO2,3492_004-EPO2,0,SRR8492551


In [21]:
# save the post-QC EPO sample sheet, keeping the row index and the header
epoDF.to_csv("/Users/kayaneh/Documents/deconvolution/molecstetho/remapped_unstranded/epoONLY_postQC_unstranded_intron3_09222021.csv",
            index = True, header = True)

In [22]:
# pull the count columns of the retained EPO runs, in epoDF row order
epoCts_unstranded = cts[epoDF["Run"]]

In [23]:
# keep only the rows with at least one non-zero count across the retained samples
epoCts_unstranded = epoCts_unstranded[epoCts_unstranded.any(axis = 1)]

In [24]:
epoCts_unstranded.shape

(24915, 60)

In [25]:
# write the raw post-QC EPO counts
# NOTE(review): this path is relative to the current working directory,
# unlike the absolute path used when saving epoDF -- confirm the target
epoCts_unstranded.to_csv("remapped_unstranded/epoONLY_htseq-cts_unstrandedTS3_postQC.csv",
                       sep = ",")

In [26]:
clear all

[H[2J