In [1]:
import pandas as pd
import scanpy as sc
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns


# Keyword dict carrying the Helvetica font name.
hfont = dict(fontname='Helvetica')
# Single seaborn call: default theme with the 'white' axes style and 2x font scaling.
sns.set(style='white', font_scale=2)


from scipy.stats import mannwhitneyu

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(31)

Use the correct samples from the EPO batches for TMM normalization.

In [2]:
# Directory prefix that is prepended to the input CSV file names read below.
basePath = "~/Documents/deconvolution/molecstetho/remapped_unstranded/"

In [3]:
# TMM scaling factors and library sizes; the first CSV column (sample IDs)
# becomes the row index.
allTMM = pd.read_csv(
    basePath + "tmmScalingFactors_epoONLY_unstranded_INTRON3_09222021.csv",
    index_col=0,
)
allTMM.head()

Unnamed: 0,tmmEPOApril,libsize
SRR8492548,1.446812,2841564
SRR8492550,1.405567,6834508
SRR8492552,1.184622,4136162
SRR8492555,1.354897,9592636
SRR8492551,0.909138,3274798


In [4]:
# Count matrix; the first two CSV columns together form the row MultiIndex,
# every remaining column is one sample.
epoCts = pd.read_csv(
    basePath + "epoONLY_htseq-cts_unstrandedTS3_postQC.csv",
    index_col=[0, 1],
)

In [5]:
def tmmScale(countsmat, tmmFacs, tmmCol, mLplasma):
    """
    Scale a counts matrix to TMM-normalized counts per million.

    Every sample column of countsmat is divided by
    (TMM factor * library size * mL plasma) and then multiplied by 10**6.

    @param countsmat = counts matrix with one column per sample (for molecular
    stethoscope samples, havent rejected bad samples, just specify good SRR later)
    @param tmmFacs = dataframe of # samps x 2, where the cols are the TMM factors + libsize
    (the library sizes are read from the column named "libsize")
    @param tmmCol = column name (str) of tmmFacs DF holding the TMM factors
    @param mLplasma = starting mL of plasma used for extracting RNA
    @return dataframe shaped like countsmat holding the scaled values
    @raises ValueError if the columns of countsmat can be matched to the rows of
    tmmFacs neither by label nor by position

    Samples are matched by label whenever every column of countsmat is present
    in the (unique) index of tmmFacs, so tmmFacs may list the samples in another
    order or contain additional samples. Otherwise the rows of tmmFacs are paired
    with the columns of countsmat by position, which requires equal lengths.
    """
    sampleIDs = countsmat.columns
    scaleFac = tmmFacs[tmmCol] * tmmFacs["libsize"] * mLplasma

    if tmmFacs.index.is_unique and sampleIDs.isin(tmmFacs.index).all():
        # Label-based pairing: immune to row order and to extra rows in tmmFacs.
        scaleFac = scaleFac.reindex(sampleIDs)
    elif len(scaleFac) == len(sampleIDs):
        # Labels do not line up (e.g. tmmFacs has a plain integer index):
        # keep the historical positional pairing.
        scaleFac.index = sampleIDs.tolist()
    else:
        raise ValueError(
            "cannot match the %d columns of countsmat to the %d rows of tmmFacs"
            % (len(sampleIDs), len(scaleFac))
        )

    scaled = countsmat.div(scaleFac, axis="columns") * 10 ** 6
    return scaled

In [6]:
# (rows, columns) of the TMM factor table.
allTMM.shape

(60, 2)

In [7]:
# (rows, columns) of the count matrix.
epoCts.shape

(24915, 60)

In [8]:
# Element-wise comparison of the count-matrix column labels with the TMM table
# row labels; all True means both list the same samples in the same order.
epoCts.columns == allTMM.index

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [9]:
# Scale the counts using the first column of allTMM as the TMM factor and mLplasma = 1.
epoAprilCPMTMM = tmmScale(epoCts, allTMM, tmmCol=allTMM.columns[0], mLplasma=1)

In [10]:
# Per-sample (column) totals of the scaled matrix.
epoAprilCPMTMM.sum(axis = 0)

SRR8492548    6.911750e+05
SRR8492550    7.114568e+05
SRR8492552    8.441509e+05
SRR8492555    7.380635e+05
SRR8492551    1.099943e+06
SRR8492553    9.952244e+05
SRR8492554    1.098863e+06
SRR8492643    9.355186e+05
SRR8492827    1.014121e+06
SRR8492545    1.545405e+06
SRR8492641    9.796761e+05
SRR8492644    9.316248e+05
SRR8492642    1.314062e+06
SRR8492645    1.521567e+06
SRR8492646    1.125372e+06
SRR8492649    1.355992e+06
SRR8492647    1.062121e+06
SRR8492650    8.971893e+05
SRR8492648    9.774181e+05
SRR8492618    9.786870e+05
SRR8492625    9.378345e+05
SRR8492624    9.331958e+05
SRR8492722    8.937488e+05
SRR8492620    8.990335e+05
SRR8492622    7.546744e+05
SRR8492619    8.016886e+05
SRR8492621    8.855633e+05
SRR8492623    7.819344e+05
SRR8492627    7.715239e+05
SRR8492546    1.110388e+06
SRR8492723    8.740242e+05
SRR8492727    7.652012e+05
SRR8492660    8.293245e+05
SRR8492673    9.332645e+05
SRR8492692    1.084679e+06
SRR8492743    7.231140e+05
SRR8492789    7.073263e+05
S

In [11]:
def dropZeroGenes(df):
    """
    Return the rows of df that contain at least one entry different from 0.

    @param df = dataframe whose rows are filtered
    @return dataframe with the all-zero rows removed; df itself is not modified
    """
    hasNonZero = (df != 0).any(axis=1)
    return df.loc[hasNonZero]

In [12]:
# Keep only the rows of the scaled matrix that are non-zero in at least one sample.
epoCPMTMM = dropZeroGenes(epoAprilCPMTMM)

In [13]:
# Per-sample totals after filtering; dropping all-zero rows leaves them unchanged.
epoCPMTMM.sum(axis = 0)

SRR8492548    6.911750e+05
SRR8492550    7.114568e+05
SRR8492552    8.441509e+05
SRR8492555    7.380635e+05
SRR8492551    1.099943e+06
SRR8492553    9.952244e+05
SRR8492554    1.098863e+06
SRR8492643    9.355186e+05
SRR8492827    1.014121e+06
SRR8492545    1.545405e+06
SRR8492641    9.796761e+05
SRR8492644    9.316248e+05
SRR8492642    1.314062e+06
SRR8492645    1.521567e+06
SRR8492646    1.125372e+06
SRR8492649    1.355992e+06
SRR8492647    1.062121e+06
SRR8492650    8.971893e+05
SRR8492648    9.774181e+05
SRR8492618    9.786870e+05
SRR8492625    9.378345e+05
SRR8492624    9.331958e+05
SRR8492722    8.937488e+05
SRR8492620    8.990335e+05
SRR8492622    7.546744e+05
SRR8492619    8.016886e+05
SRR8492621    8.855633e+05
SRR8492623    7.819344e+05
SRR8492627    7.715239e+05
SRR8492546    1.110388e+06
SRR8492723    8.740242e+05
SRR8492727    7.652012e+05
SRR8492660    8.293245e+05
SRR8492673    9.332645e+05
SRR8492692    1.084679e+06
SRR8492743    7.231140e+05
SRR8492789    7.073263e+05
S

In [14]:
#epoAprilCPMTMM.to_csv("remapped_unstranded/epoAprilCPMTMM_08152021.csv", sep = ",", header = True)
# Write through basePath (which already ends in "remapped_unstranded/") so the
# export lands next to the input files regardless of the kernel's working directory.
epoCPMTMM.to_csv(basePath + "epoONLYCPMTMM_09222021_INTRON3.csv", sep = ",", header = True)

In [15]:
# NOTE(review): `clear all` is not Python. Judging by the escape-sequence output,
# IPython dispatched it to a terminal-style `clear`, which does not delete any
# variables. If the intent was to empty the namespace, `%reset -f` does that - confirm.
clear all

[H[2J