In [1]:
import pandas as pd
import numpy as np

# gene_FPKM_200501.csv

In [2]:
# Alternative input, currently disabled in favour of the file read below:
# data = pd.read_csv("data/data_old/original_files/gene_FPKM_200501.csv")
# NOTE(review): the "sample_10k_c5k" prefix suggests this is a subsample of
# the table above -- confirm how it was generated.
data = pd.read_csv("data/data_old/original_files/sample_10k_c5k_gene_FPKM_200501.csv")

In [6]:
# Move the header-less first CSV column (named 'Unnamed: 0' by pandas) into
# the row index. The guard keeps the cell idempotent: once the column has
# been moved, re-running the cell no longer raises KeyError.
if 'Unnamed: 0' in data.columns:
    data = data.set_index('Unnamed: 0')
data.head()

Unnamed: 0_level_0,GSM1717005,GSM3392879,SRX1977870,GSM2090494,GSM2388462,GSM2731794,GSM2520927,SRX3177096,SRX4170900,SRX2771292,...,GSM2790181,SRX349077,SRX4803567,SRX4171194,GSM3573039,SRX6381818,SRX4170867,SRX7581663,GSM3638267,SRX6382439
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35317,0.382268,0.099114,0.337159,0.136644,1.258159,0.151155,0.08304,0.363878,0.0,1.76451,...,0.0,0.0,0.256079,0.0,2.894369,0.0,0.205115,0.251151,0.13467,0.0
5062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.186742,0.0,0.0,0.0,0.0,0.0
6600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.884976,0.0,0.0,0.0,0.0,0.0
33068,0.177099,2.673183,0.090059,2.281365,1.59074,0.894481,13.338754,1.448938,4.73042,1.726705,...,0.0,2.281294,1.910024,0.570215,5.162827,0.0,1.20894,3.306149,0.133564,0.0


1. Eliminar datos de genes que tienen expresión 0 en algunas condiciones. Si lo que queremos es ver genes que tienen una expresión estable, parece lógico eliminar los genes que no se expresan en algunos experimentos. Se podría mirar cuántos genes quedan si se quitan los que tienen expresión 0 en 1 experimento, o en el 10% de los experimentos, por ejemplo. Había muchos genes que tenían expresión 0 en la mayoría de experimentos (en el summary de la primera base de datos, de los 27420 genes que expresan proteínas, 7528 tienen una mediana de expresión <0.001).


In [100]:
def remove_low_expression_genes(data, pct_0=0.1):
    """Drop the rows of ``data`` that contain too many exact zeros.

    A row is kept when its number of entries equal to 0 is strictly
    below ``round(pct_0 * n_columns)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose rows are filtered; zeros are counted across columns.
    pct_0 : float, default 0.1
        Fraction of the column count used to build the zero-count threshold.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the filter, in their original order.

    Notes
    -----
    ``np.round`` rounds halves to the nearest even integer and the
    comparison is strict, so whenever the threshold rounds to 0 every row
    is removed (e.g. ``pct_0=0.1`` on a table with 4 columns).
    """
    max_zeros = np.round(pct_0 * data.shape[1])

    # Vectorised per-row zero count. Compared with np.apply_along_axis and a
    # Python lambda, this avoids one Python call per row and also works on a
    # table with no rows (apply_along_axis raises ValueError in that case).
    zero_counts = (data == 0).sum(axis=1).to_numpy()

    # A plain boolean array selects rows by position, so duplicated index
    # labels cannot interfere with the selection.
    return data.loc[zero_counts < max_zeros]

In [101]:
# Keep only the rows whose zero count is below round(0.1 * number of columns).
data_red = remove_low_expression_genes(data, pct_0=0.1)

2. Una vez quitados estos genes, se podrían reescalar los datos de expresión de 0 a 100 para cada experimento (reescalar en cada columna del dataset inicial; en el código de abajo el rango usado es de 0 a 1). En realidad, los valores "absolutos" de expresión no nos interesan tanto como saber si un gen es de los más expresados, o está en el 10% de los menos expresados, por ejemplo. Esto en principio podría disminuir la variación, aunque con la conversión a unidades TPM ya estarían normalizados de alguna forma.

In [102]:
def scale01(data):
    """Min-max rescale every column of ``data`` independently.

    Each column is shifted by its minimum and divided by its range
    (max - min), so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are rescaled one by one.

    Returns
    -------
    pandas.DataFrame
        Rescaled table with the same index and columns as ``data``.

    Notes
    -----
    A column whose maximum equals its minimum has a zero range; no special
    handling is applied, so the division result is whatever pandas yields
    for 0 / 0 (NaN).
    """
    def _rescale(column):
        lowest = column.min()
        span = column.max() - lowest
        return (column - lowest) / span

    return data.apply(_rescale, axis=0)

In [103]:
# Rescale each column of the filtered table; the result replaces data_red.
data_red = scale01(data_red)

3. Hacer un resumen de los datos de nuevo para ver cómo quedaría la variación, y si se ven genes más "estables".

In [104]:
def get_statistics(data):
    """Summarise every row of ``data`` with mean, median, std and cv.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table; statistics are computed across the columns of each row.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` (same index) with the columns
        ``"mean"``, ``"median"``, ``"std"`` and ``"cv"``.

    Notes
    -----
    ``std`` is the sample standard deviation (pandas default ``ddof=1``) and
    ``cv`` is ``std / mean``. Rows whose mean is 0 get an infinite or NaN cv.
    """
    row_mean = data.mean(axis=1)
    row_median = data.median(axis=1)
    row_std = data.std(axis=1)

    # The coefficient of variation is derived from the two Series above, not
    # from a row-wise apply that would recompute the same std (ddof=1) and
    # mean once per row in Python.
    row_cv = row_std / row_mean

    summary = pd.concat([row_mean, row_median, row_std, row_cv], axis=1)
    summary.columns = ["mean", "median", "std", "cv"]
    return summary

In [106]:
# Per-row mean, median, std and cv of the filtered, rescaled table.
statistics = get_statistics(data_red)
statistics

Unnamed: 0_level_0,mean,median,std,cv
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35570,0.255306,0.273934,0.156278,0.612120
1151,0.088086,0.068795,0.082877,0.940856
22692,0.212381,0.170142,0.185451,0.873199
13060,0.307822,0.319374,0.192556,0.625545
18597,0.190088,0.180518,0.139505,0.733900
...,...,...,...,...
31884,0.183355,0.184873,0.129358,0.705504
13825,0.116759,0.093619,0.105719,0.905449
27251,0.109035,0.097984,0.090973,0.834354
37017,0.071182,0.056109,0.064362,0.904187


In [124]:
def get_top_genes(data, pct=0.1, by="mean", ascending=False):
    """Return the leading fraction of rows after sorting by one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to rank; it must contain the column named by ``by``.
    pct : float, default 0.1
        Fraction of rows to return. The row count is
        ``round(n_rows * pct)``, where ``np.round`` rounds halves to the
        nearest even integer.
    by : str, default "mean"
        Column used for ranking. The default reproduces the previous
        hard-coded behaviour.
    ascending : bool, default False
        Sort direction. ``False`` puts the largest values first; pass
        ``True`` to select the smallest values instead (for example the
        lowest ``"cv"``).

    Returns
    -------
    pandas.DataFrame
        The first ``round(n_rows * pct)`` rows of the sorted table.

    Raises
    ------
    ValueError
        If ``pct`` is negative. A negative count would otherwise be read by
        ``iloc`` as "all rows except the last k", silently returning the
        wrong rows.
    """
    if pct < 0:
        raise ValueError(f"pct must be non-negative, got {pct!r}")

    n_keep = int(np.round(data.shape[0] * pct))
    ranked = data.sort_values(by, ascending=ascending)
    return ranked.iloc[:n_keep]

In [125]:
# Show the 10% of rows with the highest "mean" in the summary table.
get_top_genes(statistics, pct=0.1)

Unnamed: 0_level_0,mean,median,std,cv
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
36241,0.691636,0.779047,0.287419,0.415564
18213,0.682141,0.768845,0.252036,0.369478
2742,0.608378,0.735138,0.356496,0.585978
28966,0.601635,0.716117,0.315639,0.524635
19416,0.585354,0.693266,0.309522,0.528777
...,...,...,...,...
18870,0.338778,0.374739,0.193421,0.570936
23074,0.338139,0.341473,0.233532,0.690639
5913,0.337462,0.361994,0.193766,0.574186
17726,0.336891,0.357128,0.205740,0.610700


# Ath-r.c5-0.expression.combat.txt

Expression values in this file are in log2 TPM units. They need to be converted back to the original (linear) scale before filtering zeros.

In [126]:
# Read the tab-separated expression table. This rebinds `data`, so the
# table loaded earlier in the notebook is no longer reachable by that name.
data = pd.read_csv("data/data_atted/original_files/Ath-r.c5-0.expression.combat.txt", sep = "\t")

In [143]:
# (number of rows, number of columns) of the table currently bound to `data`.
data.shape

(18957, 14741)

In [142]:
# NOTE(review): the notes for this dataset say the values are in log2 units
# and should be put back on the original scale before filtering zeros, but
# no such conversion is visible before this call -- confirm it was applied.
data_red = remove_low_expression_genes(data, pct_0=0.1)

In [145]:
# Rescale each column of the filtered table; the result replaces data_red.
data_red = scale01(data_red)

In [146]:
# Per-row mean, median, std and cv; rebinds `statistics` from the first dataset.
statistics = get_statistics(data_red)
statistics

Unnamed: 0,mean,median,std,cv
10723023,0.462353,0.462541,0.072358,0.156499
10723027,0.448918,0.448529,0.070923,0.157987
10723028,0.529903,0.531529,0.060460,0.114095
10723029,0.591566,0.606001,0.098777,0.166975
10723033,0.500315,0.501156,0.065030,0.129977
...,...,...,...,...
844435,0.622036,0.624276,0.055341,0.088967
844436,0.472417,0.477188,0.080805,0.171045
844438,0.337682,0.327379,0.092421,0.273692
844441,0.426548,0.428673,0.079001,0.185211


In [147]:
# Show the 10% of rows with the highest "mean" in the summary table.
get_top_genes(statistics, pct=0.1)

Unnamed: 0,mean,median,std,cv
843029,0.943741,0.966664,0.068835,0.072939
28718911,0.942447,0.958676,0.067144,0.071244
839871,0.922263,0.943659,0.074827,0.081134
818558,0.915098,0.931271,0.072935,0.079702
823901,0.905412,0.928084,0.077927,0.086068
...,...,...,...,...
836699,0.684828,0.687875,0.056486,0.082482
837180,0.684801,0.689866,0.054919,0.080196
814670,0.684750,0.691973,0.059822,0.087363
824694,0.684744,0.686729,0.057481,0.083946
