In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
#unfiltered gene counts table
raw_counts = pd.read_csv('Unfiltered Gene Counts Table Joined.tsv', sep='\t')
raw_counts.head()

Unnamed: 0,Gene_ID,Ctrl_F1,Ctrl_F2,AIS_M1,Ctrl_M1,AIS_M2,Ctrl_M2,Ctrl_M3,AIS_M3,AIS_M4,...,Ctrl_M5,AIS_F1,Ctrl_F5,Ctrl_F6,AIS_F2,AIS_F3,AIS_F4,AIS_F5,AIS_F6,Ctrl_F7
0,ENSG00000082929,139.0,110.0,7.0,219.0,28.0,60.0,12.0,251.0,4.0,...,70.0,13.0,33.0,14.0,503.0,179.0,22.0,153.0,123.0,5.0
1,ENSG00000115934,1.0,1.0,3.0,2.0,6.0,1.0,3.0,178.0,3.0,...,0.0,10.0,11.0,9.0,464.0,5.0,2.0,2.0,218.0,0.0
2,ENSG00000122043,8.0,11.0,10.0,971.0,47.0,13.0,19.0,21.0,172.0,...,59.0,266.0,57.0,22.0,32.0,10.0,421.0,510.0,255.0,12.0
3,ENSG00000122548,421.0,4.0,187.0,16.0,561.0,1.0,227.0,186.0,3.0,...,19.0,15.0,8.0,8.0,14.0,215.0,34.0,167.0,0.0,263.0
4,ENSG00000124915,4.0,24.0,206.0,1010.0,403.0,138.0,7.0,21.0,187.0,...,54.0,23.0,1012.0,675.0,14.0,10.0,319.0,395.0,88.0,4.0


In [42]:
#set gene id as index and drop genes with zero count (for calculating geometric mean)
counts = raw_counts.loc[~(raw_counts == 0).any(axis=1)].iloc[:-1].set_index('Gene_ID')
counts.head()

Unnamed: 0_level_0,Ctrl_F1,Ctrl_F2,AIS_M1,Ctrl_M1,AIS_M2,Ctrl_M2,Ctrl_M3,AIS_M3,AIS_M4,AIS_M5,...,Ctrl_M5,AIS_F1,Ctrl_F5,Ctrl_F6,AIS_F2,AIS_F3,AIS_F4,AIS_F5,AIS_F6,Ctrl_F7
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000082929,139.0,110.0,7.0,219.0,28.0,60.0,12.0,251.0,4.0,284.0,...,70.0,13.0,33.0,14.0,503.0,179.0,22.0,153.0,123.0,5.0
ENSG00000122043,8.0,11.0,10.0,971.0,47.0,13.0,19.0,21.0,172.0,273.0,...,59.0,266.0,57.0,22.0,32.0,10.0,421.0,510.0,255.0,12.0
ENSG00000124915,4.0,24.0,206.0,1010.0,403.0,138.0,7.0,21.0,187.0,13.0,...,54.0,23.0,1012.0,675.0,14.0,10.0,319.0,395.0,88.0,4.0
ENSG00000125514,270.0,289.0,399.0,368.0,468.0,272.0,491.0,707.0,88.0,669.0,...,126.0,594.0,383.0,557.0,662.0,320.0,68.0,18.0,482.0,178.0
ENSG00000125804,10.0,250.0,206.0,886.0,32.0,613.0,1548.5,25.0,37.0,832.0,...,87.0,1360.0,1756.0,764.0,588.0,241.0,1074.0,268.0,139.0,131.0


In [43]:
#add pseudo reference sample (row wise geometric mean)
counts['pseudo_reference_sample'] = counts.apply(lambda x: np.exp(np.log(x).mean()), axis=1)
counts.head()

Unnamed: 0_level_0,Ctrl_F1,Ctrl_F2,AIS_M1,Ctrl_M1,AIS_M2,Ctrl_M2,Ctrl_M3,AIS_M3,AIS_M4,AIS_M5,...,AIS_F1,Ctrl_F5,Ctrl_F6,AIS_F2,AIS_F3,AIS_F4,AIS_F5,AIS_F6,Ctrl_F7,pseudo_reference_sample
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000082929,139.0,110.0,7.0,219.0,28.0,60.0,12.0,251.0,4.0,284.0,...,13.0,33.0,14.0,503.0,179.0,22.0,153.0,123.0,5.0,75.445251
ENSG00000122043,8.0,11.0,10.0,971.0,47.0,13.0,19.0,21.0,172.0,273.0,...,266.0,57.0,22.0,32.0,10.0,421.0,510.0,255.0,12.0,63.951647
ENSG00000124915,4.0,24.0,206.0,1010.0,403.0,138.0,7.0,21.0,187.0,13.0,...,23.0,1012.0,675.0,14.0,10.0,319.0,395.0,88.0,4.0,105.044315
ENSG00000125514,270.0,289.0,399.0,368.0,468.0,272.0,491.0,707.0,88.0,669.0,...,594.0,383.0,557.0,662.0,320.0,68.0,18.0,482.0,178.0,208.74247
ENSG00000125804,10.0,250.0,206.0,886.0,32.0,613.0,1548.5,25.0,37.0,832.0,...,1360.0,1756.0,764.0,588.0,241.0,1074.0,268.0,139.0,131.0,279.798042


In [48]:
#calulate ratio of each sample to reference
ratios = counts.div(counts['pseudo_reference_sample'], axis=0).iloc[:, :-1]
ratios.head()

Unnamed: 0_level_0,Ctrl_F1,Ctrl_F2,AIS_M1,Ctrl_M1,AIS_M2,Ctrl_M2,Ctrl_M3,AIS_M3,AIS_M4,AIS_M5,...,Ctrl_M5,AIS_F1,Ctrl_F5,Ctrl_F6,AIS_F2,AIS_F3,AIS_F4,AIS_F5,AIS_F6,Ctrl_F7
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000082929,1.842396,1.458011,0.092783,2.902767,0.37113,0.795279,0.159056,3.326916,0.053019,3.764319,...,0.927825,0.17231,0.437403,0.185565,6.667086,2.372581,0.291602,2.027961,1.630321,0.066273
ENSG00000122043,0.125095,0.172005,0.156368,15.183346,0.73493,0.203279,0.297099,0.328373,2.689532,4.26885,...,0.922572,4.159392,0.891298,0.34401,0.500378,0.156368,6.583099,7.974775,3.987388,0.187642
ENSG00000124915,0.038079,0.228475,1.961077,9.61499,3.836476,1.313731,0.066639,0.199916,1.780201,0.123757,...,0.514069,0.218955,9.634029,6.425859,0.133277,0.095198,3.036814,3.760318,0.837742,0.038079
ENSG00000125514,1.29346,1.384481,1.911446,1.762938,2.241997,1.303041,2.352181,3.386949,0.421572,3.204906,...,0.603615,2.845612,1.834797,2.66836,3.171372,1.532989,0.32576,0.086231,2.309065,0.852725
ENSG00000125804,0.03574,0.893502,0.736245,3.16657,0.114368,2.190866,5.534349,0.08935,0.132238,2.973573,...,0.310939,4.860649,6.275955,2.730541,2.101516,0.861336,3.838483,0.957834,0.496787,0.468195


In [49]:
#find the median of ratio for each sample to use as normalization factor
medians = ratios.median(axis=0)
medians

Ctrl_F1    0.645953
Ctrl_F2    0.807942
AIS_M1     0.784621
Ctrl_M1    1.812835
AIS_M2     1.600587
Ctrl_M2    0.786983
Ctrl_M3    1.315733
AIS_M3     1.413179
AIS_M4     0.851589
AIS_M5     1.332906
AIS_M6     0.642996
Ctrl_M4    1.178797
AIS_M7     0.796924
Ctrl_F3    2.369873
AIS_M8     1.197753
AIS_M9     1.499815
Ctrl_F4    1.709899
AIS_M10    1.270794
Ctrl_M5    0.399699
AIS_F1     1.609493
Ctrl_F5    1.859238
Ctrl_F6    1.299120
AIS_F2     1.258875
AIS_F3     0.813604
AIS_F4     1.278082
AIS_F5     2.252808
AIS_F6     0.824923
Ctrl_F7    0.511354
dtype: float64

In [50]:
#divide counts by normalization factor
normalized_counts = counts.div(medians, axis=1)
normalized_counts.head()

Unnamed: 0_level_0,AIS_F1,AIS_F2,AIS_F3,AIS_F4,AIS_F5,AIS_F6,AIS_M1,AIS_M10,AIS_M2,AIS_M3,...,Ctrl_F4,Ctrl_F5,Ctrl_F6,Ctrl_F7,Ctrl_M1,Ctrl_M2,Ctrl_M3,Ctrl_M4,Ctrl_M5,pseudo_reference_sample
Gene_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000082929,8.077076,399.562944,220.008617,17.213296,67.915247,149.104875,8.921504,485.523159,17.493582,177.613778,...,14.620747,17.749208,10.776524,9.777955,120.805253,76.240528,9.120391,278.249797,175.131985,
ENSG00000122043,165.269401,25.419511,12.290984,329.399892,226.384158,309.119864,12.745006,17.312009,29.364227,14.860117,...,394.760176,30.657723,16.934537,23.467093,535.625118,16.518781,14.440619,13.573161,147.611244,
ENSG00000124915,14.290211,11.121036,12.290984,249.592792,175.33675,106.676659,262.547121,297.451789,251.782626,14.860117,...,175.448967,544.309054,519.582396,7.822364,557.138382,175.353213,5.320228,504.751919,135.101817,
ENSG00000125514,369.060241,525.866141,393.311494,53.204733,7.990029,584.297154,508.525734,11.803642,292.392726,500.2906,...,230.422977,205.998387,428.751696,348.095209,202.996955,345.623725,373.176007,166.27122,315.237572,
ENSG00000125804,844.986411,467.083521,296.212719,840.321814,118.962656,168.500632,262.547121,166.824813,19.992665,17.690615,...,390.666367,944.473023,588.090297,256.182429,488.737234,778.924056,1176.910483,1236.005958,217.664038,
