# getting a gene expression matrix from all samples

In [19]:
# load libraries
import pandas as pd
import numpy as np

import os

In [20]:
# Sample sheet describing each downloaded file (tab-separated, first row is the header).
sample_info_file = pd.read_csv("gdc_sample_sheet.tsv", header=0, sep="\t")
# os.listdir() returns entries in arbitrary order; sort so that the order recorded in
# file_order.txt and the column order of the final matrix are reproducible across
# machines and re-runs.
exp_files = sorted(i for i in os.listdir() if i.endswith("star_gene_counts.tsv"))

In [21]:
# Show which count files were picked up, then record their order on disk
# (one file name per line, no trailing newline).
print(exp_files)
file_order_text = "\n".join(exp_files)
with open("file_order.txt", mode="w", encoding="utf-8", newline="\n") as order_handle:
  order_handle.write(file_order_text)

['00a26384-1b1c-4db4-9664-75fb9b3febdb.rna_seq.augmented_star_gene_counts.tsv', '01661d94-fc16-4456-95cf-a5fa4e1e196c.rna_seq.augmented_star_gene_counts.tsv', '01900aab-4c12-4198-b41f-ff638e5abe3b.rna_seq.augmented_star_gene_counts.tsv', '01ae5613-1c58-4d02-8fd3-47d35d71c22d.rna_seq.augmented_star_gene_counts.tsv', '01f6286b-8422-4f16-bec3-871c86396790.rna_seq.augmented_star_gene_counts.tsv', '021d9e06-9d27-400c-8776-08e89c817b46.rna_seq.augmented_star_gene_counts.tsv', '02423743-43b4-494b-acd9-1cd3bcd3d395.rna_seq.augmented_star_gene_counts.tsv', '02a4ccea-1464-4664-bc44-bb6569a34608.rna_seq.augmented_star_gene_counts.tsv', '02a50d1b-ff62-4936-a0ce-ac0e22a5b41c.rna_seq.augmented_star_gene_counts.tsv', '02bbe1b1-44ff-426b-a40e-47a01bc4cf7d.rna_seq.augmented_star_gene_counts.tsv', '032e5aa7-8028-48ad-8fe4-6a5cc6aee5cb.rna_seq.augmented_star_gene_counts.tsv', '034f8d01-f664-494b-bc04-77485000611f.rna_seq.augmented_star_gene_counts.tsv', '0361ec21-e83f-40a9-b825-d92487b1239a.rna_seq.augme

In [22]:
# Peek at the first rows of the sample sheet to see which columns are available.
sample_info_file.head(n=5)

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
0,6abbf9ed-e3bd-489e-a67f-2624d427c81a,70f93c7b-27fa-47e4-a602-106bc35732d6.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-CSHL-0078-C25,HCM-CSHL-0078-C25-01A,Primary Tumor
1,1a2bf971-f722-403d-aa30-4882c164a300,7836ae1b-35ea-4273-b9ed-2bb72f1b3d06.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-CSHL-0246-C19,HCM-CSHL-0246-C19-01B,Primary Tumor
2,2290a1cd-7183-46d6-a509-37fa7d8b19a9,9d1c711e-5a41-48c0-af11-ef2e94c0b1e3.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0110-C25,HCM-BROD-0110-C25-01A,Primary Tumor
3,0e95c8a8-8bc3-4016-be71-acdc9411a369,24ee1455-865e-4bf0-a7ce-de2d0de3465f.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-CSHL-0238-C18,HCM-CSHL-0238-C18-01B,Primary Tumor
4,f24cdde4-9bd2-4cd8-ab7f-7d20776f063e,e453ca55-fabb-4b24-87b9-364dae2beda9.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-CSHL-0058-C34,HCM-CSHL-0058-C34-01B,Primary Tumor


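From the sample info file, the `Sample ID` column is the intended identifier for each sample. Note: the cells below currently label each column with the UUID prefix of its counts file name, not with the `Sample ID`.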

In [29]:
def getCounts(filename, sample_info, expressionFiles):
  """Read one gene-counts TSV and return its "unstranded" counts keyed by gene name.

  The file is parsed with the second line as the header (``header=1``), and the
  first four data rows are dropped before the values are extracted.

  Parameters
  ----------
  filename : str
    Path to the counts file. The sample label is the part of the file's base
    name before the first ".".
  sample_info : any
    Unused; kept so existing callers keep working.
  expressionFiles : any
    Unused; kept so existing callers keep working.

  Returns
  -------
  tuple[pandas.Series, str]
    ``(counts_series, sampleName)`` where ``counts_series`` holds the
    "unstranded" column indexed by the "gene_name" column.

  Raises
  ------
  ValueError
    If the file has no "unstranded" column. Previously a bare string was
    returned here, which made the caller's two-value unpacking fail with an
    unrelated "too many values to unpack" error.
  """
  df = pd.read_csv(filename, header=1, sep="\t")
  if "unstranded" not in df.columns:
    raise ValueError(f"column not found for {filename}")

  # Skip the first four rows — presumably STAR's N_* summary rows
  # (N_unmapped, N_multimapping, ...); TODO confirm against the input files.
  gene_rows = df.iloc[4:]
  raw_counts = gene_rows["unstranded"].values
  # NOTE(review): gene names may not be unique; "gene_id" is available in the
  # same file if a unique row key is needed.
  genenames = gene_rows["gene_name"].values

  # Use the base name so a path with directories (e.g. "data/<uuid>.rna_seq...")
  # still yields just the leading identifier.
  sampleName = os.path.basename(filename).split(".")[0]
  counts_series = pd.Series(raw_counts, index=genenames)
  return counts_series, sampleName

In [30]:
# Read every counts file once, keeping the sample labels and the count vectors
# in two parallel lists (same order as exp_files).
dfKeys = []
dfdata = []
for fileid in exp_files:
  counts, sampleid = getCounts(filename=fileid, sample_info=sample_info_file, expressionFiles=exp_files)
  dfKeys.append(sampleid)
  dfdata.append(counts)

print(len(dfKeys), len(dfdata))
# The two lengths above are equal by construction. The check that matters is
# label uniqueness: the labels are used as dict keys afterwards, so a repeated
# label would silently overwrite an earlier sample's counts.
assert len(set(dfKeys)) == len(dfKeys), "duplicate sample labels derived from file names"


1223 1223


In [31]:
# Pair each sample label with its count vector (a later duplicate label replaces an earlier one).
df_dict = dict(zip(dfKeys, dfdata))

In [32]:
# Assemble the matrix: one column per sample label, rows aligned on the gene-name index.
final_df = pd.DataFrame(df_dict)
preview = final_df.head(n=5)
print(preview)

          00a26384-1b1c-4db4-9664-75fb9b3febdb  \
TSPAN6                                    3888   
TNMD                                         9   
DPM1                                      1458   
SCYL3                                     2135   
C1orf112                                   440   

          01661d94-fc16-4456-95cf-a5fa4e1e196c  \
TSPAN6                                    1372   
TNMD                                        27   
DPM1                                      2135   
SCYL3                                     2385   
C1orf112                                  1075   

          01900aab-4c12-4198-b41f-ff638e5abe3b  \
TSPAN6                                    4914   
TNMD                                        19   
DPM1                                      1507   
SCYL3                                     1404   
C1orf112                                   348   

          01ae5613-1c58-4d02-8fd3-47d35d71c22d  \
TSPAN6                                    5055 

In [33]:
# Persist the count matrix as tab-separated text; the gene-name index is written as the first column.
final_df.to_csv(path_or_buf="raw_counts.tsv", sep="\t", index=True)