# **COUNT MATRIX CREATION**
## Per-sample gene count files from the alignment step are concatenated into a single count matrix

In [1]:
import os
import numpy as np
from numpy import isneginf
import pandas as pd
import glob
import sys
import csv
import re

## Load metadata

In [2]:
# Sample sheet for the experiment; the preview below shows its columns
# (Sample, Condition, Replicate, Filename_ID).
meta = pd.read_csv(filepath_or_buffer='data/gastruloid_bulk_metadata.csv')
meta.head()

Unnamed: 0,Sample,Condition,Replicate,Filename_ID
0,0h_1,0h,1,A1_S2
1,0h_2,0h,2,A2_S8
2,0h_3,0h,3,A3_S9
3,0h-Epi_1,0h-Epi,1,A5_S29
4,0h-Epi_2,0h-Epi,2,A6_S30


## List gene counts files

In [3]:
# Root directory that is searched for gene count tables
path = 'star_out'

# Collect every '<name>_S<n>/<name>_ReadsPerGene.out.tab' table found one level below `path`
files = list(glob.iglob(os.path.join(path , "*_S*/*_ReadsPerGene.out.tab")))

In [5]:
# retain only files specified in metadata
# The library ID is the name of the directory that contains the count table.
# Taking it from the parent directory (instead of splitting on '/') keeps this
# working when `path` has several components or a different separator.
wanted_ids = set(meta['Filename_ID'].str.split('_R1').str[0])  # built once, not per file
files = [f for f in files if os.path.basename(os.path.dirname(f)) in wanted_ids]

## Sort files according to metadata

In [7]:
# set order using metadata
order = [x.split('_R1')[0] for x in meta['Filename_ID']]
# position of each library ID in the metadata; the first occurrence wins,
# which matches what list.index() would return
rank = {}
for position, file_id in enumerate(order):
    rank.setdefault(file_id, position)
# sort files by the metadata position of their parent directory name
# (a file whose directory is not listed in the metadata raises KeyError)
sorted_files = sorted(files, key=lambda f: rank[os.path.basename(os.path.dirname(f))])

## Count matrix: read and concatenate

In [8]:
# Map each library ID (Filename_ID up to any '_R1' suffix) to its sample label
# so that every column is named after the file it was actually read from,
# instead of relying on positional agreement with the metadata.
sample_by_id = dict(zip(meta['Filename_ID'].str.split('_R1').str[0], meta['Sample']))

if not sorted_files:
    raise ValueError('No *_ReadsPerGene.out.tab files to concatenate; check `path` and the metadata')

dfs = []
samples = []

for filename in sorted_files:
    # STAR's ReadsPerGene.out.tab starts with four summary rows (N_unmapped,
    # N_multimapping, N_noFeature, N_ambiguous) and has no header line, so skip
    # those rows explicitly rather than consuming one of them as a header.
    df = pd.read_csv(filename, sep='\t', header=None, index_col=0, skiprows=4)
    # first count column belongs to unstranded reads (sequencing specification)
    dfs.append(df.iloc[:, 0])
    samples.append(sample_by_id[os.path.basename(os.path.dirname(filename))])

# every sample listed in the metadata must have a count file
found = set(samples)
missing = [s for s in meta['Sample'] if s not in found]
if missing:
    raise ValueError('No count file found for sample(s): ' + ', '.join(map(str, missing)))

# concat dfs (one column per sample, aligned on gene ID)
counts = pd.concat(dfs, axis=1, ignore_index=True)
# a gene absent from one file counts as 0 there; fill before filtering so NaN
# cannot hide an otherwise all-zero gene, and keep counts as integers
counts = counts.fillna(0).astype('int64')
# remove genes with 0 counts across all samples
counts = counts.loc[(counts != 0).any(axis=1)]
# rename columns and index
counts.columns = pd.Index(samples, name='Sample')
counts.index.name = 'Gene'

# save df
counts.to_csv(os.path.join(path, 'gastruloid_bulk.csv'))
counts.head()

Sample,0h_1,0h_2,0h_3,0h-Epi_1,0h-Epi_2,0h-Epi_3,48h_1,48h_2,48h_3,48h-AF_1,...,120h-high-ActA_1,120h-high-ActA_2,120h-high-ActA_3,144h-low-ActA_1,144h-low-ActA_2,144h-low-ActA_3,144h-high-ActA_1,144h-high-ActA_2,144h-high-ActA_3,96h-3μMc-DMSO_1
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0,3,0,38,55,41,9,6,17,16,...,27,43,19,5,0,9,21,7,16,46
Gm18956,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,3,0,0
Gm37180,0,0,0,4,3,1,2,0,2,7,...,9,5,0,0,2,0,1,0,0,6
Gm37363,0,4,0,11,22,1,0,1,8,9,...,11,11,11,0,5,4,0,0,5,11
Gm37686,0,0,7,0,9,0,2,1,3,0,...,0,0,3,0,0,0,0,3,5,5
