# **COUNT MATRIX CREATION**
## Per-sample gene count files are concatenated into a single count matrix

In [1]:
import os
import numpy as np
import pandas as pd
import glob
import sys
import csv
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib created a temporary cache directory at /tmp/jobs/ppascual/109233/matplotlib-e6a3c3ss because the default path (/home/jovyan/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


## Load metadata

## List gene count files

In [2]:
# Directory that holds the per-sample count tables (relative to the working directory)
path = 'star_out/counts'

# Every file in that directory whose name ends in "_counts.tab"
count_file_pattern = os.path.join(path, "*_counts.tab")
files = glob.glob(count_file_pattern)

In [3]:
# glob does not guarantee any ordering; sort the paths so the sample (column)
# order of everything built from this list is reproducible between runs
sorted_files = list(files)
sorted_files.sort()

In [4]:
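# Optional filter (currently disabled): retain only files listed in the metadata.
# NOTE: `meta` must be loaded before enabling this line; the "Load metadata" section above contains no code.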
#files=[f for f in files if f.split('/')[1] in meta['Filename_ID'].str.split('_R1').str[0].values]

## Count matrix: read and concatenate

In [5]:
# Build the gene x sample count matrix: read one count column per file,
# join the columns on gene, put the WT samples first, and save the result as CSV.
dfs = []
samples = []

if not sorted_files:
    # Fail early with an actionable message instead of pandas' generic
    # "No objects to concatenate" error from pd.concat([]).
    raise FileNotFoundError(
        f"No count files to concatenate: `sorted_files` is empty (looked in {path!r})"
    )

for filename in sorted_files:
    # skiprows=3 drops the first three lines and header=0 consumes the fourth as the
    # header row, so the first four lines of each file never become gene rows
    # (presumably STAR's N_* summary lines -- TODO confirm against the input files).
    df = pd.read_csv(filename, index_col=0, header=0, sep='\t', skiprows=3)
    df = df.iloc[:, 0]  # first column belongs to unstranded reads (sequencing specification)
    # Sample ID = file name up to '_counts'; basename() handles any path separator
    sample = os.path.basename(filename).split('_counts')[0]

    dfs.append(df)
    samples.append(sample)

# concat dfs column-wise, aligned on the gene index; keys= names each column after its sample
counts = pd.concat(dfs, axis=1, keys=samples)
# remove genes with 0 counts across all samples (disabled)
# counts = counts.loc[~(counts==0).all(axis=1)]
counts.index.name = 'Gene'
# a gene missing from some file comes out of the outer join as NaN; treat it as 0 counts
counts = counts.fillna(0)

# Specify the new column order: the listed WT samples first, the rest in their existing order
first_samples = ['WT0002', 'WT0003', 'WT0004']
idxs = first_samples + [col for col in counts.columns if col not in first_samples]
counts = counts[idxs]  # raises KeyError if any of first_samples is absent

# save df
counts.to_csv(os.path.join(path, 'cell_comp_bulk.csv'))
counts.head()

Unnamed: 0_level_0,WT0002,WT0003,WT0004,F0002,F0003,F0004,K0002,K0003,K0004,KF0482,...,WW0724,WW0962,WW0963,WW0964,WW1202,WW1203,WW1204,WW1442,WW1443,WW1444
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4933401J01Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm26206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Xkr4,4,9,7,9,11,4,11,13,14,90,...,331,145,129,113,21,35,22,11,10,14
Gm18956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm37180,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [7]:
# Sanity check: grand total of all counts in the matrix
# (sum each sample column first, then add the per-sample totals together)
per_sample_totals = counts.sum()
per_sample_totals.sum()

np.int64(3695039036)

In [6]:
# Minimal sample sheet: one row per count-matrix column, in matrix column order
sample_ids = counts.columns.values

# 'Condition' is the sample ID with its last character removed (e.g. 'WT0002' -> 'WT000')
meta = pd.DataFrame(
    {'Condition': [sample_id[:-1] for sample_id in sample_ids]},
    index=sample_ids,
)

meta.to_csv(path + '/metadata_cell_comp.csv')