In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

# Parent folders (relative paths) for data and figures
DATA_FOLDER = '../00_data'
FIGURE_FOLDER = '../04_figures'

# This notebook reads/writes in subfolders named after the notebook itself
notebook_name = '002_sum_counts_with_same_gene_symbol'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

print('Data folder for notebook:', data_folder)
print('Figure folder for notebook:', figure_folder)

# Create both output folders (including missing parents). Pure Python instead
# of shelling out to `mkdir -p`, so paths containing spaces and non-POSIX
# platforms work too; exist_ok=True keeps the cell safe to re-run.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)

Data folder for notebook: ../00_data/002_sum_counts_with_same_gene_symbol
Figure folder for notebook: ../04_figures/002_sum_counts_with_same_gene_symbol


In [4]:
input_folder = '/Users/olgabot/'
! ls $input_folder

[1m[36mApplications[m[m                        [1m[36manaconda3[m[m
[1m[36mCreative Cloud Files[m[m                [1m[36mbin[m[m
[1m[36mDesktop[m[m                             [1m[36mcode[m[m
[1m[36mDocuments[m[m                           [1m[36mdocker[m[m
[1m[36mDownloads[m[m                           draft.txt
GSE24565_family.soft                fibroblast.txt
GSE24565_family.soft.gz             [1m[36mgdrive_test[m[m
[1m[36mGoogle Drive[m[m                        [1m[36mgenomes[m[m
Group1.matrix.txt                   [35mgoogledrive[m[m
Group2.matrix.txt                   java_error_in_pycharm_308.log
Group3.matrix.txt                   my_cool_file.txt
Group4.matrix.txt                   [1m[36mnotebooks[m[m
Group5.matrix.txt                   [1m[36molgabot.github.io-source-deprecated[m[m
Group8.matrix.txt                   [1m[36mprojects[m[m
[1m[36mLibrary[m[m                             [1m[36mp

In [5]:
# Group IDs to process: 1 through 5, plus 8
group_numbers = (*range(1, 6), 8)

In [6]:
%%time

for n in group_numbers:
    print(f'--- Group #{n} ---')
    group_txt = f'Group{n}.matrix.txt'
    filename = os.path.join(input_folder, group_txt)
    
    print('\t--- Time to read uncompressed csv: ---')
    %time df = pd.read_table(filename, header=None, index_col=[0, 1])
    
    # Sum counts for genes with the same gene symbol
    # Symbols are the second (1th) level of the rows (axis=0)
    print("Before after summing counts with same gene symbol", df.shape)
    print('\t--- Time to groupby and sum: ---')
    %time df = df.groupby(level=1, axis=0).sum()
    print("Before after summing counts with same gene symbol", df.shape)
    
    # Add "cNNNN" for the cell number
    df.columns = ['group{n}_{i}'.format(i=str(i).zfill(4), n=n) 
                  for i in range(len(df.columns))]
    
    csv = os.path.join(data_folder, group_txt.replace('.txt', '.csv.gz'))
    print('\t--- Time to write gzipped csv: ---')
    %time df.to_csv(csv, compression='gzip', index_label=False)
    print(f'\tWrote {csv}')
#     dfs.append(df)

# counts = pd.concat(dfs, axis=1)
# counts = counts.fillna(0)
# print(counts.shape)

# counts.head()

--- Group #1 ---
	--- Time to read uncompressed csv: ---
CPU times: user 5.7 s, sys: 705 ms, total: 6.41 s
Wall time: 6.42 s
Before after summing counts with same gene symbol (60725, 1000)
	--- Time to groupby and sum: ---
CPU times: user 3.14 s, sys: 2.22 s, total: 5.36 s
Wall time: 5.37 s
Before after summing counts with same gene symbol (58828, 1000)
	--- Time to write gzipped csv: ---
CPU times: user 58.2 s, sys: 119 ms, total: 58.3 s
Wall time: 58.4 s
	Wrote ../00_data/002_sum_counts_with_same_gene_symbol/Group1.matrix.csv.gz
--- Group #2 ---
	--- Time to read uncompressed csv: ---
CPU times: user 5.66 s, sys: 692 ms, total: 6.35 s
Wall time: 6.37 s
Before after summing counts with same gene symbol (60725, 1000)
	--- Time to groupby and sum: ---
CPU times: user 3.22 s, sys: 2.3 s, total: 5.52 s
Wall time: 5.53 s
Before after summing counts with same gene symbol (58828, 1000)
	--- Time to write gzipped csv: ---
CPU times: user 60 s, sys: 180 ms, total: 1min
Wall time: 1min
	Wrote .

In [7]:
ls -lha $data_folder

total 65616
drwxr-xr-x  8 olgabot  staff   272B Sep 29 14:27 [1m[36m.[m[m/
drwxr-xr-x  9 olgabot  staff   306B Sep 29 14:38 [1m[36m..[m[m/
-rw-r--r--  1 olgabot  staff   2.6M Sep 29 14:56 Group1.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   2.6M Sep 29 14:57 Group2.matrix.csv.gz
-rw-r--r--  1 olgabot  staff    12M Sep 29 15:05 Group3.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   8.0M Sep 29 15:10 Group4.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   4.3M Sep 29 15:12 Group5.matrix.csv.gz
-rw-r--r--  1 olgabot  staff   2.0M Sep 29 15:13 Group8.matrix.csv.gz
