In [1]:
def ConvertCsvToHdf5(filename,new_filename):
    """Convert a barcode-by-transcript counts CSV into an HDF5 file.

    The CSV must contain a 'barcode' column; every other column is treated as
    one transcript whose values are cast to int64 counts. A leftover
    'Unnamed: 0' column (a saved DataFrame index) is discarded when present.

    Three datasets are written to ``new_filename``:
      * "Barcodes"    - ASCII barcode of each row, shape (nrows,)
      * "Top_Counts"  - int64 counts, shape (nrows, ncols - 1)
      * "Transcripts" - ASCII name of each counts column, shape (ncols - 1,)

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    new_filename : str
        Path of the HDF5 file to create (opened in "w" mode, so an existing
        file is replaced). The name is used as given; the caller should make
        sure it ends in .hdf5.

    Returns
    -------
    list
        ``[nrows, ncols]`` where ``ncols`` still counts the 'barcode' column.

    Raises
    ------
    KeyError
        If the CSV has no 'barcode' column.
    """
    # Minimum fixed-width string sizes (in bytes) of the two label datasets.
    min_barcode_width = 16
    min_transcript_width = 24

    # Load the table. The axis is given by keyword (pandas 2 no longer accepts
    # it positionally) and errors='ignore' lets files that have no
    # 'Unnamed: 0' column (e.g. s1 / s2) go through the same code path.
    sample = pd.read_csv(filename)
    sample = sample.drop(columns=['Unnamed: 0'], errors='ignore')
    nrows, ncols = sample.shape

    # Barcodes (row titles), reduced to ASCII bytes for HDF5 storage.
    barcodes = sample['barcode'].tolist()
    ascii_barcodes = [n.encode("ascii", "ignore") for n in barcodes]

    # Select the counts by dropping 'barcode' by name instead of assuming it
    # is the first column; transcripts (column titles) and the counts matrix
    # then come from the same frame and cannot get out of step.
    counts = sample.drop(columns=['barcode'])
    ascii_transcripts = [n.encode("ascii", "ignore") for n in counts.columns]
    final_counts = counts.to_numpy(dtype=np.int64)

    # Widen the fixed-width string types when a label is longer than the
    # minimum, so long names are stored whole rather than silently truncated.
    barcode_width = max([min_barcode_width] + [len(b) for b in ascii_barcodes])
    transcript_width = max([min_transcript_width] + [len(t) for t in ascii_transcripts])

    # Write the HDF5 file. The context manager closes the file even when one
    # of the writes raises. 'S<n>' is the fixed-width bytes dtype; the older
    # 'a<n>' alias is not accepted by NumPy 2.
    with h5py.File(new_filename, "w") as f:
        dset1 = f.create_dataset("Barcodes", (nrows,), dtype='S%d' % barcode_width, chunks=True)
        dset2 = f.create_dataset("Top_Counts", (nrows, ncols - 1), dtype='int64', chunks=True)
        dset3 = f.create_dataset("Transcripts", (ncols - 1,), dtype='S%d' % transcript_width, chunks=True)

        dset1[...] = ascii_barcodes
        dset2[...] = final_counts
        dset3[...] = ascii_transcripts

    return [nrows, ncols]


In [2]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap.umap_ as umap
from pydpc import Cluster
%matplotlib inline
import scipy.io
import sys
import copy
from collections import Counter


In [4]:
# Location of the inflection-gated sample 1 table. The two directory parts
# are left blank here; Filename carries its own leading slash.
Working_Directory = ''  # directory
Post_Inflection_Gating = ''  # directory
Filename = '/s1_inflection_gated.csv'

# Full path = working directory + gating sub-directory + file name.
file_local = "".join((Working_Directory, Post_Inflection_Gating, Filename))

In [14]:
# Read the CSV at file_local, using its first column as the row index.
sample = pd.read_csv(file_local, index_col=0)
# Echo the type of the loaded object as the cell output.
type(sample)

pandas.core.frame.DataFrame

In [15]:
# Display the loaded table (the notebook renders the cell's last expression).
sample


Unnamed: 0,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,0610012D04Rik,...,snoU18,snoU2-30,snoU2_19,snoU83B,snoZ196,snoZ278,snoZ40,snoZ6,snosnR66,uc_338
bcEEHH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcAPUN,0,0,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
bcHCQI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcEFUV,0,0,0,0,1,0,2,0,3,0,...,0,0,0,0,0,0,0,0,0,0
bcDWMB,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcFKBI,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcDYXS,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
bcDXKU,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcHSSI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bcDEZE,0,0,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Capture the row labels as `barcodes` and the column labels as `transcripts`,
# both as plain Python lists.
barcodes, transcripts = sample.index.tolist(), sample.columns.tolist()

In [21]:
# Preview the table with rows and columns swapped; the result is not
# assigned, so `sample` itself is unchanged by this cell.
sample.transpose()

Unnamed: 0,bcEEHH,bcAPUN,bcHCQI,bcEFUV,bcDWMB,bcFKBI,bcDYXS,bcDXKU,bcHSSI,bcDEZE,...,bcHVUH,bcGPWG,bcARGE,bcHRXL,bcHEHC,bcALUL,bcBQLJ,bcBOPH,bcARYQ,bcBQNR
0610006L08Rik,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
0610009B22Rik,0,2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0610010F05Rik,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610010K14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610011F06Rik,0,0,0,3,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
0610012D04Rik,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Replace `sample` with its transpose.
# NOTE: not idempotent - running this cell a second time flips the table
# back to its previous orientation.
sample = sample.transpose()

In [34]:
# Save the current `sample` frame, row and column labels included. to_csv
# writes comma-separated text by default, despite the .txt extension.
sample.to_csv('sample1.txt')

In [24]:
# Display the current contents of `sample`.
sample

Unnamed: 0,bcEEHH,bcAPUN,bcHCQI,bcEFUV,bcDWMB,bcFKBI,bcDYXS,bcDXKU,bcHSSI,bcDEZE,...,bcHVUH,bcGPWG,bcARGE,bcHRXL,bcHEHC,bcALUL,bcBQLJ,bcBOPH,bcARYQ,bcBQNR
0610006L08Rik,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
0610009B22Rik,0,2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0610010F05Rik,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610010K14Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610011F06Rik,0,0,0,3,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
0610012D04Rik,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Dump only the numeric values (no row/column labels) as comma-separated
# text. savetxt's default format writes every number in '%.18e' notation.
values_only = sample.to_numpy()
np.savetxt('sample1-1.csv', values_only, delimiter=',')

In [31]:
# Write the entries of `transcripts` to genes.tsv, one per line, with no
# newline after the last entry.
with open('genes.tsv', 'w') as genes_out:
    genes_out.write("\n".join(map(str, transcripts)))

In [32]:
# Write the entries of `barcodes` to barcodes.tsv, one per line, with no
# newline after the last entry.
with open('barcodes.tsv', 'w') as barcodes_out:
    barcodes_out.write("\n".join(map(str, barcodes)))

In [46]:
# Location of the doublet-removed sample 6 table. The directory parts are
# left blank here; Filename carries its own leading slash.
Working_Directory = ''  # directory
Post_Inflection_Gating = ''  # directory
Filename = '/s6_doublet_removal.csv'
name = "".join((Working_Directory, Post_Inflection_Gating, Filename))

# Load with the first CSV column as the row index, swap rows and columns,
# and save the transposed table. This overwrites `sample`.
sample = pd.read_csv(name, index_col=0).transpose()
sample.to_csv('s6_transpose.csv')