In [14]:
# Input file for the conversion cell below, which reads it with csv.reader
# (comma-delimited, first row treated as a header, first column as the cell name).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
fname = "/Users/gc3045/laml2_experiments/tlsdata/TLS_097_unfiltered_cm_edited.txt"

Write the data into JSON format, following this example:

```
{
	"cell_BC": "AAACCCACACTACCGG-1",
	"cassettes": [
	{
		"cassette_id": 0,
		"int_BC": "AACTATTACATATA",
		"count_data": [
		{
			"allele": [None][None][None],
			"r1": 0,
			"r2": 0,
			"r3": 0,
			"umi_count": 183,
			"read_count": 1297
		}]
	},
	{
		"cassette_id": 1,
		"int_BC": "ACTTCTGTAATGTG",
		"count_data": [
		{
			"allele": [None][None][None],
			"r1": 0,
			"r2": 0,
			"r3": 0,
			"umi_count": 125,
			"read_count": 855
		}]
	},
		…..,		
	{
		"cassette_id": 9,
		"int_BC": ???,
		"count_data": []  # Missing data
	}]
}
{
	"cell_BC": "AAACCCAGTAGCACGA-1",
	"cassettes": [
	{
		"cassette_id": 0,
		"int_BC": "AACTATTACATATA",
		"count_data": [
		{
			"allele": [None][None][None],
			"r1": 0,
			"r2": 0,
			"r3": 0,
			"umi_count": 179,
			"read_count": 1193
		}]
	},
	{
		"cassette_id": 1,
		"int_BC": "ACTTCTGTAATGTG",
		"count_data": [
		{
			"allele": [None][None][None],
			"r1": 0,
			"r2": 0,
			"r3": 0,
			"umi_count": 138,
			"read_count": 900
		}]
	},
		…..,		
	{
		"cassette_id": 9,
		"int_BC": ???,
		"count_data": []  # Missing data
	}]
}
```

In [60]:
# write into json format
# fin = open(fname, 'r')
# site_names = fin.readline().strip().split(',')

In [16]:
import csv
import numpy as np
import json
import ast
import os

In [17]:
def check_and_ensure_brackets(s):
    """Make sure ``s`` is enclosed in one pair of square brackets.

    Returns a ``(text, was_bracketed)`` tuple:

    * if ``s`` already starts with '[' and ends with ']', it is returned
      unchanged together with ``True``;
    * otherwise every leading/trailing '[' or ']' character is stripped, the
      remainder is wrapped in a fresh '[' ... ']' pair, and the flag is ``False``.
    """
    already_bracketed = s.startswith('[') and s.endswith(']')
    if already_bracketed:
        return s, True

    # Strip stray brackets from both ends before re-wrapping so a half-open
    # value such as '[1, 2' does not end up with doubled brackets.
    inner_text = s.strip('[]')
    return f'[{inner_text}]', False


In [22]:
# Convert the comma-separated input file into a nested, JSON-serialisable structure:
#   {'cell_data': [{'cell_name': ...,
#                   'cassettes': [{'cassette_id': ..., 'count_data': [...]}, ...]}, ...]}
# Each count_data item holds the allele, one 'site_<j>' key per target site, and 'umi_count'.
result_json = {'cell_data': []}

# `with` closes the input file even if parsing fails part-way through;
# newline='' is what the csv module recommends for files handed to csv.reader.
with open(fname, 'r', newline='') as fin:
    reader = csv.reader(fin, delimiter=',')
    header = next(reader, None)  # first row is the header; kept only for inspection

    # Start at 1 so `r` matches the row number in the file (row 0 is the header).
    for r, row in enumerate(reader, start=1):
        if not row:
            # csv.reader yields [] for blank lines; there is no cell to record.
            continue

        cell_name = row[0]
        cassettes = row[1:]
        cell_json = {'cell_name': cell_name, 'cassettes': []}

        for cassette_id, cassette_dict in enumerate(cassettes):
            # Each field is the text of a Python literal; make sure it parses as a list.
            cassette_dict = check_and_ensure_brackets(cassette_dict)[0]
            cassette_state_list = ast.literal_eval(cassette_dict)

            count_data_list = []  # one dict per (cassette_state, count) entry
            for entry in cassette_state_list:
                cassette_state, counts = entry
                count_data_dict = {}
                # NOTE: JSON has no tuple type; json.dumps writes this out as an array.
                count_data_dict['allele'] = tuple(cassette_state)
                for target_site_idx, target_site_state in enumerate(cassette_state):
                    count_data_dict[f'site_{target_site_idx}'] = target_site_state
                count_data_dict['umi_count'] = counts

                count_data_list.append(count_data_dict)

            cassette_json = {"cassette_id": cassette_id,
                             "count_data": count_data_list
                            }
            cell_json['cassettes'].append(cassette_json)
        result_json['cell_data'].append(cell_json)
        # Uncomment to convert only the first few rows while debugging:
        # if r > 5:
        #     break

In [21]:
# Serialise the full converted structure to disk.
# The absolute path matches the one the load cell further down reads from, so the
# file is found regardless of the kernel's working directory (a later cell calls
# os.chdir, which would otherwise redirect a relative path on re-run).
json_string = json.dumps(result_json, indent=4)
with open("/Users/gc3045/laml2_experiments/proc_realdata/TLS_097_reformatted.json", "w") as f:
    f.write(json_string)

In [27]:
# Write only the first five cells to a small JSON file for quick testing.
# Note: the top-level JSON value here is a list of cells, not the
# {'cell_data': [...]} dict used for the full file.
test_json = result_json['cell_data'][0:5]
json_string = json.dumps(test_json, indent=4)
with open("/Users/gc3045/laml2_experiments/proc_realdata/TLS_097_test.json", "w") as f:
    f.write(json_string)

In [4]:
# Reload the converted data from disk; the cells below work from `data`
# (the parsed JSON) rather than from the in-memory `result_json`.
with open("/Users/gc3045/laml2_experiments/proc_realdata/TLS_097_reformatted.json", "r") as f:
    data = json.load(f)
    

In [5]:
# Number of cassettes recorded for the first cell.
len(data['cell_data'][0]['cassettes'])

10

In [6]:
# Cassette ids of the first cell, in stored order.
[x['cassette_id'] for x in data['cell_data'][0]['cassettes']]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
# First count entry of the first cassette of the first cell.
data['cell_data'][0]['cassettes'][0]['count_data'][0]

{'allele': [1, 0, 0], 'site_0': 1, 'site_1': 0, 'site_2': 0, 'umi_count': 1}

In [8]:
# Collect the per-target-site keys ('site_<j>') present in that first count entry;
# their number is the number of target sites per cassette.
site_keys = [key for key in data['cell_data'][0]['cassettes'][0]['count_data'][0].keys() if key.startswith('site_')]

site_keys

['site_0', 'site_1', 'site_2']

### Check whether the JSON can be read into an `AlleleTable`

In [43]:
# State value that marks missing data: it is added to every site's alphabet below,
# and a cassette state made up entirely of this value is left out of data_struct.
missing_char = -1

In [58]:
# Switch the working directory to the LAML checkout -- presumably so that the
# `laml_libs` package can be imported from it (confirm).
# NOTE(review): os.chdir also changes how every relative path in later cells resolves.
os.chdir('/Users/gc3045/scmail_v1/LAML/')
from laml_libs.Count_model.AlleleTable import AlleleTable
from laml_libs.Count_model.Alphabet import Alphabet

In [59]:
# Build the two structures passed to Alphabet / AlleleTable further down:
#   data_struct:          cell_name -> list (one item per cassette) of {cassette_state: umi_count}
#   alphabet_data_struct: list over cassettes -> list over target sites -> list of states seen
data_struct = {}
alphabet_dict = dict()  # cassette_id -> {'site_<j>': set of states seen at that site}

# K = cassettes per cell, J = target sites per cassette; both are read off the first cell.
first_cell_cassettes = data['cell_data'][0]['cassettes']
K = len(first_cell_cassettes)
# Take J from the first cassette that actually has a count entry, so an empty
# leading cassette does not raise IndexError.
first_entry = next(
    (cassette['count_data'][0] for cassette in first_cell_cassettes if cassette['count_data']),
    {},
)
J = sum(1 for key in first_entry if key.startswith('site_'))
print("K:", K, "J:", J)

# A cassette state in which every site is missing is not recorded in data_struct.
# The tuple is loop-invariant, so build it once.
all_missing_state = tuple([missing_char] * J)

for cell_data in data['cell_data']:
    cell_name = cell_data['cell_name']
    data_struct[cell_name] = []

    for cassette in cell_data['cassettes']:
        cassette_dict = dict()
        cassette_id = cassette['cassette_id']
        site_alphabets = alphabet_dict.setdefault(cassette_id, dict())

        for cassette_entry in cassette['count_data']:
            cassette_state = tuple(cassette_entry['allele'])
            umi_count = cassette_entry['umi_count']

            # Every site's alphabet starts with 0 and the missing marker,
            # then grows with each state observed at that site.
            for key, state in cassette_entry.items():
                if key.startswith('site_'):
                    site_alphabets.setdefault(key, {0, missing_char}).add(state)

            # Fully-missing cassettes are reported as an empty dictionary.
            if cassette_state != all_missing_state:
                cassette_dict[cassette_state] = umi_count

        data_struct[cell_name].append(cassette_dict)

    # NOTE(review): this stops after the FIRST cell, so data_struct and the alphabet
    # only reflect that one cell. Kept as-is because the cells below were run in this
    # state; remove the break to process every cell.
    break

alphabet_data_struct = []
for cassette_index in alphabet_dict:
    new_cassette_list = []
    for site_index in range(J):
        # A cassette with no count entries never registered its sites; fall back to
        # the base alphabet instead of raising KeyError.
        site_states = alphabet_dict[cassette_index].get(f'site_{site_index}', {0, missing_char})
        new_cassette_list.append(list(site_states))
    alphabet_data_struct.append(new_cassette_list)
    
# list of K cassettes
# each cassette is a list of J target sites
# each target site is a list of possible states at this target site


K: 10 J: 3


In [60]:
# Inspect the per-cell count structure (one dict per cassette).
data_struct

{'AAACCCACACTACCGG-1': [{(1, 0, 0): 1,
   (2, 1, 0): 1,
   (0, 2, 0): 1,
   (0, 0, 0): 183},
  {(0, 0, 0): 125},
  {(1, 0, 0): 1, (2, 0, 0): 1, (0, 0, 1): 1, (0, 0, 0): 116},
  {(1, 1, 0): 1, (0, 0, 0): 125},
  {(1, 1, 1): 1, (0, 2, 0): 1, (0, 3, 0): 1, (0, 0, 0): 195},
  {(1, 1, 0): 1, (0, 1, 0): 201},
  {(1, 0, 0): 1, (0, 0, 0): 164},
  {},
  {},
  {}]}

In [61]:
# Inspect the states collected per cassette and per site.
alphabet_dict

{0: {'site_0': {-1, 0, 1, 2}, 'site_1': {-1, 0, 1, 2}, 'site_2': {-1, 0}},
 1: {'site_0': {-1, 0}, 'site_1': {-1, 0}, 'site_2': {-1, 0}},
 2: {'site_0': {-1, 0, 1, 2}, 'site_1': {-1, 0}, 'site_2': {-1, 0, 1}},
 3: {'site_0': {-1, 0, 1}, 'site_1': {-1, 0, 1}, 'site_2': {-1, 0}},
 4: {'site_0': {-1, 0, 1}, 'site_1': {-1, 0, 1, 2, 3}, 'site_2': {-1, 0, 1}},
 5: {'site_0': {-1, 0, 1}, 'site_1': {-1, 0, 1}, 'site_2': {-1, 0}},
 6: {'site_0': {-1, 0, 1}, 'site_1': {-1, 0}, 'site_2': {-1, 0}},
 7: {'site_0': {-1, 0}, 'site_1': {-1, 0}, 'site_2': {-1, 0}},
 8: {'site_0': {-1, 0}, 'site_1': {-1, 0}, 'site_2': {-1, 0}},
 9: {'site_0': {-1, 0}, 'site_1': {-1, 0}, 'site_2': {-1, 0}}}

In [62]:
# Same information as alphabet_dict, as nested lists: cassettes -> sites -> states.
alphabet_data_struct

[[[0, 1, 2, -1], [0, 1, 2, -1], [0, -1]],
 [[0, -1], [0, -1], [0, -1]],
 [[0, 1, 2, -1], [0, -1], [0, 1, -1]],
 [[0, 1, -1], [0, 1, -1], [0, -1]],
 [[0, 1, -1], [0, 1, 2, 3, -1], [0, 1, -1]],
 [[0, 1, -1], [0, 1, -1], [0, -1]],
 [[0, 1, -1], [0, -1], [0, -1]],
 [[0, -1], [0, -1], [0, -1]],
 [[0, -1], [0, -1], [0, -1]],
 [[0, -1], [0, -1], [0, -1]]]

In [63]:
# Build the Alphabet from K cassettes, J sites per cassette, and the per-site state lists.
alphabet = Alphabet(K,J,alphabet_data_struct)

In [64]:
# Confirm the Alphabet object was constructed.
alphabet

<laml_libs.Count_model.Alphabet.Alphabet at 0x12fd13190>

In [65]:
# Wrap the per-cell counts and the alphabet in an AlleleTable.
DLT_data = AlleleTable(data_struct,alphabet)

In [93]:
# NOTE(review): get_M is defined in laml_libs (not shown here); presumably it returns
# the number of possible cassette states for cassette index 1 -- confirm.
DLT_data.alphabet.get_M(1)

8

In [81]:
# Counts stored on the AlleleTable for this cell (one dict per cassette).
DLT_data.data_struct['AAACCCACACTACCGG-1']

[{(1, 0, 0): 1, (2, 1, 0): 1, (0, 2, 0): 1, (0, 0, 0): 183},
 {(0, 0, 0): 125},
 {(1, 0, 0): 1, (2, 0, 0): 1, (0, 0, 1): 1, (0, 0, 0): 116},
 {(1, 1, 0): 1, (0, 0, 0): 125},
 {(1, 1, 1): 1, (0, 2, 0): 1, (0, 3, 0): 1, (0, 0, 0): 195},
 {(1, 1, 0): 1, (0, 1, 0): 201},
 {(1, 0, 0): 1, (0, 0, 0): 164},
 {},
 {},
 {}]

In [67]:
# Number of cassettes in the alphabet structure (expected to equal K).
len(alphabet_data_struct)

10

In [68]:
# Number of target sites in the first cassette (expected to equal J).
len(alphabet_data_struct[0])

3

In [82]:
import pickle

# Pickled inputs; per the outputs displayed below:
#   priors1: site index -> {state: prior value}
#   priors2: site index -> {state: indel string}
fname1 = "/Users/gc3045/laml2_experiments/tlsdata/TLS_097_unfiltered_cm_priors.pickle"
fname2 = "/Users/gc3045/laml2_experiments/tlsdata/TLS_097_unfiltered_cm_state2indel.pickle"

# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
# `with` closes both handles once loading is done (they were previously left open).
with open(fname1, "rb") as fin1, open(fname2, "rb") as fin2:
    priors1 = pickle.load(fin1)
    priors2 = pickle.load(fin2)


In [83]:
# Site indices present in priors1, and the {state: prior value} dict for site 0.
priors1.keys(), priors1[0]

(dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
 {1: 0.4285714285714285,
  2: 0.2,
  3: 0.9571428571428572,
  4: 0.9142857142857144,
  5: 0.0142857142857142,
  6: 0.1857142857142857,
  7: 0.9285714285714286,
  8: 0.3428571428571428,
  9: 0.0142857142857142,
  10: 0.9,
  11: 0.3142857142857143,
  12: 0.5714285714285714,
  13: 0.5428571428571428,
  14: 0.9428571428571428,
  15: 0.0285714285714285,
  16: 0.3,
  17: 0.1285714285714285,
  18: 0.5428571428571428,
  19: 0.2142857142857142,
  20: 0.7714285714285715,
  21: 0.8,
  22: 0.1,
  23: 0.0142857142857142,
  24: 0.9428571428571428,
  25: 0.0428571428571428,
  26: 0.2428571428571428,
  27: 0.9,
  28: 0.0857142857142857,
  29: 0.1142857142857142,
  30: 0.6428571428571429,
  31: 0.3571428571428571,
  32: 0.1142857142857142,
  33: 0.5714285714285714,
  34: 0.0285714285714285,
  35: 0.0857142857142857,
  36: 0.3857142857142857,
  37: 0.0714285714285714,
  38: 0.014285714

In [79]:
# Site indices present in priors2, and the {state: indel string} dict for site 0.
priors2.keys(), priors2[0]

(dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 {1: '[116:1D]',
  2: '[117:1I]A',
  3: '[104:1I]T',
  4: '[71:74D]',
  5: '[107:10D]',
  6: '[107:59D]',
  7: '[101:165D]',
  8: '[110:3D]',
  9: '[68:101D]',
  10: '[103:1D]',
  11: '[117:2I]AA',
  12: '[111:11D]',
  13: '[112:70D]',
  14: '[105:61D]',
  15: '[106:61D]',
  16: '[113:12D]',
  17: '[115:2D]',
  18: '[104:1I]C',
  19: '[111:58D]',
  20: '[113:10D]',
  21: '[99:139D]',
  22: '[106:11D]',
  23: '[114:1I]G',
  24: '[110:61D]',
  25: '[112:1I]G',
  26: '[114:1I]C',
  27: '[108:14D]',
  28: '[114:3D]',
  29: '[104:1I]T[117:1I]A',
  30: '[117:2I]CA',
  31: '[99:13D]',
  32: '[104:1D][116:12D]',
  33: '[113:4D]',
  34: '[105:14D]',
  35: '[112:13I]GCAAACATGCCCC',
  36: '[114:54D]',
  37: '[117:13I]CATGCCCCAAACA',
  38: '[117:14I]CATGCCCCAAAACA',
  39: '[114:48D]',
  40: '[116:57D]',
  41: '[109:2D]',
  42: '[106:12D]',
  43: '[116:55D]',
  44: '[117:2I]GG

In [None]:
# prior file needs to be a list of lists, corresponding to cassettes
# each cassette is a list of target sites
# each target site is a dictionary of possible states
# normalized to sum to 1

In [89]:
# Regroup the flat per-site prior table (priors1, keyed by a running site index)
# into the nested layout: list of K cassettes -> list of J target sites -> {state: prior}.
K = 10
J = 3
prior_txt_file = []
for i in range(K):
    cassette_list = []
    for j in range(J):
        # Sites are numbered cassette by cassette: cassette i, site j -> i*J + j.
        site_index = i*J + j
        # priors1 need not have an entry for every one of the K*J sites (the keys
        # displayed above stop at 28, i.e. 29 entries for 30 sites); a site without
        # an entry gets an empty dict instead of raising KeyError.
        target_site_dict = priors1.get(site_index, {})
        cassette_list.append(target_site_dict)
    prior_txt_file.append(cassette_list)

# NOTE(review): values are copied from priors1 unchanged. The notes say each site's
# dict should be normalized to sum to 1, but the values shown for site 0 clearly sum
# to more than 1 -- confirm whether normalization still has to be applied.
        
# for site_index in priors1.keys():
#     priors1[site_index]

In [90]:
# Inspect the regrouped priors (cassettes -> sites -> {state: prior}).
prior_txt_file

[[{1: 0.4285714285714285,
   2: 0.2,
   3: 0.9571428571428572,
   4: 0.9142857142857144,
   5: 0.0142857142857142,
   6: 0.1857142857142857,
   7: 0.9285714285714286,
   8: 0.3428571428571428,
   9: 0.0142857142857142,
   10: 0.9,
   11: 0.3142857142857143,
   12: 0.5714285714285714,
   13: 0.5428571428571428,
   14: 0.9428571428571428,
   15: 0.0285714285714285,
   16: 0.3,
   17: 0.1285714285714285,
   18: 0.5428571428571428,
   19: 0.2142857142857142,
   20: 0.7714285714285715,
   21: 0.8,
   22: 0.1,
   23: 0.0142857142857142,
   24: 0.9428571428571428,
   25: 0.0428571428571428,
   26: 0.2428571428571428,
   27: 0.9,
   28: 0.0857142857142857,
   29: 0.1142857142857142,
   30: 0.6428571428571429,
   31: 0.3571428571428571,
   32: 0.1142857142857142,
   33: 0.5714285714285714,
   34: 0.0285714285714285,
   35: 0.0857142857142857,
   36: 0.3857142857142857,
   37: 0.0714285714285714,
   38: 0.0142857142857142,
   39: 0.2714285714285714,
   40: 0.7428571428571429,
   41: 0.3714285714

In [91]:
# Write the regrouped priors to JSON.
# Note: JSON object keys are always strings, so the integer state keys are
# written as strings by json.dumps.
json_string = json.dumps(prior_txt_file, indent=4)
with open("/Users/gc3045/laml2_experiments/proc_realdata/TLS_097_priors.json", "w") as f:
    f.write(json_string)