In [4]:
import argparse
import logging
import os
import sys
import re
import csv
import pandas as pd
from collections import OrderedDict

def initialize_logger(logfile, args):
  """Configure and return the 'enclone_to_clonotype' logger.

  Records at INFO level and above are sent to a console StreamHandler and
  to `logfile`, which is opened in "w" mode and therefore truncated on
  every call.

  Handlers attached by an earlier call are removed and closed first, so
  calling this function repeatedly (for example when a notebook cell is
  re-run) neither duplicates every log line nor leaks open file handles.

  Args:
    logfile: Path of the log file to (over)write.
    args: Unused; kept so that existing callers keep working.

  Returns:
    The configured logging.Logger instance.
  """
  logger = logging.getLogger('enclone_to_clonotype')
  loglevel = logging.INFO
  logger.setLevel(loglevel)

  # logging.getLogger returns the same object for the same name, so handlers
  # accumulate across calls unless they are dropped explicitly.
  for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

  formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)-8s - %(message)s')

  # console handler at INFO level
  console_handler = logging.StreamHandler()
  console_handler.setLevel(loglevel)
  console_handler.setFormatter(formatter)
  logger.addHandler(console_handler)

  # file handler at INFO level; "w" truncates any existing log file
  file_handler = logging.FileHandler(logfile, "w")
  file_handler.setLevel(loglevel)
  file_handler.setFormatter(formatter)
  logger.addHandler(file_handler)

  return logger

def getValidFilename(s):
  """Return `s` converted to a string that is safe to use as a file name.

  The value is stringified, stripped of surrounding whitespace, has its
  spaces replaced by underscores, and finally loses every character that is
  not a word character, a hyphen or a dot.
  """
  candidate = str(s).strip()
  candidate = candidate.replace(' ', '_')
  # Anything outside [-\w.] is removed rather than substituted.
  return re.sub(r'(?u)[^-\w.]', '', candidate)
  
def runCommand(command, logger):
  """Log `command`, run it through the shell and return its exit status.

  Args:
    command: Shell command line, passed unchanged to os.system.
      NOTE(review): the string is executed by the shell as-is, so it must
      not be assembled from untrusted input.
    logger: Object providing `info` and `error` methods (for example a
      logging.Logger).

  Returns:
    The value returned by os.system; 0 means success. A non-zero status is
    also reported through `logger.error`. Callers may ignore the result.
  """
  logger.info("run : " + command )
  status = os.system(command)
  if status != 0:
    # Surface failures instead of silently discarding the exit status.
    logger.error("command failed with status %s : %s" % (status, command))
  return status
  
def check_file(filename, parser):
  """Abort the program unless `filename` is an existing regular file.

  When the file is missing, an error message is printed, the help text of
  `parser` is shown via `parser.print_help()` and the process exits with
  status 1. When the file exists the function simply returns.
  """
  if os.path.isfile(filename):
    return

  print("error: file not exists: " + filename)
  parser.print_help()
  sys.exit(1)

def read_file_map(fileName):
  """Read a two-column, tab-separated file into an ordered name -> path map.

  Each non-blank line has the form `<filepath>\\t<name>`. Surrounding
  whitespace is stripped from the line and from the file path. Blank lines
  (such as a trailing empty line at the end of the file) are skipped.

  Args:
    fileName: Path of the tab-separated mapping file.

  Returns:
    An OrderedDict mapping each name to its file path, in file order. When
    a name occurs more than once the last path wins.

  Raises:
    ValueError: If a non-blank line does not contain a tab.
  """
  result = OrderedDict()
  with open(fileName) as fh:
    for line in fh:
      line = line.strip()
      if not line:
        # An empty line cannot be unpacked into (filepath, name).
        continue
      filepath, name = line.split('\t', 1)
      result[name] = filepath.strip()
  return result

def sortByCells(val):
  """Sort key: return the value stored under the 'n' key of `val`."""
  cell_count = val['n']
  return cell_count

def _read_cdr3_chain_map(chain_file):
  """Read the tab-separated CDR3 -> chain table, skipping its header line."""
  cdr3map = {}
  with open(chain_file) as fin:
    fin.readline()  # first line is a header
    for line in fin:
      line = line.strip()
      if not line:
        # Blank lines (e.g. a trailing empty line) carry no mapping.
        continue
      cdr3, chain = line.split('\t', 1)
      cdr3map[cdr3] = chain
  return cdr3map


def _count_cells_per_sample(barcodes):
  """Count barcodes per sample; the sample is the text before the last '_'."""
  sample_map = {}
  for barcode in barcodes:
    m = re.search('(.+)_', barcode)
    if m is None:
      # No sample prefix on this barcode, so it cannot be attributed.
      continue
    sample = m.group(1)
    sample_map[sample] = sample_map.get(sample, 0) + 1
  return sample_map


def convert(enclone_file, chain_file, output_file, logger):
  """Convert an enclone CSV export into a clonotype summary CSV.

  Rows of `enclone_file` are ordered by descending cell count (column
  'n'; ties keep their input order) and numbered clonotype1, clonotype2,
  and so on. For every row the output holds the cell count, its proportion
  of the total cell count, the chain-prefixed CDR3 amino-acid and nucleotide
  sequences, one cell-count column per sample, the 'v_name1' / 'j_name1'
  values (written as TRBV / TRBJ) and the ';'-joined barcodes.

  Sample names are taken from the barcodes as the text before the last
  '_'. Barcodes without such a prefix are kept in the 'cells' column but
  do not contribute to any per-sample count.

  Args:
    enclone_file: CSV with at least the columns n, barcodes, cdr3_aa1,
      cdr3_aa2, cdr3_dna1, cdr3_dna2, v_name1 and j_name1.
    chain_file: Tab-separated file with a header line, mapping a CDR3
      amino-acid sequence (first column) to its chain (rest of the line).
    output_file: Path of the CSV file to write.
    logger: Logger used for progress messages.

  Raises:
    KeyError: If a required column is missing, or a non-empty CDR3
      amino-acid sequence is not present in `chain_file`.
    ValueError: If 'n' is not an integer or a non-blank chain line has no
      tab.
  """
  cdr3map = _read_cdr3_chain_map(chain_file)

  # newline="" lets the csv module handle line endings inside quoted fields.
  with open(enclone_file, "rt", newline="") as fin:
    rows = list(csv.DictReader(fin))

  for row in rows:
    row['n'] = int(row['n'])

  # Largest clonotypes first; list.sort is stable, so ties keep file order.
  rows.sort(key=lambda row: row['n'], reverse=True)

  # One {sample: cell count} dict per row, aligned with `rows`.
  sample_maps = [_count_cells_per_sample(row['barcodes'].split(',')) for row in rows]
  samples = sorted({sample for sample_map in sample_maps for sample in sample_map})
  logger.info("samples : " + ",".join(samples))

  total_cells = sum(row['n'] for row in rows)
  with open(output_file, "wt") as fout:
    fout.write("clonotype_id,frequency,proportion,cdr3s_aa,cdr3s_nt,%s,TRBV,TRBJ,cells\n" % ",".join(samples))
    for clono_index, (row, sample_map) in enumerate(zip(rows, sample_maps), start=1):
      trbv = row['v_name1']
      trbj = row['j_name1']
      chain_columns = (
        (row['cdr3_aa1'], row['cdr3_dna1']),
        (row['cdr3_aa2'], row['cdr3_dna2']),
      )
      aas = []
      dnas = []
      for aa, dna in chain_columns:
        if aa != "":
          chain = cdr3map[aa]
          aas.append(chain + ":" + aa)
          dnas.append(chain + ":" + dna)
      aas.sort()
      dnas.sort()
      # Commas would break the hand-written CSV row, so barcodes use ';'.
      barcodes = row['barcodes'].replace(",", ";")
      cell_counts = [str(sample_map.get(sample, 0)) for sample in samples]
      fout.write("clonotype%d,%s,%s,%s,%s,%s,%s,%s,%s\n" % (
        clono_index,
        row['n'],
        row['n']*1.0/total_cells,
        ";".join(aas),
        ";".join(dnas),
        ",".join(cell_counts),
        trbv,
        trbj,
        barcodes
        ))

  logger.info("done")


In [8]:

# Pipeline file locations for project 20220508_scRNA_3669.
# NOTE(review): `input` shadows the Python builtin of the same name; it is
# kept because later cells read it under this name.
input = "/data/h_gelbard_lab/projects/20220508_scRNA_3669/clonotype_2_enclone/result/AG3669.csv"
chain = "/data/h_gelbard_lab/projects/20220508_scRNA_3669/clonotype_1_merge/result/all_contig_annotations.json.cdr3"
output = "/data/h_gelbard_lab/projects/20220508_scRNA_3669/clonotype_4_immunarch/"
converted = "/data/h_gelbard_lab/projects/20220508_scRNA_3669/clonotype_3_convert/result/clonotypes.csv"


In [11]:
# Load the enclone export and preview its first two rows.
enclone = pd.read_csv(input)
enclone.head(2)

  enclone=pd.read_csv(input)


Unnamed: 0,n,datasets,origins,donors,entropy_cell,near,far,dref,dref_aa,ext,...,vj_aa_nl4,var_aa4,group_id,group_ncells,clonotype_id,clonotype_ncells,nchains,exact_subclonotype_id,barcodes,result_barcodes
0,14,result,s1,d1,0.00;0.00;0.00;0.00;0.00;0.00;0.00;0.00;0.00;0...,,,2,1,,...,,,1,37,1,37,3,1,"LM_ctrl_AAAGCAAAGCAATCTC-1,LM_ctrl_AAGGAGCAGGC...","LM_ctrl_AAAGCAAAGCAATCTC-1,LM_ctrl_AAGGAGCAGGC..."
1,12,result,s1,d1,0.00;0.00;0.00;0.00;0.00;0.00;0.00;0.00;0.00;0...,,,2,1,,...,,,1,37,1,37,3,2,"LM_ctrl_ACATACGCACCTGGTG-1,LM_ctrl_ATCATCTCATA...","LM_ctrl_ACATACGCACCTGGTG-1,LM_ctrl_ATCATCTCATA..."


In [14]:
# Barcodes of the first enclone row, split into a list.
enclone['barcodes'].iloc[0].split(',')

['LM_ctrl_AAAGCAAAGCAATCTC-1',
 'LM_ctrl_AAGGAGCAGGCGACAT-1',
 'LM_ctrl_ACTTTCAAGAAGAAGC-1',
 'LM_ctrl_CAGATCACAAATACAG-1',
 'LM_ctrl_CATGACAGTAAACACA-1',
 'LM_ctrl_CCGTACTAGTGGTCCC-1',
 'LM_ctrl_CGAATGTGTGGTCTCG-1',
 'LM_ctrl_GCTTGAACAGGAATCG-1',
 'LM_ctrl_GTACTTTCAAGCGAGT-1',
 'LM_ctrl_TCACGAAAGCATCATC-1',
 'LM_estrogen_CGGAGTCGTTCGCTAA-1',
 'LM_estrogen_GAACGGACAAGGCTCC-1',
 'LM_estrogen_GCTGCGATCACATACG-1',
 'LM_estrogen_GTTCGGGCACATGTGT-1']

In [10]:
# Load the converted clonotype table and preview its first two rows.
ct = pd.read_csv(converted)
ct.head(2)

Unnamed: 0,clonotype_id,frequency,proportion,cdr3s_aa,cdr3s_nt,KC_ctrl,KC_estrogen,LM_ctrl,LM_estrogen,MD_ctrl,MD_estrogen,TRBV,TRBJ,cells
0,clonotype1,27,0.007686,TRA:CAVSATPARLMF;TRB:CASRANDGQETQYF,TRA:TGTGCTGTGAGTGCCACACCGGCCAGACTCATGTTT;TRB:T...,0,0,20,7,0,0,TRBV6-5,TRBJ2-5,LM_ctrl_ACACCCTAGTTACGGG-1;LM_ctrl_ACCTTTAGTCC...
1,clonotype2,27,0.007686,TRB:CASRANDGQETQYF,TRB:TGTGCCAGCAGAGCCAACGACGGGCAAGAGACCCAGTACTTC,0,0,13,5,0,9,TRBV6-5,TRBJ2-5,LM_ctrl_AAAGTAGTCTTGAGAC-1;LM_ctrl_AAATGCCAGGC...


In [19]:
# Scratch exploration: for the first five enclone rows, group each row's
# barcodes by sample and pull out the gene / CDR3 columns of chain 1.
# The standard-library `re` imported at the top of the notebook is used
# here; rebinding `re` to another module in this cell would silently change
# the module used by the functions defined earlier.
# NOTE(review): the per-row values below are not stored anywhere yet; only
# the last row's sample_map is printed - confirm what this cell should emit.
df5 = enclone.head(5)

sample_map = {}
for index, row in df5.iterrows():
  # sample name -> list of barcodes with the sample prefix removed
  sample_map = {}
  for barcode in row['barcodes'].split(','):
    # Greedy match: sample is everything before the last '_'.
    m = re.search('(.+)_(.+)', barcode)
    if m is None:
      # Barcode without a sample prefix; nothing to group it under.
      continue
    sample_map.setdefault(m.group(1), []).append(m.group(2))

  cdr3_nt = row['cdr3_dna1']
  v_gene = row['v_name1']
  j_gene = row['j_name1']
  d_gene = row['d_name1']
  dna2 = row['cdr3_dna2']

  # Number of barcodes per sample, in sorted sample order.
  # NOTE(review): named `umis` in the draft, but this counts barcodes.
  umis = {sample: len(sample_map[sample]) for sample in sorted(sample_map)}

print(sample_map)


{'LM_ctrl': ['ACACCCTAGTTACGGG-1', 'ACCTTTAGTCCGACGT-1', 'ACTGCTCAGCCCAGCT-1', 'AGGGATGGTCGATTGT-1', 'AGTCTTTGTTTGACAC-1', 'ATAACGCTCCGTCATC-1', 'ATCCGAATCCACTCCA-1', 'ATTACTCTCTCGGACG-1', 'CAGCTAAGTCCATGAT-1', 'CCTCAGTAGCTTTGGT-1', 'GACGCGTGTTCATGGT-1', 'GACGTGCCACGCCAGT-1', 'GAGTCCGAGTTCCACA-1', 'GCCAAATAGATGTCGG-1', 'GCGACCAGTCTAGAGG-1', 'GTATCTTAGAATGTGT-1', 'GTGTTAGAGCAATATG-1', 'TAGCCGGTCTGATTCT-1', 'TCACGAAGTATGAATG-1', 'TGAGCATAGTACGATA-1'], 'LM_estrogen': ['CACACCTAGCCAGTTT-1', 'CACAGGCAGTGTACGG-1', 'CATGACATCTGTCAAG-1', 'CCAATCCAGATCTGCT-1', 'CGATGGCGTACGAAAT-1', 'CTGTGCTCATTCGACA-1', 'GCTGGGTGTTCGCTAA-1']}
