# Tutorial

## Prepare Inputs

In [1]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by any library
# used below; consider narrowing the filter if you need to debug.
warnings.filterwarnings("ignore")

### BCRs input file, BCR_original_emb file, and rna input file
1. Ensure that each file includes an index column labeled "barcode". This column serves as a unique identifier for each cell.
2. Verify that the cells are aligned in the same order across all three files.
3. The BCR file must include the following columns: "fwr1", "cdr1", "cdr2", "fwr2", "cdr3", "fwr3", and "fwr4".

In [2]:
# Load the BCR table and the gene-expression table, both indexed by cell barcode.
bcr = pd.read_csv("exampledata/example_bcr.csv", index_col="barcode")
rna = pd.read_csv("exampledata/example_rna.csv", index_col="barcode")

# Both tables must list the same cells in the same order (see the requirements above).
# `assert` is a statement, not a function: no call-style parentheses, and a message
# is attached so a failure explains what is wrong.
assert bcr.index.equals(rna.index), (
    "BCR and RNA files must contain the same barcodes in the same order"
)

In [3]:
# Preview the first rows of the BCR table (head() shows five rows by default).
bcr.head()

Unnamed: 0_level_0,contig_id,is_cell,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,...,umis,raw_clonotype_id,raw_consensus_id,exact_subclonotype_id,sample,label,whole_seq,new_whole_seq,v_identity,SHM
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGCAATAGCAA-1_06,AAACCTGCAATAGCAA-1_contig_1_06,True,True,619,IGH,IGHV2-5,,IGHJ4,IGHG1,True,...,5,clonotype455,clonotype455_consensus_1,1,6,spikepositive,QITLKESGPTLVKPTQTLTLTCTFSGFSLTSGMGVGWIRQPPGKAL...,06:QITLKESGPTLVKPTQTLTLTCTFSGFSLTSGMGVGWIRQPPG...,0.03754,0.061453
AAACCTGCACAACTGT-1_06,AAACCTGCACAACTGT-1_contig_2_06,True,True,537,IGH,IGHV1-24,,IGHJ4,IGHM,True,...,52,clonotype278,clonotype278_consensus_1,1,6,spikepositive,QVQLVQSGAEVKKPGASVKVSCKVSGYTLTELSMHWVRQAPGKGLE...,06:QVQLVQSGAEVKKPGASVKVSCKVSGYTLTELSMHWVRQAPGK...,0.0,0.008596
AAACCTGCAGCCTGTG-1_06,AAACCTGCAGCCTGTG-1_contig_1_06,True,True,656,IGH,IGHV4-59,IGHD3-22,IGHJ4,IGHG1,True,...,8,clonotype1394,clonotype1394_consensus_1,1,6,spikepositive,QVQLQESGPGLVKPSETLSLICTVSGGSISSYYWSWIRQPAGKGLE...,06:QVQLQESGPGLVKPSETLSLICTVSGGSISSYYWSWIRQPAGK...,0.02397,0.066489
AAACCTGCAGTCAGCC-1_06,AAACCTGCAGTCAGCC-1_contig_2_06,True,True,670,IGH,IGHV1-2,,IGHJ5,IGHG1,True,...,8,clonotype969,clonotype969_consensus_1,1,6,spikepositive,QVQLVQSGAEVKNIGVSVKVSCKASGYTFTDYYIHWVRQAPGQGLE...,06:QVQLVQSGAEVKNIGVSVKVSCKASGYTFTDYYIHWVRQAPGQ...,0.06081,0.095368
AAACCTGGTTCCTCCA-1_06,AAACCTGGTTCCTCCA-1_contig_2_06,True,True,692,IGH,IGHV1-18,IGHD3-22,IGHJ4,IGHG1,True,...,21,clonotype1466,clonotype1466_consensus_1,1,6,spikepositive,QVQLVQSGDEVKKPGASVKVSCEASGYTFISYGIAWVRQAPGQGLE...,06:QVQLVQSGDEVKKPGASVKVSCEASGYTFISYGIAWVRQAPGQ...,0.11576,0.139594


In [4]:
# Preview the first rows of the gene-expression table (one column per gene).
rna.head()

Unnamed: 0_level_0,IGJ,IGLL5,RP11.685N3.1,RP11.731F5.2,CH17.132F21.1,HIST1H4C,UBE2C,DERL3,TUBA1B,MKI67,...,AZI2,HHLA3,BFSP2.AS1,PIK3CA,TARSL2,DIS3,CTD.2256P15.2,ZNF283,FAHD2B,RP11.485G4.2
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGCAATAGCAA-1_06,0.345539,0.4116,-0.086132,-0.151236,0.03459,0.737681,0.000468,0.122183,1.349319,0.002903,...,0.771746,-0.000238,0.000177,0.016492,0.660869,-0.059796,0.012625,0.007151,0.014972,0.026571
AAACCTGCACAACTGT-1_06,0.117444,0.985152,-0.210269,-0.231358,-0.074831,1.304759,-0.003087,0.564784,0.796385,0.000614,...,0.126555,-0.057113,3.6e-05,0.127062,0.248703,0.093876,0.005728,0.025548,0.009291,-0.002509
AAACCTGCAGCCTGTG-1_06,0.118185,0.304202,1.553921,-0.071504,-0.15244,1.007265,-0.005297,0.122008,1.350721,0.001795,...,0.008371,-0.001826,0.00012,-0.070642,-0.085906,0.005892,-0.018601,0.030213,-0.042815,-0.001975
AAACCTGCAGTCAGCC-1_06,0.12323,1.773176,-0.150341,-0.035437,0.003177,0.666422,0.046367,0.030498,1.141658,0.000546,...,0.069885,-0.079806,4.1e-05,0.015928,0.099969,0.084648,0.014874,-0.017749,0.041447,0.007477
AAACCTGGTTCCTCCA-1_06,0.1626,0.300888,-0.164492,0.755047,-0.039643,0.022945,0.017033,0.154383,0.842697,0.007529,...,0.025873,0.010463,-0.009411,0.5731,-0.028001,0.097279,0.011172,0.013083,0.161455,-0.006934


### Generate original BCR embeddings

First, we download the pre-trained BCR encoder


In [5]:
from CoMBCR.utils import download_BCRencoder

# Fetch the pre-trained BCR encoder files; the output below reports the local
# path they were saved to.
download_BCRencoder()

Fetching 8 files: 100%|██████████| 8/8 [00:28<00:00,  3.57s/it]

Download Finished. Path /mnt/d/CoMBCR/CoMBCR/BCRencoder





Please clone or download "runberta.py" from this GitHub repository. This file is used to measure the original distances between BCRs. We recommend using our default pre-trained encoder, though any encoder can be used to encode BCRs.

Here we directly use the original BCR embeddings provided in the `exampledata` folder.

In [6]:
# Load the original BCR embeddings shipped with the example data, indexed by barcode.
bcrori = pd.read_csv("exampledata/example_bcrori.csv", index_col="barcode")
# Preview the first rows; each row holds one cell's embedding vector
# (the output below shows columns 0 through 1023).
bcrori.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGCAATAGCAA-1_06,-0.324237,-0.22824,-0.24107,0.951063,-0.688552,-0.024392,-0.455126,-0.142796,0.07106,0.872651,...,0.433799,-0.089641,0.199088,-0.619232,0.433234,-0.507768,0.094147,0.622088,-0.438514,-0.548302
AAACCTGCACAACTGT-1_06,-0.836004,-0.281383,-1.062309,0.253088,-0.613961,0.173742,-0.924163,0.29576,0.215318,0.514034,...,0.113509,0.117395,-0.115959,-0.211876,0.793687,-0.66687,-0.068236,0.275671,0.094568,-0.396323
AAACCTGCAGCCTGTG-1_06,-0.388252,-0.081444,-0.394865,1.480731,-0.288159,-0.132324,-0.263037,-0.209172,0.423102,0.569923,...,-0.231537,-0.353619,0.118815,-1.144104,0.398278,0.067934,-0.181059,0.760884,-0.498049,-0.671089
AAACCTGCAGTCAGCC-1_06,-0.912942,0.000863,0.077432,0.949241,0.020408,0.136745,-0.283928,-0.212797,0.105468,0.71184,...,-0.58794,0.107624,-0.320966,-1.219826,0.783442,0.099214,0.109777,0.096048,-0.420305,-1.142159
AAACCTGGTTCCTCCA-1_06,-0.079208,-0.45523,-0.97858,1.132275,-0.574163,0.258197,-0.157592,-0.350995,0.127573,0.343525,...,0.351599,-0.057017,0.099804,-1.488126,0.658145,0.03958,-0.006186,-0.1961,-0.56417,-0.376476


In [7]:
# The original-embedding table must follow the same barcode order as the BCR table.
# `assert` is a statement, not a function: no call-style parentheses, and a message
# is attached so a failure explains what is wrong.
assert bcr.index.equals(bcrori.index), (
    "BCR and original-embedding files must contain the same barcodes in the same order"
)

### Run CoMBCR

## steps
`CoMBCR_main` accepts the following parameters:
1. bcrpath: (Required) The path to the BCR sequences file.
2. rnapath:  (Required) The path to the gene expression file.
3. bcroriginal: (Required) The path to bcr original embedding file.
4. outdir:  (Required) The directory where the best checkpoint file and the output embeddings will be stored.
5. checkpoint: default is "best_network.pth". This parameter specifies the name of the file where the best model checkpoint will be saved.
6. lr: default is 1e-6.
7. lam: default is 1e-1, the inner parameter (Parameter alpha in paper).
8. batch_size: default 256.
9. epochs: default 200.
10. patience: default 15, the patience for early stopping.
11. lr_step: default [40,100]. These are the milestones for the MultiStepLR scheduler, which adjusts the learning rate at the specified epochs.
12. encoderprofile_in_dim: default 5000. Adjust this parameter if the number of input genes differs from 5000.
13. separatebatch: The default is False. If set to True, BCRs from different samples will be treated as distinct BCRs. Ensure that your BCR input file contains a "sample" column if you choose to enable this option.

The code below returns numpy arrays for the BCR embeddings and the gex embeddings. It also writes "bcrembedding.csv" and "gexembedding.csv" under the outdir you designated.

If CUDA raises an error, this is due to a conflict with the previously loaded model. Please restart the Jupyter notebook and run the cell below directly.

In [2]:
from CoMBCR.CoMBCR import CoMBCR_main

# Train CoMBCR on the example data and collect the BCR and gene-expression embeddings.
# epochs=1 and batch_size=32 override the documented defaults (200 and 256),
# presumably to keep this demo run short; use larger values for real analyses.
bcremb, gexemb = CoMBCR_main(
    bcrpath="exampledata/example_bcr.csv",
    rnapath="exampledata/example_rna.csv",
    bcroriginal="exampledata/example_bcrori.csv",
    outdir="example_outdir",
    epochs=1,
    batch_size=32,
    encoderprofile_in_dim=5000,
)

learning rate is  1e-06
Adjusting learning rate of group 0 to 1.0000e-06.
Adjusting learning rate of group 0 to 1.0000e-06.
Epoch:[0/1]	loss:7.12308	loss_cmc:3.473290	loss_p2p:3.732409	loss_b2b:0.003556


In [11]:
# Shape of the returned BCR embedding array; presumably (n_cells, embedding_dim).
bcremb.shape

(1000, 256)

In [12]:
# Shape of the returned gene-expression embedding array; presumably (n_cells, embedding_dim).
gexemb.shape

(1000, 256)

Read the output files "bcrembedding.csv" and "gexembedding.csv" located in the designated output directory (the cell below loads them from "example_outdir/Embeddings/bcrembeddings" and "example_outdir/Embeddings/gexembeddings"; adjust the paths to match the files actually written by your run). These CSV files store the numpy arrays directly and therefore do not include a "barcode" column, so do not specify an index column when reading them.

In [13]:
# Reload the saved embeddings from disk. No index_col is passed because the files
# hold only the embedding values (there is no "barcode" column).
# NOTE: this rebinds bcremb/gexemb from the arrays returned by CoMBCR_main to
# pandas DataFrames; the cells below rely on the DataFrame versions.
bcremb = pd.read_csv("example_outdir/Embeddings/bcrembeddings")
gexemb = pd.read_csv("example_outdir/Embeddings/gexembeddings")

In [14]:
# Preview the first rows of the reloaded BCR embeddings (default integer index).
bcremb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0.015348,-0.08786,-0.013923,0.05635,-0.0171,0.068572,0.005855,-0.050495,-0.005957,0.008949,...,-0.136085,-0.076334,-0.08386,-0.018186,-0.109076,-0.055655,0.039805,-0.069356,-0.029027,-0.031129
1,0.124641,0.031098,0.027163,0.138613,-0.001733,0.038861,0.051714,0.011094,-0.049876,0.009269,...,-0.10601,0.002925,-0.024259,-0.017597,-0.042338,-0.005912,0.011963,-0.059456,0.017165,0.023257
2,0.035745,-0.049108,-0.037657,0.123449,0.000613,0.05564,0.086152,0.093559,0.029612,-0.020621,...,-0.182125,0.02351,-0.041843,-0.049026,-0.042913,0.086104,-0.060631,-0.022182,0.026433,0.007512
3,0.087479,0.025641,-0.032866,0.033859,0.027408,-0.01279,0.118991,0.037054,-0.002722,-0.013472,...,-0.134776,0.024128,0.035573,-0.098892,0.000892,0.075088,0.008346,-0.024887,-0.018854,0.046138
4,0.013984,-0.057683,-0.067793,0.080526,-0.009838,0.049828,0.068106,0.017483,0.001821,0.027285,...,-0.127777,-0.004428,-0.013432,-0.056396,-0.009095,0.052927,-0.116035,0.022259,-0.001118,-0.01425


In [15]:
# Preview the first rows of the reloaded gene-expression embeddings (default integer index).
gexemb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-0.048509,-0.05078,0.012092,0.054801,-0.077096,-0.071795,0.120492,0.048243,-0.091755,-0.037499,...,0.030512,-0.069922,-0.017588,-0.057541,-0.033826,-0.059658,-0.00702,-0.018299,0.034806,0.07738
1,0.008329,-0.065424,0.061591,-0.017872,-0.059715,-0.04078,0.160052,-0.033229,-0.061964,-0.026953,...,-0.013495,-0.101155,0.022401,-0.036999,-0.039105,-0.100446,-0.006467,-0.010011,0.036884,0.08987
2,-0.035611,-0.070885,0.032202,0.049993,-0.043414,-0.038092,0.121762,0.008126,-0.090482,-0.002106,...,0.000275,-0.075901,0.017932,-0.014056,-0.033765,-0.051641,-0.003662,-0.027082,0.036087,0.060277
3,-0.063172,-0.066526,0.0509,0.066959,-0.041829,-0.060583,0.122334,-0.033201,-0.063927,0.005277,...,-0.014794,-0.117377,0.009713,-0.03763,-0.051727,-0.059814,-0.00152,-0.034314,0.050281,0.05894
4,-0.012943,-0.077301,-0.010864,0.059,-0.085983,-0.026742,0.13318,0.004001,-0.066243,-0.045583,...,-0.045505,-0.119614,0.013246,-0.047752,-0.039822,-0.06007,0.023413,0.028433,0.026463,0.055106
