In [18]:
import json
import os

import pandas as pd


# Column order of the TSV written by extract_metadata_from_ireceptor_json.
# Passing it explicitly guarantees a header row even when no rows are extracted.
_METADATA_COLUMNS = [
    'repertoire_id',
    'study_id',
    'subject_id',
    'sample_id',
    'ir_sequence_count',
    'study_group_description',
    'tissue_label',
    'cell_subset_label',
    'pcr_target_locus',
    'library_generation_method',
]


def _first_entry_value(entries, key):
    """Return ``entries[0][key]``, or None when the list, its first item or the key is missing."""
    if not entries:
        return None
    return (entries[0] or {}).get(key)


def _label_of(node):
    """Return ``node['label']``, or None when ``node`` is null or has no label."""
    return (node or {}).get('label')


def extract_metadata_from_ireceptor_json(json_file, output_file):
    """Flatten a repertoire-metadata JSON file into a tab-separated table.

    One row is written per (repertoire, sample) pair; repertoire-level fields
    are repeated on each of that repertoire's sample rows.

    Parameters
    ----------
    json_file : str or path-like
        Path to a JSON document with a top-level ``'Repertoire'`` list.
    output_file : str or path-like
        Destination of the TSV (written with a header and without the index).

    Raises
    ------
    KeyError
        If a repertoire lacks ``repertoire_id``, ``ir_sequence_count``,
        ``study.study_id``, ``subject.subject_id`` or ``sample``, or a sample
        lacks ``sample_id`` or ``library_generation_method``.

    Notes
    -----
    The descriptive nested fields (``subject.diagnosis``, ``sample.tissue``,
    ``sample.cell_subset``, ``sample.pcr_target``) may be absent, null or
    empty; the corresponding cell is then left empty instead of aborting the
    whole export.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    extracted_data = []
    for repertoire in data['Repertoire']:
        subject = repertoire['subject']
        # Fields shared by every sample of this repertoire.
        repertoire_fields = {
            'repertoire_id': repertoire['repertoire_id'],
            'study_id': repertoire['study']['study_id'],
            'subject_id': subject['subject_id'],
            'ir_sequence_count': repertoire['ir_sequence_count'],
            # Only the first diagnosis entry is used.
            'study_group_description': _first_entry_value(
                subject.get('diagnosis'), 'study_group_description'
            ),
        }
        for sample in repertoire['sample']:
            extracted_data.append({
                **repertoire_fields,
                'sample_id': sample['sample_id'],
                'tissue_label': _label_of(sample.get('tissue')),
                'cell_subset_label': _label_of(sample.get('cell_subset')),
                # Only the first PCR target entry is used.
                'pcr_target_locus': _first_entry_value(
                    sample.get('pcr_target'), 'pcr_target_locus'
                ),
                'library_generation_method': sample['library_generation_method'],
            })

    df = pd.DataFrame(extracted_data, columns=_METADATA_COLUMNS)
    df.to_csv(output_file, sep='\t', index=False)

In [19]:
## Extract metadata
# Flatten the JSON export into a TSV, then load that TSV back as a DataFrame.
metadata_json_path = '../data/ir_2025-03-18_rep_seq_metadata.json'
metadata_tsv_path = '../data/ir_2025-03-18_rep_seq_metadata_from_json.tsv'
extract_metadata_from_ireceptor_json(metadata_json_path, metadata_tsv_path)
extracted_metadata = pd.read_csv(metadata_tsv_path, sep='\t')

## Collection 2: naive CD4+ vs. memory CD4+

In [20]:
# TRB rows whose study group mentions 'Control' and that carry a cell-subset label.
# Rows with a missing value are dropped before each string match.
TRB = (
    extracted_metadata
    .loc[lambda df: df['pcr_target_locus'] == 'TRB']
    .dropna(subset=['study_group_description'])
    .loc[lambda df: df['study_group_description'].str.contains('Control')]
    .dropna(subset=['cell_subset_label'])
)

# Rows whose cell-subset label contains 'CD4' anywhere in the string.
TRB_CD4 = TRB.loc[lambda df: df['cell_subset_label'].str.contains('CD4')]

# Naive CD4 subset, ordered from the largest to the smallest sequence count.
TRB_CD4_naive = (
    TRB_CD4
    .loc[lambda df: df['cell_subset_label'].str.contains('naive')]
    .sort_values(by='ir_sequence_count', ascending=False)
)

# Up to five largest naive repertoires from the T1D study.
TRB_CD4_T1D = (
    TRB_CD4_naive
    .loc[lambda df: df['study_id'].str.contains('IR-T1D-000001')]
    .head(5)
)
TRB_CD4_T1D

Unnamed: 0,repertoire_id,study_id,subject_id,sample_id,ir_sequence_count,study_group_description,tissue_label,cell_subset_label,pcr_target_locus,library_generation_method
4658,6623f3e2229fedfad25d829e,IR-T1D-000001,CerosalettiLab0168,wb93354616,234786,Control (non-diabetic),blood,"naive thymus-derived CD4-positive, alpha-beta ...",TRB,PCR
4688,6623f3ec229fedfad25d82bc,IR-T1D-000001,CerosalettiLab0246,wb36503716,109203,Control (non-diabetic),blood,"naive thymus-derived CD4-positive, alpha-beta ...",TRB,PCR
4680,6623f3ea229fedfad25d82b4,IR-T1D-000001,CerosalettiLab0245,wb38769032,97911,Control (non-diabetic),blood,"naive thymus-derived CD4-positive, alpha-beta ...",TRB,PCR
4668,6623f3e6229fedfad25d82a8,IR-T1D-000001,CerosalettiLab0207,wb32419318,92942,Control (non-diabetic),blood,"naive thymus-derived CD4-positive, alpha-beta ...",TRB,PCR
4676,6623f3e8229fedfad25d82b0,IR-T1D-000001,CerosalettiLab0030,wb16703410,91914,Control (non-diabetic),blood,"naive thymus-derived CD4-positive, alpha-beta ...",TRB,PCR


In [21]:
# Memory CD4 subset of TRB_CD4, ordered from the largest to the smallest sequence count.
TRB_CD4_memory = (
    TRB_CD4
    .loc[lambda df: df['cell_subset_label'].str.contains('memory')]
    .sort_values(by='ir_sequence_count', ascending=False)
)

# Up to five largest memory repertoires from the T1D study.
TRB_CD4_memory_T1D = (
    TRB_CD4_memory
    .loc[lambda df: df['study_id'].str.contains('IR-T1D-000001')]
    .head(5)
)
TRB_CD4_memory_T1D

Unnamed: 0,repertoire_id,study_id,subject_id,sample_id,ir_sequence_count,study_group_description,tissue_label,cell_subset_label,pcr_target_locus,library_generation_method
4704,6623f3f2229fedfad25d82cc,IR-T1D-000001,CerosalettiLab0088,wb72887488,82335,Control (non-diabetic),blood,"central memory CD4-positive, alpha-beta T cell",TRB,PCR
4675,6623f3e8229fedfad25d82af,IR-T1D-000001,CerosalettiLab0030,wb16703410,80719,Control (non-diabetic),blood,"central memory CD4-positive, alpha-beta T cell",TRB,PCR
4692,6623f3ee229fedfad25d82c0,IR-T1D-000001,CerosalettiLab0246,wb98202936,78841,Control (non-diabetic),blood,"central memory CD4-positive, alpha-beta T cell",TRB,PCR
4695,6623f3ef229fedfad25d82c3,IR-T1D-000001,CerosalettiLab0247,wb88715424,76680,Control (non-diabetic),blood,"central memory CD4-positive, alpha-beta T cell",TRB,PCR
4657,6623f3e2229fedfad25d829d,IR-T1D-000001,CerosalettiLab0168,wb93354616,70069,Control (non-diabetic),blood,"central memory CD4-positive, alpha-beta T cell",TRB,PCR


In [22]:
# Collection 2 = selected naive rows followed by selected memory rows, saved as TSV.
TRB_CD4_naive_memory = pd.concat(objs=[TRB_CD4_T1D, TRB_CD4_memory_T1D], axis=0)
TRB_CD4_naive_memory.to_csv(
    '../data/collections/collection2.tsv',
    sep='\t',
    index=False,
)

## Collection 3: Pancreatic lymph node vs. Spleen tissue

In [23]:
# Rebuild TRB from the full metadata: TRB rows whose study group mentions
# 'No Diabetes' and that have both a tissue label and a cell-subset label.
TRB = (
    extracted_metadata
    .loc[lambda df: df['pcr_target_locus'] == 'TRB']
    .dropna(subset=['study_group_description'])
    .loc[lambda df: df['study_group_description'].str.contains('No Diabetes')]
    .dropna(subset=['tissue_label', 'cell_subset_label'])
)
TRB_CD4 = TRB.loc[lambda df: df['cell_subset_label'].str.contains('CD4')]

# Up to three largest CD4 repertoires sampled from pancreatic lymph node.
TRB_panLN = (
    TRB_CD4
    .loc[lambda df: df['tissue_label'].str.contains('pancreatic lymph node')]
    .sort_values(by='ir_sequence_count', ascending=False)
    .head(3)
)
TRB_panLN

Unnamed: 0,repertoire_id,study_id,subject_id,sample_id,ir_sequence_count,study_group_description,tissue_label,cell_subset_label,pcr_target_locus,library_generation_method
4295,64189b4a7c7dc3feb341f9cc,DOI:10.1172/JCI.insight.88242,6271,6271_pancreatic lymph node_CD4+CD127+ Tconv,80551,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR
4277,64189b457c7dc3feb341f9ba,DOI:10.1172/JCI.insight.88242,6254,6254_pancreatic lymph node_CD4+CD127+ Tconv,70450,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR
4213,64189b317c7dc3feb341f97a,DOI:10.1172/JCI.insight.88242,6279,6279_pancreatic lymph node_CD4+CD127+ Tconv,69047,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR


In [24]:
# Up to six largest CD4 repertoires sampled from spleen.
TRB_spleen = (
    TRB_CD4
    .loc[lambda df: df['tissue_label'].str.contains('spleen')]
    .sort_values(by='ir_sequence_count', ascending=False)
    .head(6)
)
TRB_spleen

Unnamed: 0,repertoire_id,study_id,subject_id,sample_id,ir_sequence_count,study_group_description,tissue_label,cell_subset_label,pcr_target_locus,library_generation_method
4227,64189b357c7dc3feb341f988,DOI:10.1172/JCI.insight.88242,6278,6278_spleen_CD4+CD127+ Tconv,114216,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR
4205,64189b2e7c7dc3feb341f972,DOI:10.1172/JCI.insight.88242,6289,6289_spleen_CD4+CD127+ Tconv,110037,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR
4310,64189b4f7c7dc3feb341f9db,DOI:10.1172/JCI.insight.88242,6288,6288_spleen_CD4+CD127+ Tconv,94184,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR
4261,64189b407c7dc3feb341f9aa,DOI:10.1172/JCI.insight.88242,6279,6279_spleen_CD4+CD127+ Tconv,79385,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR
4286,64189b487c7dc3feb341f9c3,DOI:10.1172/JCI.insight.88242,6271,6271_spleen_CD4+CD127+ Tconv,44325,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR
4296,64189b4b7c7dc3feb341f9cd,DOI:10.1172/JCI.insight.88242,6254,6254_spleen_CD4+CD127+ Tconv,39667,No Diabetes,spleen,"CD4-positive, alpha-beta T cell",TRB,PCR


In [25]:
# Collection 3 = pancreatic lymph node rows followed by spleen rows, saved as TSV.
TRB_panLN_spleen = pd.concat(objs=[TRB_panLN, TRB_spleen], axis=0)
TRB_panLN_spleen.to_csv(
    '../data/collections/collection3.tsv',
    sep='\t',
    index=False,
)

## Collection 4: CD4+ Tconv vs. CD4+ Treg vs. CD8+

In [26]:
# CD4+ Tconv: reuse the pancreatic lymph node selection made for collection 3.
TRB_CD4_panLN = TRB_panLN

# CD4+ Treg: up to five largest 'regulatory' repertoires from pancreatic lymph node.
TRB_CD4_treg = (
    TRB
    .loc[lambda df: df['cell_subset_label'].str.contains('regulatory')]
    .loc[lambda df: df['tissue_label'].str.contains('pancreatic lymph node')]
    .sort_values(by='ir_sequence_count', ascending=False)
    .head(5)
)

# CD8+: up to five largest 'CD8' repertoires from pancreatic lymph node.
TRB_CD8 = (
    TRB
    .loc[lambda df: df['cell_subset_label'].str.contains('CD8')]
    .loc[lambda df: df['tissue_label'].str.contains('pancreatic lymph node')]
    .sort_values(by='ir_sequence_count', ascending=False)
    .head(5)
)

In [27]:
# Collection 4 = Tconv rows, then Treg rows, then CD8 rows, saved as TSV.
collection4 = pd.concat(objs=[TRB_CD4_panLN, TRB_CD4_treg, TRB_CD8], axis=0)
collection4.to_csv(
    '../data/collections/collection4.tsv',
    sep='\t',
    index=False,
)

In [28]:
# Display the assembled collection 4 table (CD4+ Tconv, CD4+ Treg and CD8+ rows) for a visual check.
collection4

Unnamed: 0,repertoire_id,study_id,subject_id,sample_id,ir_sequence_count,study_group_description,tissue_label,cell_subset_label,pcr_target_locus,library_generation_method
4295,64189b4a7c7dc3feb341f9cc,DOI:10.1172/JCI.insight.88242,6271,6271_pancreatic lymph node_CD4+CD127+ Tconv,80551,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR
4277,64189b457c7dc3feb341f9ba,DOI:10.1172/JCI.insight.88242,6254,6254_pancreatic lymph node_CD4+CD127+ Tconv,70450,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR
4213,64189b317c7dc3feb341f97a,DOI:10.1172/JCI.insight.88242,6279,6279_pancreatic lymph node_CD4+CD127+ Tconv,69047,No Diabetes,pancreatic lymph node,"CD4-positive, alpha-beta T cell",TRB,PCR
4303,64189b4d7c7dc3feb341f9d4,DOI:10.1172/JCI.insight.88242,6174,6174_pancreatic lymph node_CD4+CD127–CD25+ Treg,128213,No Diabetes,pancreatic lymph node,regulatory T cell,TRB,PCR
4197,64189b2c7c7dc3feb341f96a,DOI:10.1172/JCI.insight.88242,6289,6289_pancreatic lymph node_CD4+CD127–CD25+ Treg,81171,No Diabetes,pancreatic lymph node,regulatory T cell,TRB,PCR
4290,64189b497c7dc3feb341f9c7,DOI:10.1172/JCI.insight.88242,6271,6271_pancreatic lymph node_CD4+CD127–CD25+ Treg,61689,No Diabetes,pancreatic lymph node,regulatory T cell,TRB,PCR
4222,64189b337c7dc3feb341f983,DOI:10.1172/JCI.insight.88242,6279,6279_pancreatic lymph node_CD4+CD127–CD25+ Treg,59894,No Diabetes,pancreatic lymph node,regulatory T cell,TRB,PCR
4289,64189b487c7dc3feb341f9c6,DOI:10.1172/JCI.insight.88242,6288,6288_pancreatic lymph node_CD4+CD127–CD25+ Treg,58610,No Diabetes,pancreatic lymph node,regulatory T cell,TRB,PCR
4313,64189b507c7dc3feb341f9de,DOI:10.1172/JCI.insight.88242,6174,6174_pancreatic lymph node_CD8+ T cell,66235,No Diabetes,pancreatic lymph node,"CD8-positive, alpha-beta T cell",TRB,PCR
4297,64189b4b7c7dc3feb341f9ce,DOI:10.1172/JCI.insight.88242,6271,6271_pancreatic lymph node_CD8+ T cell,65531,No Diabetes,pancreatic lymph node,"CD8-positive, alpha-beta T cell",TRB,PCR


## Check number of sequences

In [20]:
clean_data_dir = "../data/ihub_uploads"
# os.listdir returns every directory entry in arbitrary order. Keep only the
# TSV files (a stray hidden file or subdirectory would break read_csv) and
# sort them so the report below is reproducible from run to run.
files_to_clean = sorted(
    name for name in os.listdir(clean_data_dir) if name.endswith('.tsv')
)

# Per file: total row count, then row count after dropping exact duplicate rows.
# Equal numbers mean the file contains no duplicated rows.
for file in files_to_clean:
    data = pd.read_csv(os.path.join(clean_data_dir, file), sep='\t')
    print(f"{file}: {len(data)} {len(data.drop_duplicates())}")

memory_cd4_3.tsv: 69670 69670
spleen_1.tsv: 88441 88441
spleen_0.tsv: 94488 94488
memory_cd4_2.tsv: 61860 61860
memory_cd4_0.tsv: 58686 58686
spleen_2.tsv: 60257 60257
spleen_3.tsv: 77987 77987
memory_cd4_1.tsv: 64293 64293
memory_cd4_4.tsv: 60559 60559
naive_cd4_0.tsv: 198149 198149
pancreatic_lymph_node_2.tsv: 66126 66126
naive_cd4_1.tsv: 90025 90025
naive_cd4_3.tsv: 79377 79377
pancreatic_lymph_node_1.tsv: 59339 59339
pancreatic_lymph_node_0.tsv: 53523 53523
naive_cd4_2.tsv: 69364 69364
naive_cd4_4.tsv: 80464 80464
