In [40]:
import configparser
import dataclasses
import os
import re
import shutil
import statistics
from collections import Counter
from itertools import chain
from pathlib import Path
from time import perf_counter

import pandas as pd
from Bio import SeqIO, pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq
import logging
import xlsxwriter

import utils
from fns import *

pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


## START LOGGING 

In [2]:
Path('logs').mkdir(parents=True, exist_ok=True)
log = utils.make_logger("logs/rp_log")
log.info("\nSTART")
log.info("\nCHECKING CONFIGURATION FILE CONTENTS...")

## GET PARAMS FROM CONFIG FILE

In [3]:
# checking config file
# if any error occurs then program terminates; then check the config file
config = configparser.ConfigParser()
config.read_file(open('config.ini'))

abi_sequence_folder = checking_dirs(config['Paths']['abi_sequence_folder'], log, log_msg=True, create_dir=False)
vh_template_sequence_folder = checking_dirs(config['Paths']['vh_template_sequence_folder'], log, log_msg=True, create_dir=False)
vl_template_sequence_folder = checking_dirs(config['Paths']['vl_template_sequence_folder'], log, log_msg=True, create_dir=False)
results_dir = checking_dirs(config['Paths']['results_dir'], log, log_msg=True, create_dir=True)
h3_nt_data_sheet_filepath = check_files(config['Files']['h3_nt_data_sheet_filepath'], log)
df = check_tsv_file(h3_nt_data_sheet_filepath, log)
excel_path_file_name = create_results_excel_file_path(config['Paths']['results_dir'], config['Files']['output_excel_file_name'])

# patterns
pat_vh, pat_vl = get_patterns(config)
# patterns to remove from abi names
patrm_vh_abi, patrm_vl_abi = get_patterns_to_rm(config)
# patterns to remove from genbank names
patrm_vh_gb, patrm_vl_gb = get_patterns_to_rm_from_genbank(config)

# get alignment parameters
par_match, par_missmatch, par_open, par_extend, par_filter_thresh = get_alignment_params(config)

# get alignment start and end regions
#vh_seq_start_a, vh_seq_end_a, vl_seq_start_a, vl_seq_end_a = get_alignment_seq_start_and_end(config)

# creating dirs for copying matched abi files
res_dir_vh = checking_dirs(f"{results_dir}/{pat_vh}", log, log_msg=True, create_dir=True)
res_dir_vl = checking_dirs(f"{results_dir}/{pat_vl}", log, log_msg=True, create_dir=True)


## GET ABI FILES

In [4]:
log.info("\nCHECKING THE AB1 FILES...")
vh_abi_dict = {i.name.replace('.abi', '').replace(patrm_vh_abi, '') : str(i) for i in sorted([*Path(abi_sequence_folder).glob(f"*{pat_vh}*.abi")])}
vl_abi_dict = {i.name.replace('.abi', '').replace(patrm_vl_abi, '') : str(i) for i in sorted([*Path(abi_sequence_folder).glob(f"*{pat_vl}*.abi")])}

if len(vh_abi_dict) == len(vl_abi_dict):
    log.info(f"[+] INFO: There are {len(vh_abi_dict)} abi files in the abi sequence folder")
else:
    log.warning(f"[+] WARN: There is a differrence in number of abi files in vh: {len(vh_abi_dict)} and vl {len(vl_abi_dict)}")

In [5]:
sample_ids = checking_ab1_files(log, vh_abi_dict, vl_abi_dict)

In [6]:
log.info("\nCHECKING THE GENBANK FILES...")
vh_template_gb_dict =  {i.name.replace('.gb', '') : str(i) for i in sorted(Path(vh_template_sequence_folder).glob('*.gb'))}
vl_template_gb_dict =  {i.name.replace('.gb', '') : str(i) for i in sorted(Path(vl_template_sequence_folder).glob('*.gb'))}

if len(vh_template_gb_dict) == len(vl_template_gb_dict):
    log.info(f"[+] INFO: There are {len(vh_template_gb_dict)} genbank files in each dir vh and vl")
else:
    log.warning(f"[+] WARN: There is a differrence in number of genbank files in vh: {len(vh_template_gb_dict)} and vl {len(vl_template_gb_dict)}")

In [7]:
# get alignment start and end regions
vh_nts = extract_nts_from_start_and_end_from_genbank(vh_template_gb_dict, 6)
vl_nts = extract_nts_from_start_and_end_from_genbank(vl_template_gb_dict, 6)
vh_seq_start_a, vh_seq_end_a = vh_nts.start_nts.value_counts().nlargest(1).index[0], vh_nts.end_nts.value_counts().nlargest(1).index[0]
vl_seq_start_a, vl_seq_end_a = vl_nts.start_nts.value_counts().nlargest(1).index[0], vl_nts.end_nts.value_counts().nlargest(1).index[0]

## GET PROBE SEQS INTO A DICTIOANRY

In [8]:
# dictionary containing probe seq and name
h3_dict = df.set_index('name', drop=True).to_dict().get('h3_nt')

## FINDING THE PROBES THAT MATCH VH AND CORRESPONDING VL FILES

In [9]:
log.info(f"\nITERATING THROUGH EACH SAMPLE ID")
result_vh, result_vl = [], []
for sample in sample_ids:
    # print(f">>{sample}")
    _vhabi, _vlabi = get_abi_file_path(key=sample, vh_abi_dict=vh_abi_dict, vl_abi_dict=vl_abi_dict)
    _vh_d = get_seqobj_from_abi(_vhabi)  # returns a seq record obj of VH
    _vl_d = get_seqobj_from_abi(_vlabi)  # returns a seq record obj of VL
    
    # matching each probe on  VH and VL - normal and revcomp sequence
    vh_prob_search = find_match_on_all_h3probes_v3(log, h3_dict, _vh_d, _vl_d, sample, vh_abi_dict, vl_abi_dict)
    
    if len(vh_prob_search) >=1:
        result_vh.append(vh_prob_search)
log.info(f"\nFINISH ITERATING THROUGH EACH SAMPLE ID")
colnames=["Match","h3_name","sample_id","vh_abi_fp","vl_abi_fp","probe_seq", "vh_init_sr", "vl_inti_sr","vh_sr_seq_r", "vh_sr_trimmed", "vh_sr_tqlst", "vl_sr_seq_r", "vl_sr_trimmed", "vl_sr_tqlst"]
df_vh = pd.DataFrame(chain.from_iterable(result_vh))
df_vh.columns = colnames

In [10]:
log.info(f"[+] INFO: There are {df_vh.shape[0]} matches in df_vh")

## COPY PROBE MATCHED ABI FILES TO A NEW LOC

In [11]:
log.info(f"\nCOPY MATCHED ABI FILES INTO NEW LOCATION")
res_df_copy = copy_mtched_abi_files_to_resdir(log, res_dir_vh, res_dir_vl, df_vh, log_msg=True)
log.info(f"\nFINISH COPYING FILES")

## PAIRWISE ALIGNMENT OF GENBANK FILES WITH MATCHED ABI FILES TO GET THE MATCHING SCORE 
- Iterate through the VH (and VL) genbank file names [the file names are in a dict vh_template_gb_dict]
    - if the gb file name matches with h3_name column in df_vh or df_vl 
      - then save that into a list gb_match_with_dfvx_lst
          - GB_Matched|gb_id|h3_name|sample_id|gb_fp|vh_abi_fp|vl_abi_fp
- gb file name could match an h3_name; also this h3_name might be associated with different sample ids
```
[+] GENBANK_MATCHING:|genbank_id|h3_name|sample_id
[+] GENBANK_MATCHING:|VH-TMH577-hF-005-G08|TMH577-hF-005-G08|TMH577-hIgG1-014-A8_A08
[+] GENBANK_MATCHING:|VH-TMH577-hF-005-G08|TMH577-hF-005-G08|TMH577-hIgG1-014-C4_C04
```

In [12]:
log.info(f"\nFIND THE GB FILENASMES WITH H3 NAMES")
M_gb_abi_vh = find_gb_match_on_all_h3probes_v2(log, vh_template_gb_dict, df_vh, pattern=patrm_vh_gb, log_msg=True)
M_gb_abi_vl = find_gb_match_on_all_h3probes_v2(log, vl_template_gb_dict, df_vh, pattern=patrm_vl_gb, log_msg=True)

In [13]:
all_gb_match = pd.concat([M_gb_abi_vh, M_gb_abi_vl]).reset_index(drop=True)

In [14]:
log.info(f"\nALIGNEMNT BETWEEN THE MATCHED GB FILENASME AND ABI FILES")

In [15]:
vh_gb_abi_match_filtered = run_gb_alignment_and_filtering(M_gb_abi_vh, df_vh, vh_template_gb_dict, vh_seq_start_a, vh_seq_end_a, par_match, par_missmatch, par_open, par_extend, par_filter_thresh, log, log_msg=True, is_data_vl=False)

In [16]:
vl_gb_abi_match_filtered = run_gb_alignment_and_filtering(M_gb_abi_vl, df_vh, vl_template_gb_dict, vl_seq_start_a, vl_seq_end_a, par_match, par_missmatch, par_open, par_extend, par_filter_thresh, log, log_msg=True, is_data_vl=True)

In [17]:
log.info(f"\nMERGING DATAFRAMES")

In [18]:
vh_gb_abi_match_filtered.columns = [ 'VH_'+i  if i not in ['Orient','gbid','H3_name', 'sample_id', 'GB_FP'] else i  for i in vh_gb_abi_match_filtered.columns ]
vl_gb_abi_match_filtered.columns = [ 'VL_'+i  if i not in ['Orient','gbid','H3_name', 'sample_id', 'GB_FP'] else i  for i in vl_gb_abi_match_filtered.columns ]

In [27]:
vh_gb_abi_match_filtered.head()

Unnamed: 0,Orient,gbid,H3_name,sample_id,GB_FP,VH_ABI_FP,VH_Score,VH_Quality_score,VH_Low_quality
0,VH,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,782,60.792839,43
1,VH,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,842,61.114014,43
2,VH,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,842,61.114014,43
3,VH,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,812,60.70936,37
4,VH,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,812,60.70936,37


In [28]:
vl_gb_abi_match_filtered.head()

Unnamed: 0,Orient,gbid,H3_name,sample_id,GB_FP,VL_ABI_FP,VL_Score,VL_Quality_score,VL_Low_quality
0,VL,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,740,61.027027,42
1,VL,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,740,61.027027,42
2,VL,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,740,61.308108,47
3,VL,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,740,61.218919,49
4,VL,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,/Users/rp/Desktop/purge/ny-immuno-sanger/data0...,740,61.308108,47


In [19]:
df_sid_abi = pd.concat(
    [pd.DataFrame([ [i, vh_abi_dict.get(i).split("/")[-1], vl_abi_dict.get(i).split("/")[-1]] for i in vh_gb_abi_match_filtered.sample_id.to_list()], 
             columns=["sample_id", "VH", "VL"]),
    pd.DataFrame([ [i, vh_abi_dict.get(i).split("/")[-1], vl_abi_dict.get(i).split("/")[-1]] for i in vl_gb_abi_match_filtered.sample_id.to_list()], 
             columns=["sample_id", "VH", "VL"])], axis=0).drop_duplicates().reset_index(drop=True)

In [20]:
dfx1 = pd.merge(vh_gb_abi_match_filtered,df_sid_abi,on='sample_id')
dfx2 = pd.merge(vl_gb_abi_match_filtered,df_sid_abi,on='sample_id')
dfx1_ss = dfx1[['gbid','H3_name','sample_id', 'VH_Score', 'VH_Quality_score', 'VH_Low_quality', 'VH', 'VL']]
dfx2_ss = dfx2[['gbid','H3_name','sample_id', 'VL_Score', 'VL_Quality_score', 'VL_Low_quality', 'VH', 'VL']]
dfy = pd.merge(dfx1_ss, dfx2_ss, on="sample_id", how='inner')

In [29]:
dfx1_ss.head()

Unnamed: 0,gbid,H3_name,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH,VL
0,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,782,60.792839,43,TMH577-hIgG1-013-A4_A04_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A4_A04_VL79.abi
1,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
2,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,842,61.114014,43,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi
3,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,812,60.70936,37,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
4,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi


In [30]:
dfx2_ss.head()

Unnamed: 0,gbid,H3_name,sample_id,VL_Score,VL_Quality_score,VL_Low_quality,VH,VL
0,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
1,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,740,61.027027,42,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi
2,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,740,61.308108,47,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
3,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,740,61.218919,49,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
4,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,740,61.308108,47,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi


In [31]:
dfy.head()

Unnamed: 0,gbid_x,H3_name_x,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH_x,VL_x,gbid_y,H3_name_y,VL_Score,VL_Quality_score,VL_Low_quality,VH_y,VL_y
0,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
1,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,842,61.114014,43,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi
2,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,812,60.70936,37,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.308108,47,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
3,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,812,60.70936,37,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.218919,49,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
4,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.308108,47,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi


In [21]:
dfz = dfy.copy()
dfz["mean_error_prob"] = [cal_mean_error_prob(i[0], i[1]) for i in zip(dfy.VH_Low_quality.tolist(), dfy.VL_Low_quality.to_list())]

In [22]:
log.info(f"\nWRITE EXCEL FILE")

In [32]:
with pd.ExcelWriter('final_results.xlsx', engine='xlsxwriter') as writer:
    dfz.to_excel(writer, sheet_name='final_res_mean_error_prob', index=False)
    # df_vh.to_excel(writer, sheet_name='H3_probe_matched_VH_and_VL', index=False)
    # res_df_copy.to_excel(writer, sheet_name='VH and VL Copied Files', index=False)
    # M_gb_abi_vh.to_excel(writer, sheet_name='ID matched gb and vh abi', index=False)
    # vh_nts.to_excel(writer, sheet_name='vh_gb_start_end_nts', index=False)
    # vl_nts.to_excel(writer, sheet_name='vl_gb_start_end_nts', index=False)
    vh_gb_abi_match_filtered.to_excel(writer, sheet_name='VH results', index=False)
    vl_gb_abi_match_filtered.to_excel(writer, sheet_name='VL results', index=False)
    

In [26]:
dfz.head(10)

Unnamed: 0,gbid_x,H3_name_x,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH_x,VL_x,gbid_y,H3_name_y,VL_Score,VL_Quality_score,VL_Low_quality,VH_y,VL_y,mean_error_prob
0,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,5.7e-05
1,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,842,61.114014,43,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,TMH577-hIgG1-014-G9_G09_VL79.abi,5.7e-05
2,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,812,60.70936,37,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.308108,47,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,0.00011
3,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,812,60.70936,37,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.218919,49,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi,0.000106
4,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.308108,47,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,0.00011
5,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.218919,49,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,0.000106
6,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,60.898123,38,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,0.00033
7,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,61.021448,42,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,0.000282
8,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,61.340483,51,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,0.000255
9,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,61.092784,37,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,60.898123,38,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,0.000179


In [34]:
sample_ids[0:20]

['TMH577-hIgG1-013-A10',
 'TMH577-hIgG1-013-A11',
 'TMH577-hIgG1-013-A12',
 'TMH577-hIgG1-013-A1_A01',
 'TMH577-hIgG1-013-A2_A02',
 'TMH577-hIgG1-013-A3_A03',
 'TMH577-hIgG1-013-A4_A04',
 'TMH577-hIgG1-013-A5_A05',
 'TMH577-hIgG1-013-A6_A06',
 'TMH577-hIgG1-013-A7_A07',
 'TMH577-hIgG1-013-A8_A08',
 'TMH577-hIgG1-013-A9_A09',
 'TMH577-hIgG1-013-B10',
 'TMH577-hIgG1-013-B11',
 'TMH577-hIgG1-013-B12',
 'TMH577-hIgG1-013-B1_B01',
 'TMH577-hIgG1-013-B2_B02',
 'TMH577-hIgG1-013-B3_B03',
 'TMH577-hIgG1-013-B4_B04',
 'TMH577-hIgG1-013-B5_B05']

In [35]:
vh_abi_dict.get('TMH577-hIgG1-013-B10')

'/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-B10_GATC-VH60-2617917.abi'

In [36]:
vl_abi_dict.get('TMH577-hIgG1-013-B10')

'/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-B10_VL79.abi'

In [41]:
M_gb_abi_vh

Unnamed: 0,Match,gb_id,h3_name,sample_id,gb_fp,vh_abi_fp,vl_abi_fp
0,GB_Matched,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-004-G09.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A4_A04_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A4_A04_VL79.abi
1,GB_Matched,VH-TMH577-hF-005-G08,TMH577-hF-005-G08,TMH577-hIgG1-014-A8_A08,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-005-G08.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-A8_A08_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-A8_A08_VL79.abi
2,GB_Matched,VH-TMH577-hF-005-G08,TMH577-hF-005-G08,TMH577-hIgG1-014-C4_C04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-005-G08.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-C4_C04_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-C4_C04_VL79.abi
3,GB_Matched,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_VL79.abi
4,GB_Matched,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-G9_G09_VL79.abi
...,...,...,...,...,...,...,...
178,GB_Matched,VH-TMH577-hF-017-H02,TMH577-hF-017-H02,TMH577-hIgG1-013-B12,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-017-H02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-B12_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-B12_VL79.abi
179,GB_Matched,VH-TMH577-hF-017-H02,TMH577-hF-017-H02,TMH577-hIgG1-014-H4_H04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-017-H02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-H4_H04_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-H4_H04_VL79.abi
180,GB_Matched,VH-TMH577-hF-017-H03,TMH577-hF-017-H03,TMH577-hIgG1-013-A2_A02,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-017-H03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A2_A02_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A2_A02_VL79.abi
181,GB_Matched,VH-TMH577-hF-017-H03,TMH577-hF-017-H03,TMH577-hIgG1-013-C10,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-017-H03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-C10_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-C10_VL79.abi


In [42]:
M_gb_abi_vh.head()

Unnamed: 0,Match,gb_id,h3_name,sample_id,gb_fp,vh_abi_fp,vl_abi_fp
0,GB_Matched,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-004-G09.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A4_A04_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A4_A04_VL79.abi
1,GB_Matched,VH-TMH577-hF-005-G08,TMH577-hF-005-G08,TMH577-hIgG1-014-A8_A08,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-005-G08.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-A8_A08_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-A8_A08_VL79.abi
2,GB_Matched,VH-TMH577-hF-005-G08,TMH577-hF-005-G08,TMH577-hIgG1-014-C4_C04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-005-G08.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-C4_C04_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-C4_C04_VL79.abi
3,GB_Matched,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_VL79.abi
4,GB_Matched,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-014-G9_G09,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-G9_G09_GATC-VH60-2617917.abi,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-G9_G09_VL79.abi


### Modified function - rerun

In [45]:
def run_gb_alignment_and_filtering_v2(M_gb_abi_vx, 
                                   df_vx, 
                                   vx_template_gb_dict, 
                                   vx_seq_start_a, 
                                   vx_seq_end_a, 
                                   par_match, 
                                   par_missmatch, 
                                   par_open, 
                                   par_extend, 
                                   par_filter_thresh, 
                                   log,
                                   log_msg=True,
                                   is_data_vl=False):
    filtered_res = []
    for e,i in enumerate(range(M_gb_abi_vx.shape[0])[0:]):
        gbid = M_gb_abi_vx.iloc[i].gb_id
        gbfp = M_gb_abi_vx.iloc[i].gb_fp
        sample_id = M_gb_abi_vx.iloc[i].sample_id
        _dfvx_ss = df_vx[df_vx.sample_id == M_gb_abi_vx.iloc[i].sample_id]
        for e2, abx in enumerate(range(_dfvx_ss.shape[0])):
            # genbank
            seqA = SeqIO.read(vx_template_gb_dict.get(gbid), 'gb')
            # vl
            flag = ''
            if is_data_vl:
                flag = 'VL'
                seqB = get_seqobj_from_abi(_dfvx_ss.iloc[abx].vl_abi_fp)
                seqB_vxinit = _dfvx_ss.iloc[abx]["vl_inti_sr"].seq
                seqB_vx, quality = get_trimmed_seq_record(seqB, get_quality=True)
                seqB_vx = get_seq_from_record(seqB_vx, reverse=True)
            else:
                flag = 'VH'                
                seqB_vx = _dfvx_ss.iloc[abx].vh_sr_seq_r
                seqB_vxinit = _dfvx_ss.iloc[abx].vh_init_sr.seq
                quality = _dfvx_ss.iloc[abx].vh_sr_tqlst
            # match between gb and vl-abi
            try:          
                for a in align(seqA.seq, seqB_vx, match=par_match, mismatch=par_missmatch, gap_open=par_open, gap_extend=par_extend):
                    score = int(a[2])
                    if score >= int(par_filter_thresh):
                        # getting the aligned seqA and seqB
                        aligned_seq_a = a[0]
                        aligned_seq_a = aligned_seq_a.replace('-', '')
                        aligned_seq_a_start = aligned_seq_a[0:6]
                        aligned_seq_a_length = len(aligned_seq_a) - 6
                        aligned_seq_a_end = aligned_seq_a[aligned_seq_a_length:len(aligned_seq_a)]
                        aligned_seq_b = a[1]
                        # futher filtering of seqB
                        if '-' not in aligned_seq_b:
                            if vx_seq_start_a in aligned_seq_a_start:
                                if vx_seq_end_a in aligned_seq_a_end:
                                    aligned_seq = a[0]
                                    aligned_seq = aligned_seq.replace('-', '')
                                    aligned_seq = Seq(aligned_seq)
                                    aligned_seq_rev = aligned_seq.reverse_complement()
                                    dna_string = str(seqB_vxinit)
                                    aligned_seq_rev = str(aligned_seq_rev)
                                    start = re.search(aligned_seq_rev, dna_string).start()
                                    end = re.search(aligned_seq_rev, dna_string).end()
                                    quality = quality[start:end]
                                    quality_score = statistics.mean(quality)
                                    lowest_quality = min(quality)
                                    # print(f"{e} | {abx} | {gbid} | {_dfvx_ss.iloc[abx].h3_name} | {gbfp} | {_dfvx_ss.iloc[abx].vh_abi_fp} | {score}| {quality_score} | {lowest_quality}")
                                    log.info(f"[+] FILTERED_GOODMATCH: {e}|{abx}|{flag}|{gbid}|{_dfvx_ss.iloc[abx].h3_name}|{sample_id}|{gbfp}|{_dfvx_ss.iloc[abx].vh_abi_fp}|{score}|{quality_score}|{lowest_quality}")if log_msg else None
                                    filtered_res.append([flag, gbid, _dfvx_ss.iloc[abx].h3_name, sample_id, gbfp, _dfvx_ss.iloc[abx].vh_abi_fp, score, quality_score, lowest_quality])
                    else:
                        log.info(f"[+] FILTERED_BADMATCH: {e}|{abx}|{flag}|{gbid}|{_dfvx_ss.iloc[abx].h3_name}|{sample_id}|{gbfp}|{_dfvx_ss.iloc[abx].vh_abi_fp}|{score}|---|---")if log_msg else None
            except:
                pass
    if not len(filtered_res) == 0:
        dfres = pd.DataFrame(filtered_res)
        dfres.columns = ["Orient", "gbid", "H3_name", "sample_id", "GB_FP", "ABI_FP", "Score", "Quality_score", "Low_quality"]
        return dfres
    else:
        None

In [47]:
vh_gb_abi_match_filtered = run_gb_alignment_and_filtering_v2(M_gb_abi_vh, df_vh, vh_template_gb_dict, vh_seq_start_a, vh_seq_end_a, par_match, par_missmatch, par_open, par_extend, par_filter_thresh, log, log_msg=True, is_data_vl=False)

In [48]:
vl_gb_abi_match_filtered = run_gb_alignment_and_filtering_v2(M_gb_abi_vl, df_vh, vl_template_gb_dict, vl_seq_start_a, vl_seq_end_a, par_match, par_missmatch, par_open, par_extend, par_filter_thresh, log, log_msg=True, is_data_vl=True)

In [49]:
log.info(f"\nMERGING DATAFRAMES")

In [50]:
vh_gb_abi_match_filtered.columns = [ 'VH_'+i  if i not in ['Orient','gbid','H3_name', 'sample_id', 'GB_FP'] else i  for i in vh_gb_abi_match_filtered.columns ]
vl_gb_abi_match_filtered.columns = [ 'VL_'+i  if i not in ['Orient','gbid','H3_name', 'sample_id', 'GB_FP'] else i  for i in vl_gb_abi_match_filtered.columns ]

In [51]:
vh_gb_abi_match_filtered.head()

Unnamed: 0,Orient,gbid,H3_name,sample_id,GB_FP,VH_ABI_FP,VH_Score,VH_Quality_score,VH_Low_quality
0,VH,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-004-G09.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A4_A04_GATC-VH60-2617917.abi,782,60.792839,43
1,VH,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,842,61.114014,43
2,VH,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-B03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,812,60.70936,37
3,VH,VH-TMH577-hF-012-C03,TMH577-hF-012-C03,TMH577-hIgG1-014-E1_E01,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-C03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-014-E1_E01_GATC-VH60-2617917.abi,824,48.830303,19
4,VH,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VH/VH-TMH577-hF-012-C04.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,776,60.390313,33


In [52]:
vl_gb_abi_match_filtered.head()

Unnamed: 0,Orient,gbid,H3_name,sample_id,GB_FP,VL_ABI_FP,VL_Score,VL_Quality_score,VL_Low_quality
0,VL,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VL/VL-TMH577-hF-012-A02.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,740,61.027027,42
1,VL,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VL/VL-TMH577-hF-012-B03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,740,61.308108,47
2,VL,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VL/VL-TMH577-hF-012-B03.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,740,61.218919,49
3,VL,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VL/VL-TMH577-hF-012-C04.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,746,60.898123,38
4,VL,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-G8_G08,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/Template/VL/VL-TMH577-hF-012-C04.gb,/Users/rp/Desktop/purge/ny-immuno-sanger/data02/AB1_seqs/TMH577-hIgG1-013-G8_G08_GATC-VH60-2617917.abi,746,61.021448,42


In [53]:
vh_gb_abi_match_filtered.shape

(103, 9)

In [54]:
vl_gb_abi_match_filtered.shape

(114, 9)

In [55]:
df_sid_abi = pd.concat(
    [pd.DataFrame([ [i, vh_abi_dict.get(i).split("/")[-1], vl_abi_dict.get(i).split("/")[-1]] for i in vh_gb_abi_match_filtered.sample_id.to_list()], 
             columns=["sample_id", "VH", "VL"]),
    pd.DataFrame([ [i, vh_abi_dict.get(i).split("/")[-1], vl_abi_dict.get(i).split("/")[-1]] for i in vl_gb_abi_match_filtered.sample_id.to_list()], 
             columns=["sample_id", "VH", "VL"])], axis=0).drop_duplicates().reset_index(drop=True)

In [56]:
dfx1 = pd.merge(vh_gb_abi_match_filtered,df_sid_abi,on='sample_id')
dfx2 = pd.merge(vl_gb_abi_match_filtered,df_sid_abi,on='sample_id')
dfx1_ss = dfx1[['gbid','H3_name','sample_id', 'VH_Score', 'VH_Quality_score', 'VH_Low_quality', 'VH', 'VL']]
dfx2_ss = dfx2[['gbid','H3_name','sample_id', 'VL_Score', 'VL_Quality_score', 'VL_Low_quality', 'VH', 'VL']]
dfy = pd.merge(dfx1_ss, dfx2_ss, on="sample_id", how='inner')

In [57]:
dfx1_ss.head()

Unnamed: 0,gbid,H3_name,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH,VL
0,VH-TMH577-hF-004-G09,TMH577-hF-004-G09,TMH577-hIgG1-013-A4_A04,782,60.792839,43,TMH577-hIgG1-013-A4_A04_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A4_A04_VL79.abi
1,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
2,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi
3,VH-TMH577-hF-012-C03,TMH577-hF-012-C03,TMH577-hIgG1-014-E1_E01,824,48.830303,19,TMH577-hIgG1-014-E1_E01_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E1_E01_VL79.abi
4,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi


In [58]:
dfx2_ss.head()

Unnamed: 0,gbid,H3_name,sample_id,VL_Score,VL_Quality_score,VL_Low_quality,VH,VL
0,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
1,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-A7_A07,740,61.308108,47,TMH577-hIgG1-013-A7_A07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-A7_A07_VL79.abi
2,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,740,61.218919,49,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi
3,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,746,60.898123,38,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi
4,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-G8_G08,746,61.021448,42,TMH577-hIgG1-013-G8_G08_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G8_G08_VL79.abi


In [59]:
dfy.head()

Unnamed: 0,gbid_x,H3_name_x,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH_x,VL_x,gbid_y,H3_name_y,VL_Score,VL_Quality_score,VL_Low_quality,VH_y,VL_y
0,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
1,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.70936,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.218919,49,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi
2,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,60.898123,38,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi
3,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-014-E8_E08,776,61.092784,37,TMH577-hIgG1-014-E8_E08_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E8_E08_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,61.340483,51,TMH577-hIgG1-014-E8_E08_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E8_E08_VL79.abi
4,VH-TMH577-hF-012-C05,TMH577-hF-012-C05,TMH577-hIgG1-013-E9_E09,794,60.697201,28,TMH577-hIgG1-013-E9_E09_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E9_E09_VL79.abi,VL-TMH577-hF-012-C05,TMH577-hF-012-C05,746,60.747989,38,TMH577-hIgG1-013-E9_E09_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E9_E09_VL79.abi


In [60]:
dfz = dfy.copy()
dfz["mean_error_prob"] = [cal_mean_error_prob(i[0], i[1]) for i in zip(dfy.VH_Low_quality.tolist(), dfy.VL_Low_quality.to_list())]

In [61]:
dfz

Unnamed: 0,gbid_x,H3_name_x,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VH_x,VL_x,gbid_y,H3_name_y,VL_Score,VL_Quality_score,VL_Low_quality,VH_y,VL_y,mean_error_prob
0,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,VL-TMH577-hF-012-A02,TMH577-hF-012-A02,740,61.027027,42,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi,0.000057
1,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.709360,37,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,VL-TMH577-hF-012-B03,TMH577-hF-012-B03,740,61.218919,49,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi,0.000106
2,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,60.898123,38,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi,0.000330
3,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-014-E8_E08,776,61.092784,37,TMH577-hIgG1-014-E8_E08_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E8_E08_VL79.abi,VL-TMH577-hF-012-C04,TMH577-hF-012-C04,746,61.340483,51,TMH577-hIgG1-014-E8_E08_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E8_E08_VL79.abi,0.000104
4,VH-TMH577-hF-012-C05,TMH577-hF-012-C05,TMH577-hIgG1-013-E9_E09,794,60.697201,28,TMH577-hIgG1-013-E9_E09_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E9_E09_VL79.abi,VL-TMH577-hF-012-C05,TMH577-hF-012-C05,746,60.747989,38,TMH577-hIgG1-013-E9_E09_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E9_E09_VL79.abi,0.000872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,VH-TMH577-hF-017-G04,TMH577-hF-017-G04,TMH577-hIgG1-013-E5_E05,830,60.920482,43,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi,VL-TMH577-hF-017-G04,TMH577-hF-015-D08,740,60.316384,42,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi,0.000057
117,VH-TMH577-hF-017-G04,TMH577-hF-017-G04,TMH577-hIgG1-013-E5_E05,830,60.920482,43,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi,VL-TMH577-hF-017-G04,TMH577-hF-017-G04,740,60.316384,42,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi,0.000057
118,VH-TMH577-hF-017-G10,TMH577-hF-017-G10,TMH577-hIgG1-013-B7_B07,770,60.821530,32,TMH577-hIgG1-013-B7_B07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-B7_B07_VL79.abi,VL-TMH577-hF-017-G10,TMH577-hF-017-G10,746,61.016086,43,TMH577-hIgG1-013-B7_B07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-B7_B07_VL79.abi,0.000341
119,VH-TMH577-hF-017-H03,TMH577-hF-017-H03,TMH577-hIgG1-013-C10,770,60.742857,33,TMH577-hIgG1-013-C10_GATC-VH60-2617917.abi,TMH577-hIgG1-013-C10_VL79.abi,VL-TMH577-hF-017-H03,TMH577-hF-017-H03,746,61.163539,42,TMH577-hIgG1-013-C10_GATC-VH60-2617917.abi,TMH577-hIgG1-013-C10_VL79.abi,0.000282


In [69]:
dfz = dfz[["gbid_x", "H3_name_x", "sample_id", "VH_Score", "VH_Quality_score", "VH_Low_quality", "VL_Score","VL_Quality_score", "VL_Low_quality", "mean_error_prob", "VH_x", "VL_x"]]

In [73]:
dfz.drop_duplicates()

Unnamed: 0,gbid_x,H3_name_x,sample_id,VH_Score,VH_Quality_score,VH_Low_quality,VL_Score,VL_Quality_score,VL_Low_quality,mean_error_prob,VH_x,VL_x
0,VH-TMH577-hF-012-A02,TMH577-hF-012-A02,TMH577-hIgG1-013-F12,842,61.114014,43,740,61.027027,42,0.000057,TMH577-hIgG1-013-F12_GATC-VH60-2617917.abi,TMH577-hIgG1-013-F12_VL79.abi
1,VH-TMH577-hF-012-B03,TMH577-hF-012-B03,TMH577-hIgG1-013-G7_G07,812,60.709360,37,740,61.218919,49,0.000106,TMH577-hIgG1-013-G7_G07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-G7_G07_VL79.abi
2,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-013-D5_D05,776,60.390313,33,746,60.898123,38,0.000330,TMH577-hIgG1-013-D5_D05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-D5_D05_VL79.abi
3,VH-TMH577-hF-012-C04,TMH577-hF-012-C04,TMH577-hIgG1-014-E8_E08,776,61.092784,37,746,61.340483,51,0.000104,TMH577-hIgG1-014-E8_E08_GATC-VH60-2617917.abi,TMH577-hIgG1-014-E8_E08_VL79.abi
4,VH-TMH577-hF-012-C05,TMH577-hF-012-C05,TMH577-hIgG1-013-E9_E09,794,60.697201,28,746,60.747989,38,0.000872,TMH577-hIgG1-013-E9_E09_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E9_E09_VL79.abi
...,...,...,...,...,...,...,...,...,...,...,...,...
110,VH-TMH577-hF-017-G04,TMH577-hF-015-D08,TMH577-hIgG1-013-E5_E05,830,60.920482,43,740,60.316384,42,0.000057,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi
114,VH-TMH577-hF-017-G04,TMH577-hF-017-G04,TMH577-hIgG1-013-E5_E05,830,60.920482,43,740,60.316384,42,0.000057,TMH577-hIgG1-013-E5_E05_GATC-VH60-2617917.abi,TMH577-hIgG1-013-E5_E05_VL79.abi
118,VH-TMH577-hF-017-G10,TMH577-hF-017-G10,TMH577-hIgG1-013-B7_B07,770,60.821530,32,746,61.016086,43,0.000341,TMH577-hIgG1-013-B7_B07_GATC-VH60-2617917.abi,TMH577-hIgG1-013-B7_B07_VL79.abi
119,VH-TMH577-hF-017-H03,TMH577-hF-017-H03,TMH577-hIgG1-013-C10,770,60.742857,33,746,61.163539,42,0.000282,TMH577-hIgG1-013-C10_GATC-VH60-2617917.abi,TMH577-hIgG1-013-C10_VL79.abi


In [72]:
with pd.ExcelWriter('final_results.xlsx', engine='xlsxwriter') as writer:
    dfz.to_excel(writer, sheet_name='final_res_mean_error_prob', index=False)
    # M_gb_abi_vh.to_excel(writer, sheet_name='ID matched gb and vh abi', index=False)
    
#     df_vh.to_excel(writer, sheet_name='H3_probe_matched_VH_and_VL', index=False)
#     res_df_copy.to_excel(writer, sheet_name='VH and VL Copied Files', index=False)
    
#     vh_nts.to_excel(writer, sheet_name='vh_gb_start_end_nts', index=False)
#     vl_nts.to_excel(writer, sheet_name='vl_gb_start_end_nts', index=False)
#     vh_gb_abi_match_filtered.to_excel(writer, sheet_name='VH results', index=False)
#     vl_gb_abi_match_filtered.to_excel(writer, sheet_name='VL results', index=False)