In [22]:
import pandas as pd
import numpy as np
import os
import sys
import gzip
import shutil
import json
import re
import tmalign_mapping
import Bio.PDB as bpdb
from Bio import Align

# Zip Status

In [2]:
# Zip status: one boolean "status" row per entry of the mmCIF directory,
# cached as a CSV so the directory is only listed when no cache exists yet.
zipped_status_path = "/nfs/turbo/lsa-tewaria/zipped_status.csv"
if os.path.exists(zipped_status_path):
    # Reuse the cached table; the first CSV column holds the entry names.
    zipped_status = pd.read_csv(zipped_status_path, index_col = 0)
else:
    # First run: every entry starts out with status True, then cache the table.
    list_mmcifs = os.listdir("/nfs/turbo/lsa-tewaria/mmCIF/")
    zipped_status = pd.DataFrame({"status" : [True] * len(list_mmcifs)}, index = list_mmcifs)
    zipped_status.to_csv(zipped_status_path)

# Uniprot to pdb mappings

In [3]:
# Load the PDB <-> Uniprot mapping table from CSV into a DataFrame.
# Later cells read its "uniprot_id" column and pass the frame to tmalign_mapping helpers.
uniprot_pdb_path = "/nfs/turbo/lsa-tewaria/uniprot_df_small.csv"
uniprot_df = pd.read_csv(uniprot_pdb_path)

# Scop data

In [4]:
# Load SCOP data via the project helper (called with no arguments, so the
# input location is decided inside tmalign_mapping). The result is passed to
# get_domain_info_for_pdbs in later cells.
scop_df = tmalign_mapping.read_scop_file()

# Single Uniprot ID - Naive version: all-to-all TM-align

In [5]:
# Naive pipeline for one Uniprot entry.
# The work is wrapped in try/finally so that the (possibly updated) zipped
# status is written back even when the cell is interrupted or a step fails;
# otherwise the CSV on disk goes stale relative to the files on disk.
try:
    # Move PDBs to Uniprot folders (returns the entry's folder and the updated status table)
    uniprot_path, zipped_status = tmalign_mapping.uniprot_to_pdb("CHEY_ECOLI", uniprot_df, zipped_status)

    # Compare proteins in Uniprot folder
    tmalign_mapping.compare_proteins_dir(uniprot_path)

    # Clean TMalign output
    tmalign_mapping.clean_TMalign_output(uniprot_path)
finally:
    # Save zipped status
    zipped_status.to_csv(zipped_status_path)

KeyboardInterrupt: 

# All Uniprot IDs - Naive version

**NOTE: DO NOT RUN THE CODE CELL BELOW. IT TAKES A LONG TIME.**

In [None]:
# Naive pipeline for every Uniprot entry in the mapping table.
# Get Unique Uniprot IDs
uniprot_ids = uniprot_df["uniprot_id"].unique()

# zipped_status is reassigned on every iteration; persist it in `finally` so
# the CSV matches what was done even if this long-running loop is
# interrupted part-way through.
try:
    for uniprot_id in uniprot_ids:
        # Find Uniprot data
        # NOTE(review): the result is not used in this naive loop; kept in case
        # the lookup is relied on to fail early for unknown IDs — confirm.
        uniprot_json = tmalign_mapping.find_uniprot(uniprot_df, uniprot_id)

        # Move PDBs to Uniprot folders
        uniprot_path, zipped_status = tmalign_mapping.uniprot_to_pdb(uniprot_id, uniprot_df, zipped_status)

        # Compare proteins in Uniprot folder
        tmalign_mapping.compare_proteins_dir(uniprot_path)

        # Clean TMalign output
        tmalign_mapping.clean_TMalign_output(uniprot_path)
finally:
    # Save zipped status
    zipped_status.to_csv(zipped_status_path)
    

# Single Uniprot ID - Smart version

In [7]:
# Domain-aware ("smart") pipeline for one Uniprot entry.
# The ID is named once so the lookup, the file staging and the comparison
# cannot drift apart.
uniprot_id = "CO5_HUMAN"

# Find Uniprot data and derive the per-PDB domain info from the SCOP table
uniprot_json = tmalign_mapping.find_uniprot(uniprot_df, uniprot_id)
domain_info = tmalign_mapping.get_domain_info_for_pdbs(uniprot_json, scop_df)

try:
    # Move PDBs to Uniprot folders
    uniprot_path, zipped_status = tmalign_mapping.uniprot_to_pdb(uniprot_id, uniprot_df, zipped_status)

    # Compare proteins in the folder returned above (as the all-IDs loop does)
    # rather than in a separately hard-coded directory.
    tmalign_mapping.compare_proteins_domain(uniprot_path, domain_info)
finally:
    # Save zipped status, even if the comparison is interrupted
    zipped_status.to_csv(zipped_status_path)

  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)
  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)
  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)


# All Uniprot IDs - Smart version

**NOTE: DO NOT RUN THE CODE CELL BELOW. IT TAKES A LONG TIME.**

In [None]:
# Domain-aware ("smart") pipeline for every Uniprot entry in the mapping table.
# Get Unique Uniprot IDs
uniprot_ids = uniprot_df["uniprot_id"].unique()

# zipped_status is reassigned on every iteration; persist it in `finally` so
# the CSV matches what was done even if this long-running loop is
# interrupted part-way through.
try:
    for uniprot_id in uniprot_ids:
        # Find Uniprot data
        uniprot_json = tmalign_mapping.find_uniprot(uniprot_df, uniprot_id)

        # Get the uniprot domain info
        domain_info = tmalign_mapping.get_domain_info_for_pdbs(uniprot_json, scop_df)

        # Move PDBs to Uniprot folders
        uniprot_path, zipped_status = tmalign_mapping.uniprot_to_pdb(uniprot_id, uniprot_df, zipped_status)

        # Compare proteins in Uniprot folder
        tmalign_mapping.compare_proteins_domain(uniprot_path, domain_info)
finally:
    # Save zipped status
    zipped_status.to_csv(zipped_status_path)

# Checking the zip function 

In [5]:
def zip_cif_folder(directory):
    """Leave only compressed copies of the ``.cif`` files in ``directory``.

    For every regular file whose name ends in ``.cif``:

    * if ``<name>.cif.gz`` already exists, the uncompressed file is removed;
    * otherwise the file is first gzip-compressed to ``<name>.cif.gz`` and
      only then removed, so no structure is lost.

    Entries that are not regular files (e.g. a directory named ``x.cif``) and
    files with other extensions are left untouched.

    Parameters
    ----------
    directory : str
        Path of the folder to process (not recursive).

    Raises
    ------
    OSError
        Propagated from the underlying file operations (missing directory,
        permissions, ...).
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".cif"):
            continue
        cif_path = os.path.join(directory, filename)
        # os.remove() fails on directories; only regular files are handled.
        if not os.path.isfile(cif_path):
            continue

        gz_path = cif_path + ".gz"
        if not os.path.exists(gz_path):
            # Write to a temporary name and rename, so an interrupted run never
            # leaves a truncated .gz that would later be mistaken for a valid copy.
            partial_path = gz_path + ".tmp"
            with open(cif_path, "rb") as src, gzip.open(partial_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
            os.replace(partial_path, gz_path)

        # A compressed copy is guaranteed to exist at this point.
        os.remove(cif_path)

In [9]:
def zip_mmcif_dir(directory):
    """Call ``zip_cif_folder`` on each immediate subdirectory of ``directory``.

    Entries of ``directory`` that are not directories are ignored; the walk
    is one level deep only.
    """
    # Lazily build the full path of each entry, then keep only the directories.
    entry_paths = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for subfolder in filter(os.path.isdir, entry_paths):
        zip_cif_folder(subfolder)

In [7]:
# Try zip_cif_folder on a single subfolder of the mmCIF tree first.
dir_path = "/nfs/turbo/lsa-tewaria/mmCIF/hq"
zip_cif_folder(dir_path)

In [10]:
# Then apply it to every subfolder of the mmCIF root.
zip_mmcif_dir("/nfs/turbo/lsa-tewaria/mmCIF")

# Single uniprot ID 

In [15]:
# Same single-entry workflow as the "smart" version, using the *_efficient helpers.
uniprot_json = tmalign_mapping.find_uniprot(uniprot_df, "CO5_HUMAN")
domain_info = tmalign_mapping.get_domain_info_for_pdbs(uniprot_json, scop_df)
# Unlike uniprot_to_pdb, this call takes no zipped_status and its result is bound to a single name.
# NOTE(review): the effect of write = False is defined inside tmalign_mapping — confirm.
uniprot_path= tmalign_mapping.uniprot_to_pdb_efficient("CO5_HUMAN", uniprot_df, write = False)
# NOTE(review): the directory is hard-coded here instead of reusing uniprot_path — confirm they agree.
tmalign_mapping.compare_proteins_domain_efficient("/nfs/turbo/lsa-tewaria/uniprot/CO5_HUMAN", domain_info)

  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)
  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)
  tmalign_output = pd.read_csv(os.path.join(directory, raw_file), sep = "\t", skipfooter = 1)


In [37]:
def extract_sequence(json_file):
    """Return the value stored under ``json_file["sequence"]["value"]``.

    ``json_file`` is a nested mapping (here: a Uniprot record as returned by
    ``tmalign_mapping.find_uniprot``). A ``KeyError`` is raised if either key
    is missing.
    """
    sequence_entry = json_file["sequence"]
    return sequence_entry["value"]

# Look up the Uniprot records of two entries for sequence experiments.
# NOTE(review): `lol` / `lol_2` are placeholder names and live in the notebook's
# global namespace, so other cells may read them — rename with care.
lol = tmalign_mapping.find_uniprot(uniprot_df, "CO5_HUMAN")
# extract_sequence(lol)
lol_2 = tmalign_mapping.find_uniprot(uniprot_df, "CO6_HUMAN")
# Last expression of the cell: displays the CO6_HUMAN sequence as the cell output.
extract_sequence(lol_2)

'MARRSVLYFILLNALINKGQACFCDHYAWTQWTSCSKTCNSGTQSRHRQIVVDKYYQENFCEQICSKQETRECNWQRCPINCLLGDFGPWSDCDPCIEKQSKVRSVLRPSQFGGQPCTAPLVAFQPCIPSKLCKIEEADCKNKFRCDSGRCIARKLECNGENDCGDNSDERDCGRTKAVCTRKYNPIPSVQLMGNGFHFLAGEPRGEVLDNSFTGGICKTVKSSRTSNPYRVPANLENVGFEVQTAEDDLKTDFYKDLTSLGHNENQQGSFSSQGGSSFSVPIFYSSKRSENINHNSAFKQAIQASHKKDSSFIRIHKVMKVLNFTTKAKDLHLSDVFLKALNHLPLEYNSALYSRIFDDFGTHYFTSGSLGGVYDLLYQFSSEELKNSGLTEEEAKHCVRIETKKRVLFAKKTKVEHRCTTNKLSEKHEGSFIQGAEKSISLIRGGRSEYGAALAWEKGSSGLEEKTFSEWLESVKENPAVIDFELAPIVDLVRNIPCAVTKRNNLRKALQEYAAKFDPCQCAPCPNNGRPTLSGTECLCVCQSGTYGENCEKQSPDYKSNAVDGQWGCWSSWSTCDATYKRSRTRECNNPAPQRGGKRCEGEKRQEEDCTFSIMENNGQPCINDDEEMKEVDLPEIEADSGCPQPVPPENGFIRNEKQLYLVGEDVEISCLTGFETVGYQYFRCLPDGTWRQGDVECQRTECIKPVVQEVLTITPFQRLYRIGESIELTCPKGFVVAGPSRYTCQGNSWTPPISNSLTCEKDTLTKLKGHCQLGQKQSGSECICMSPEEDCSHHSEDLCVFDTDSNDYFTSPACKFLAEKCLNNQQLHFLHIGSCQDGRQLEWGLERTRLSSNSTKKESCGYDTCYDWEKCSASTSKCVCLLPPQCFKGGNQLYCVKMGSSTSEKTLNICEVGTIRCANRKMEILHPGKCLA'

In [23]:
# Module-level pairwise aligner used by split_pdb_structure.
aligner = Align.PairwiseAligner()
# Local alignment: only the best-matching sub-regions of the two sequences are aligned.
aligner.mode = 'local'
# Gap scores: -11 to open a gap, -1 for each extension.
aligner.open_gap_score = -11
aligner.extend_gap_score = -1
# Score residue pairs with the BLOSUM62 substitution matrix shipped with Biopython.
aligner.substitution_matrix = Align.substitution_matrices.load("BLOSUM62")

# Splitting PDB files with multiple chains and multiple polypeptides

In [38]:
def split_pdb_structure(uniprot_id, pdb_filenames=("4e0s.cif",)):
    """Align each polypeptide of an entry's mmCIF structures to its Uniprot sequence.

    For every selected ``.cif`` file in the entry's folder, every model and
    every chain, the polypeptide fragments of that chain are built and
    locally aligned (with the module-level ``aligner``) against the Uniprot
    sequence of ``uniprot_id``. For each fragment the function prints the
    fragment sequence, the aligned coordinate blocks of the first alignment
    returned, and the path the chain would be written to
    (``<pdb>_<model><chain>.pdb``). Nothing is written to disk.

    Parameters
    ----------
    uniprot_id : str
        Uniprot entry name; also the name of the folder under
        ``/nfs/turbo/lsa-tewaria/uniprot/``.
    pdb_filenames : collection of str or None, optional
        File names (including ``.cif``) to process. The default keeps the
        single structure this function was developed against; pass ``None``
        to process every ``.cif`` file in the folder.

    Returns
    -------
    None
    """
    uniprot_path = os.path.join("/nfs/turbo/lsa-tewaria/uniprot/", uniprot_id)

    # Take the reference sequence from the requested entry instead of relying
    # on a global record left behind by another cell.
    seq = extract_sequence(tmalign_mapping.find_uniprot(uniprot_df, uniprot_id))

    # The parser and builder are stateless between calls; create them once.
    parser = bpdb.MMCIFParser()
    builder = bpdb.PPBuilder()

    for pdb_id in os.listdir(uniprot_path):
        if not pdb_id.endswith(".cif"):
            continue
        # Filter before parsing so skipped files cost nothing.
        if pdb_filenames is not None and pdb_id not in pdb_filenames:
            continue

        pdb_path = os.path.join(uniprot_path, pdb_id)
        pdb = parser.get_structure(pdb_id, pdb_path)

        for model in pdb:
            model_id = str(model.get_id())
            for chain in model:
                chain_id = str(chain.get_id())
                chain_path = os.path.join(uniprot_path, pdb_id[:-4] + "_"+ model_id + chain_id + ".pdb")

                # Build polypeptides per chain: one chain can yield several
                # fragments (chain breaks) or none, so pairing the model's
                # chains positionally with a model-wide fragment list would
                # attribute fragments to the wrong chain.
                for pp in builder.build_peptides(chain):
                    pp_seq = pp.get_sequence()
                    print(pp_seq)

                    alignments = aligner.align(pp_seq, seq)
                    try:
                        best = alignments[0]
                    except IndexError:
                        # No local alignment was produced for this fragment.
                        print("no alignment found")
                    else:
                        print(best.aligned)

                    print(chain_path)

                # TODO: write the chain out once the mapping is validated:
                # io = bpdb.PDBIO()
                # io.set_structure(chain)
                # io.save(chain_path)



In [39]:
# Run the chain/sequence alignment check for CO5_HUMAN (prints its findings, writes nothing).
split_pdb_structure("CO5_HUMAN")



EQTYVISAPKIFRVGASENIVIQVYGYTEAFDATISIKSYPDKKFSYSSGHVHLSSENKFQNSAILTIQPKQLPGGQNPVSYVYLEVVSKHFSKSKRMPITYDNGFLFIHTDKPVYTPDQSVKVRVYSLNDDLKPAKRETVLTFIDPEGSEVDMVEEIDHIGIISFPDFKIPSNPRYGMWTIKAKYKEDFSTTGTAYFEVKEYVLPHFSVSIEPEYNFIGYKNFKNFEITIKARYFYNKVVTEADVYITFGIREDLKDDQKEMMQTAMQNTMLINGIAQVTFDSETAVKELSYYSLEDLNNKYLYIAVTVIESTGGFSEEAEIPGIKYVLSPYKLNLVATPLFLKPGIPYPIKVQVKDSLDQLVGGVPVTLNAQTIDVNQETSDLDPSKSVTRVDDGVASFVLNLPSGVTVLEFNVKTDAPDLPEENQAREGYRAIAYSSLSQSYLYIDWTDNHKALLVGEHLNIIVTPKSPYIDKITHYNYLILSKGKIIHFGTREKFSDASYQSINIPVTQNMVPSSRLLVYYIVTGEQTAELVSDSVWLNIEEKCGNQLQVHLSPDADAYSPGQTVSLNMATGMDSWVALAAVDSAVYGVQRGAKKPLERVFQFLEKSDLGCGAGGGLNNANVFHLAGLTFLTNANADDSQENDEPCKEILRPR
(((0, 657),), ((19, 676),))
/nfs/turbo/lsa-tewaria/uniprot/CO5_HUMAN/4e0s_0A.pdb
SKPEIRSYFPESWLWEVHLVPRRKQLQFALPDSLTTWEIQGIGISNTGICVADTVKAKVFKDVFLEMNIPYSVVRGEQIQLKGTVYNYRTSGMQFCVKMSAVEGICTSES
(((0, 110),), ((760, 870),))
/nfs/turbo/lsa-tewaria/uniprot/CO5_HUMAN/4e0s_0B.pdb
KCVRQKVEGSSSHLVTFTVLPLEIGLHNINFSLETWFGKEILVKTLRVVPEGVKRESYSGVTLDPRGI



'MGLLGILCFLIFLGKTWGQEQTYVISAPKIFRVGASENIVIQVYGYTEAFDATISIKSYPDKKFSYSSGHVHLSSENKFQNSAILTIQPKQLPGGQNPVSYVYLEVVSKHFSKSKRMPITYDNGFLFIHTDKPVYTPDQSVKVRVYSLNDDLKPAKRETVLTFIDPEGSEVDMVEEIDHIGIISFPDFKIPSNPRYGMWTIKAKYKEDFSTTGTAYFEVKEYVLPHFSVSIEPEYNFIGYKNFKNFEITIKARYFYNKVVTEADVYITFGIREDLKDDQKEMMQTAMQNTMLINGIAQVTFDSETAVKELSYYSLEDLNNKYLYIAVTVIESTGGFSEEAEIPGIKYVLSPYKLNLVATPLFLKPGIPYPIKVQVKDSLDQLVGGVPVTLNAQTIDVNQETSDLDPSKSVTRVDDGVASFVLNLPSGVTVLEFNVKTDAPDLPEENQAREGYRAIAYSSLSQSYLYIDWTDNHKALLVGEHLNIIVTPKSPYIDKITHYNYLILSKGKIIHFGTREKFSDASYQSINIPVTQNMVPSSRLLVYYIVTGEQTAELVSDSVWLNIEEKCGNQLQVHLSPDADAYSPGQTVSLNMATGMDSWVALAAVDSAVYGVQRGAKKPLERVFQFLEKSDLGCGAGGGLNNANVFHLAGLTFLTNANADDSQENDEPCKEILRPRRTLQKKIEEIAAKYKHSVVKKCCYDGACVNNDETCEQRAARISLGPRCIKAFTECCVVASQLRANISHKDMQLGRLHMKTLLPVSKPEIRSYFPESWLWEVHLVPRRKQLQFALPDSLTTWEIQGVGISNTGICVADTVKAKVFKDVFLEMNIPYSVVRGEQIQLKGTVYNYRTSGMQFCVKMSAVEGICTSESPVIDHQGTKSSKCVRQKVEGSSSHLVTFTVLPLEIGLHNINFSLETWFGKEILVKTLRVVPEGVKRESYSGVTLDPRGIYGTISRRKEFPYRIPLDLVPKTEIKRILSVKGLLVGEILSAVLSQEGINI

'MGLLGILCFLIFLGKTWGQEQTYVISAPKIFRVGASENIVIQVYGYTEAFDATISIKSYPDKKFSYSSGHVHLSSENKFQNSAILTIQPKQLPGGQNPVSYVYLEVVSKHFSKSKRMPITYDNGFLFIHTDKPVYTPDQSVKVRVYSLNDDLKPAKRETVLTFIDPEGSEVDMVEEIDHIGIISFPDFKIPSNPRYGMWTIKAKYKEDFSTTGTAYFEVKEYVLPHFSVSIEPEYNFIGYKNFKNFEITIKARYFYNKVVTEADVYITFGIREDLKDDQKEMMQTAMQNTMLINGIAQVTFDSETAVKELSYYSLEDLNNKYLYIAVTVIESTGGFSEEAEIPGIKYVLSPYKLNLVATPLFLKPGIPYPIKVQVKDSLDQLVGGVPVTLNAQTIDVNQETSDLDPSKSVTRVDDGVASFVLNLPSGVTVLEFNVKTDAPDLPEENQAREGYRAIAYSSLSQSYLYIDWTDNHKALLVGEHLNIIVTPKSPYIDKITHYNYLILSKGKIIHFGTREKFSDASYQSINIPVTQNMVPSSRLLVYYIVTGEQTAELVSDSVWLNIEEKCGNQLQVHLSPDADAYSPGQTVSLNMATGMDSWVALAAVDSAVYGVQRGAKKPLERVFQFLEKSDLGCGAGGGLNNANVFHLAGLTFLTNANADDSQENDEPCKEILRPRRTLQKKIEEIAAKYKHSVVKKCCYDGACVNNDETCEQRAARISLGPRCIKAFTECCVVASQLRANISHKDMQLGRLHMKTLLPVSKPEIRSYFPESWLWEVHLVPRRKQLQFALPDSLTTWEIQGVGISNTGICVADTVKAKVFKDVFLEMNIPYSVVRGEQIQLKGTVYNYRTSGMQFCVKMSAVEGICTSESPVIDHQGTKSSKCVRQKVEGSSSHLVTFTVLPLEIGLHNINFSLETWFGKEILVKTLRVVPEGVKRESYSGVTLDPRGIYGTISRRKEFPYRIPLDLVPKTEIKRILSVKGLLVGEILSAVLSQEGINI