In [1]:
import subprocess
from pathlib import Path
import re
import string 
import random

import drivers
import parsers

import pandas as pd

import plotly.express as px

from Bio import AlignIO
from Bio.PDB.PDBParser import PDBParser
import Bio.PDB.Dice as BPD

# functions

In [3]:
def get_neff(alignment):
    """
    Calculate the Neff of an MSA.

    Neff is defined as the number of clusters at 90% ID over 33% length.
    The alignment is first passed through `fa_strict` into a temporary file
    (`tmp.afa` in the current directory), which is then clustered with
    `run_mmclust`; the distinct values of the second output column are counted.

    Parameters
    ----------
    alignment : str or pathlib.Path
        Path to the alignment file.

    Returns
    -------
    str
        The count printed by `wc -l`, with surrounding whitespace removed
        (the raw output ends in a newline, which would otherwise put blank
        rows into a CSV written with `print`).

    Raises
    ------
    subprocess.CalledProcessError
        If a shell command line exits with a non-zero status. For the
        pipeline this is the status of its last command (`wc`).
    """
    import shlex  # local import: only used to quote the path for the shell

    tmp_alignment = Path('tmp.afa')
    try:
        subprocess.run(f'fa_strict -l=0 {shlex.quote(str(alignment))} > tmp.afa',
                       shell = True,
                       check = True)

        p1 = subprocess.run('run_mmclust tmp.afa -w=0 -s=0.9 -c=.33 -lin | cut -f2 | sort | uniq | wc -l',
                            shell = True,
                            text = True,
                            check = True,
                            capture_output = True
                            )
    finally:
        # Always remove the scratch file, even when one of the commands fails.
        if tmp_alignment.exists():
            tmp_alignment.unlink()

    return p1.stdout.strip()
    
def colabfold_logfile_to_table(logfile, output):
    """
    Convert a colabfold log file into a headerless CSV table.

    Lines containing 'Query' set the current job name (5th whitespace-separated
    field). Every line containing 'took' emits one row:
    jobname, model (3rd field), time (5th field, text before the first '.'),
    pLDDT (8th field).
    """
    with open(output, 'w') as table_handle, open(logfile) as log_handle:
        for raw_line in log_handle:
            fields = raw_line.strip().split()

            if 'Query' in raw_line:
                jobname = fields[4]

            if 'took' in raw_line:
                model, time_raw, pLDDT = fields[2], fields[4], fields[7]
                # keep only the part of the timing before the decimal point
                whole_time = time_raw.partition(".")[0]
                row = ",".join([jobname, model, whole_time, pLDDT])
                table_handle.write(row + "\n")
                    
def colabfold_logtable_to_df(table):
    """
    Read a headerless colabfold log table (CSV) into a DataFrame with the
    columns profile, model, time and score.
    """
    column_names = ['profile', 'model', 'time', 'score']
    return pd.read_csv(table, header = None, names = column_names)

In [4]:
def get_top_ranked_models(af_output_dir):
    """
    Recursively collect the *.pdb files under `af_output_dir` whose path
    contains "rank_1" (the top-ranked alphafold/colabfold model).

    Note that the test is made against the whole path string, not only the
    file name.

    Returns a list of pathlib.Path objects.
    """
    return [pdb_path
            for pdb_path in Path(af_output_dir).rglob('*.pdb')
            if "rank_1" in str(pdb_path)]

def pdb_id_generator(size=4, chars=string.ascii_uppercase + string.digits):
    """
    Build a random string of `size` characters (4 by default), each drawn
    independently from `chars` (upper-case letters and digits by default).
    Adapted from:
    https://stackoverflow.com/questions/2257441/random-string-generation-with-upper-case-letters-and-digits
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

def dali_import_private_structure(filelist, output):
    """
    Import private structures into DALI's internal format.

    Runs `import.pl` once per file, assigning each file a random 4-character
    PDB ID, and writes the lookup table `pdbids.csv` (no header; columns:
    file stem with the "_unrelaxed_model_N_rank_1" suffix removed, full file
    stem, assigned PDB ID) into `output`.

    Parameters
    ----------
    filelist : iterable of pathlib.Path (or str)
        Structure files to import.
    output : str or pathlib.Path
        Directory passed to `import.pl --dat`; created if it does not exist.

    Raises
    ------
    subprocess.CalledProcessError
        If `import.pl` exits with a non-zero status.
    """
    output_dir = Path(output)
    output_dir.mkdir(parents = True, exist_ok = True)
    file_to_pdb_lookup_table = output_dir / 'pdbids.csv'

    # IDs handed out in this call. A repeated random ID would map two
    # structures to the same name, so draws are repeated until unused.
    used_ids = set()

    with open(file_to_pdb_lookup_table, 'w') as o:
        for file in filelist:
            file = Path(file)

            file_base = re.sub("_unrelaxed_model_._rank_1", "", file.stem)

            pdbid = pdb_id_generator()
            while pdbid in used_ids:
                pdbid = pdb_id_generator()
            used_ids.add(pdbid)

            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through unchanged.
            subprocess.run(['import.pl',
                            '--pdbfile', str(file),
                            '--pdbid', pdbid,
                            '--dat', str(output)],
                           check = True
                          )

            print(file_base,
                  file.stem,
                  pdbid,
                  sep = ",",
                  file = o)
            
def parse_dali(dalioutput):
    """
    Parse the hit lines of a DALI 'summary' output into a DataFrame.

    The query ID is taken from the line starting with "# Query:". A hit line
    is any line that starts with three spaces, which in the summary table
    corresponds to single-digit hit numbers; i.e. only the top 9 hits are
    collected (known limitation, kept as is).

    Returns
    -------
    pandas.DataFrame
        Columns: query_id, chain, z, rmsd, lali, nres, identity, description.
        z, rmsd, lali, nres and identity are converted to numeric dtypes when
        every value in the column parses as a number, so that sorting on them
        is numeric rather than lexicographic. A file without hit lines gives
        an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a line starting with three spaces does not split into 8 fields.
    """
    columns = ["query_id", "chain", "z", "rmsd", "lali", "nres", "identity", "description"]
    numeric_columns = ["z", "rmsd", "lali", "nres", "identity"]

    data = []
    query = None  # stays None if a hit line appears before any "# Query:" line
    with open(dalioutput) as f:
        for line in f:
            if line.startswith("# Query:"):
                query = line.strip().split()[2]
            if line.startswith("   "):
                # the hit number (first field) is not stored
                hit_no, chain, z, rmsd, lali, nres, identity, description = line.strip().split(maxsplit=7)
                data.append({"query_id" : query,
                             "chain" : chain,
                             "z" : z,
                             "rmsd" : rmsd,
                             "lali" : lali,
                             "nres" : nres,
                             "identity" : identity,
                             "description" : description})

    df = pd.DataFrame(data, columns = columns)

    for column in numeric_columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # leave the column as text if any value is not a number
            pass
    return df

In [5]:
def slice_pdb(pdbid, pdbfile, start, end, output):
    """
    Write the start-end segment of chain 'A' of a PDB file to a new file.

    `pdbfile` is parsed with Bio.PDB's PDBParser under the structure ID
    `pdbid`, and the segment is written to `output` with Bio.PDB.Dice.extract.
    The chain ID is hard-coded to 'A'.

    According to the original author's note, `end` may be larger than the
    length of the molecule.
    """
    model_structure = PDBParser().get_structure(pdbid, pdbfile)
    BPD.extract(model_structure, 'A', start, end, output)

I tried adding this code on line 1016 in `batch.py`, but it didn't work
```
summary_file = result_dir / Path('summary.txt')
if summary_file.exists():
    summary_file.unlink()

with summary_file.open('a') as f:
    for model, score_dict in outs.items():
        for metric, scores in score_dict.items():
            print(jobname,model,metric,scores, sep = ",", file = f)
```

# MSAs

## CAS I-B

Kira sent me a table of MSAs corresponding to CAS I-B (`/profiles/casIB/casIBmetadata.csv`). I obtained the sequences from her directory:
>for id in `cut -f1 casIBmetadata.csv -d","`; do cp /net/frosty/vol/export1/proteome/cascl19/projFTP/profiles/${id}.FASTA .; done

Calculate the Neff

In [None]:
# One "profile,neff" row per CAS I-B alignment (no header).
casib_dir = Path('profiles/casIB/')
with open('profiles/casIB/neff.csv', 'w') as neff_table:
    for msa_path in casib_dir.rglob('*.FASTA'):
        print(msa_path.stem, get_neff(msa_path), sep = ",", file = neff_table)

In [None]:
Make A3Ms.

In [None]:
for file in Path('profiles/casIB/').rglob('*.FASTA'):
    a3m = file.with_suffix('.a3m')
    !reformat.pl fas a3m -M first {file} {a3m}

## Natalya

Natalya sent me 75 MSAs of unknown origin. Each MSA has the consensus (*.afac) or not (*.afa).

In [None]:
# One "profile,neff" row per consensus-free alignment (*.afa), no header.
natalya_dir = Path('profiles/natalya/')
with open('profiles/natalya/neff.csv', 'w') as neff_table:
    for msa_path in natalya_dir.rglob('*.afa'):
        print(msa_path.stem, get_neff(msa_path), sep = ",", file = neff_table)


In [None]:
Make A3Ms.

In [None]:
for file in Path('profiles/natalya/').rglob('*.afac'):
    a3m = file.with_suffix('.a3m')
    !reformat.pl fas a3m -M first {file} {a3m}

## crAssphage

I downloaded the `278` crAssphage profiles from Natalya's FTP site. All of them include the consensus:
>wget -m -e robots=off --no-parent  ftp.ncbi.nih.gov/pub/yutinn/crassfamily_2020/profiles/

The longest is `VP02650` at 1608 positions.

Calculate the Neff

In [None]:
# One "profile,neff" row per crAssphage alignment (no header).
crass_dir = Path('profiles/crass/')
with open('profiles/crass/neff.csv', 'w') as neff_table:
    for msa_path in crass_dir.rglob('*.afac'):
        print(msa_path.stem, get_neff(msa_path), sep = ",", file = neff_table)

In [None]:
Make A3Ms.

In [None]:
for file in Path('profiles/crass/').rglob('*.afac'):
    a3m = file.with_suffix('.a3m')
    !reformat.pl fas a3m -M first {file} {a3m}

## Hypervariable ORFs

In my gutphage paper, I found DGRs targeting genes. Get ones that have unknown function, and filter the MSA b/c I didn't use a good coverage criteria.

In [None]:
df_dgr = pd.read_csv('/home/benlersm/projects/human_virome_project/DGR/DGRs_hhpred.csv').rename(columns = {'query' : 'prot_id'})
prot_ids = df_dgr.query('prob < 60').prot_id.tolist()
for prot_id in prot_ids:
    a3m = Path(f'/home/benlersm/projects/human_virome_project/DGR/rt_contigs/VR_orfs/{prot_id}.a3m')
    #!cp /home/benlersm/projects/human_virome_project/DGR/rt_contigs/VR_orfs/{prot_id}.a3m profiles/dgr/
    !hhfilter -i {a3m} -o profiles/dgr/{prot_id}.a3m -id 100 -cov 90 -qid 33
    !reformat.pl a3m fas profiles/dgr/{prot_id}.a3m profiles/dgr/{prot_id}.afa

delete the smaller alignment files
```
find . -name "*.a3m" -size -10k  -delete
find . -name "*.afa" -size -10k  -delete
```

manually restore 2 a3m files


In [None]:
# One "profile,neff" row per filtered DGR alignment (no header).
dgr_profiles_dir = Path('profiles/dgr/')
with open('profiles/dgr/neff.csv', 'w') as neff_table:
    for msa_path in dgr_profiles_dir.rglob('*.afa'):
        print(msa_path.stem, get_neff(msa_path), sep = ",", file = neff_table)

## Metadata

In [37]:
# Stack the per-dataset neff.csv tables into a single frame.
df_list = [pd.read_csv(neff_csv, header = None, names = ["profile", "neff"])
           for neff_csv in Path('profiles/').rglob('neff.csv')]
df_neff = pd.concat(df_list)

Get the length of each MSA using Biopython http://biopython.org/DIST/docs/tutorial/Tutorial.html#:~:text=You%E2%80%99ll-,notice,-in%20the%20above

In [38]:
# (directory, glob pattern) pairs, searched recursively in this order.
alignment_sources = [('profiles/casIB/', '*.FASTA'),
                     ('profiles/', '*.afac'),
                     ('profiles/', '*.afa')]

# One record per alignment file: profile name and number of alignment columns.
data = [{"profile" : msa_path.stem,
         "length" : AlignIO.read(msa_path, "fasta").get_alignment_length()}
        for source_dir, pattern in alignment_sources
        for msa_path in Path(source_dir).rglob(pattern)]

df_len = pd.DataFrame.from_records(data).drop_duplicates()

In [39]:
# Attach the alignment length to every profile that has a Neff value.
df_metadata = df_neff.merge(df_len, how = 'left', on = 'profile')

# Experiments

## Select MSAs for analysis

Get deep MSAs (Neff > 20), and ~40 short and shallow MSAs (Neff <= 20 and length < 150, as in the query below).

In [None]:
# Deep MSAs, plus the short MSAs among the shallow ones.
df_deep = df_metadata.query('neff > 20')
df_shallow = df_metadata.query('neff <= 20 and length < 150')

profile_list_neff20 = df_deep['profile'].tolist()
profile_list_shallow = df_shallow['profile'].tolist()

profile_list = [*profile_list_neff20, *profile_list_shallow]
len(profile_list)

In [None]:
for profile in profile_list:

    msa = list(Path('profiles/').rglob(f'{profile}.a3m'))[0]
    if not Path(msa).exists():
        print(msa)
    
    !cp {msa} input/exp1/

## Exp 1: Number of models versus PAE

Test if running with 1 or 5 models does anything (5 is the max).

Make an input folder

run colabfold

```
CUDA_VISIBLE_DEVICES=0 colabfold_batch --num-models 1 input/exp1/ output/exp1/models1/ &> /dev/null
CUDA_VISIBLE_DEVICES=1 colabfold_batch --num-models 5 input/exp1/ output/exp1/models5/
```



## Exp 2 : Number of recycles

The results from exp1 show that adding 5 models increases confidence by ~0.3 on average, so I will just use one model from here on out to speed things up. In my dataset of 63 MSAs with Neff > 20, it took ~3 hours to fold these with all other parameters set as default.

Test explicitly setting the number of recycles.

```
CUDA_VISIBLE_DEVICES=0 colabfold_batch --num-models 1 --num-recycle 10 input/exp1/ output/exp2/recyle10/
CUDA_VISIBLE_DEVICES=1 colabfold_batch --num-models 1 --num-recycle 5 input/exp1/ output/exp2/recyle5/
CUDA_VISIBLE_DEVICES=2 colabfold_batch --num-models 1 --num-recycle 3 input/exp1/ output/exp2/recyle3/
```

# Results

## Exp 1

In [131]:
# Scores of the single-model run.
models1_log = 'output/exp1/models1/log'
colabfold_logfile_to_table(models1_log + '.txt', models1_log + '.csv')

df_model1 = colabfold_logtable_to_df(models1_log + '.csv').rename(
    columns = {"score" : "score_model1"}
)

df_model1.sample()

Unnamed: 0,profile,model,time,score_model1
80,VP02569,model_3,0,66.4


In [132]:
# The 5-model run was accidentally written to the models10 folder.
models5_log = 'output/exp1/models10/log'
colabfold_logfile_to_table(models5_log + '.txt', models5_log + '.csv')

# Keep the best-scoring model per profile.
models5_all = colabfold_logtable_to_df(models5_log + '.csv')
models5_best = (models5_all
                .sort_values('score', ascending = False)
                .drop_duplicates('profile')
               )
df_model5 = (models5_best
             .drop(['model', 'time'], axis = 1)
             .rename(columns = {"score" : "score_model5"})
            )
df_model5.sample()

Unnamed: 0,profile,score_model5
430,VP02663,81.8


In [133]:
# Score gained by taking the best of five models instead of a single model.
df = df_model1.merge(df_model5, how = 'left', on = 'profile')
df["delta_5_models_vs_1_model"] = df.score_model5 - df.score_model1

# Add the Neff of each profile.
df2 = df.merge(df_neff, how = 'left', on = 'profile')
df2.sample()

Unnamed: 0,profile,model,time,score_model1,score_model5,delta_5_models_vs_1_model,neff
40,VP00411,model_3,0,86.1,86.1,0.0,24


In [None]:
# Preview only; rendering the whole table bloats the notebook output.
df2.head()

In [None]:
# Best-of-five score against MSA depth.
px.scatter(
    data_frame = df2,
    x = 'neff',
    y = 'score_model5',
    template = 'simple_white',
)

In [None]:
# Distribution of the score gain from running five models instead of one.
# (alternative view: px.scatter(df2, x= 'neff', y= 'score_model5', color = 'delta_5_models_vs_1_model'))
px.box(
    data_frame = df2,
    y = 'delta_5_models_vs_1_model',
    template = 'simple_white',
    width = 100,
)

In [154]:
#df2.sort_values('delta_5_models_vs_1_model', ascending = False)

## Exp 2

In [156]:
# Output folder (as spelled in the paths used here) for each --num-recycle setting.
# NOTE(review): the colabfold commands in the Exp 2 section write to
# 'recyle5' and 'recyle3'; confirm those folders were renamed on disk.
recycle_dirs = {10: 'recyle10', 5: 'recycle5', 3: 'recycle3'}

recycle_tables = {}
for n_recycles, dirname in recycle_dirs.items():
    run_log = f'output/exp2/{dirname}/log'
    colabfold_logfile_to_table(f'{run_log}.txt', f'{run_log}.csv')
    run_table = colabfold_logtable_to_df(f'{run_log}.csv')
    run_table["num_recycles"] = n_recycles
    recycle_tables[n_recycles] = run_table

df_recycle10 = recycle_tables[10]
df_recycle5 = recycle_tables[5]
df_recycle3 = recycle_tables[3]

df_recycle3.sample()

df_recycle = pd.concat([df_recycle3, df_recycle5, df_recycle10])

In [None]:
# Score distribution for each number of recycles.
px.box(
    data_frame = df_recycle,
    x = 'num_recycles',
    y = 'score',
    template = 'simple_white',
)

# DALI

## Install


The current version of DALI was not compiled using `make parallel`, because there is no `mpicompare` in the `bin/` directory of DALI:

```
mpirun was unable to launch the specified application as it could not access
or execute an executable:

Executable: /usr/local/DALI/5.0/bin/mpicompare
Node: sge897

while attempting to start process rank 0.
```

So, I reinstalled my own copy for now in the `hhsuite_db` folder.

However, I get a different openmpi error when trying to run in parallel:

```
# /usr/local/openmpi/1.4.1/bin/mpirun --np 10 /panfs/pan1.be-md.ncbi.nlm.nih.gov/hhsuite_db/DaliLite.v5/bin/mpicompare output/exp1/models10/dat/ output/exp1/models10/dat/ DALICON T < dalicon_input > /dev/null
[sge619:12640] [[8402,0],0] mca_oob_tcp_recv_handler: invalid message type: 15
[sge766:05511] [[8402,0],1] mca_oob_tcp_recv_handler: invalid message type: 15
[sge586:18066] [[8402,0],2] mca_oob_tcp_recv_handler: invalid message type: 15
[sge992:07001] [[8402,0],4] mca_oob_tcp_recv_handler: invalid message type: 15
[sge322:03494] [[8402,0],5] mca_oob_tcp_recv_handler: invalid message type: 15
[sge354:19330] [[8402,0],3] mca_oob_tcp_recv_handler: invalid message type: 15
[sge639:07790] [[8402,0],6] mca_oob_tcp_recv_handler: invalid message type: 15
[sge1049:06962] [[8402,0],7] mca_oob_tcp_recv_handler: invalid message type: 15
[sge1040:13885] [[8402,0],9] mca_oob_tcp_recv_handler: invalid message type: 15
[sge672:22537] [[8402,0],8] mca_oob_tcp_recv_handler: invalid message type: 15
```

## database Setup

DALI requires that PDB files be converted into an internal format. There are a few ways to do this. Ultimately, I chose option 3, which uses the outputs of both option 1 and option 2.

Basically, make a mirror of the PDB database using the `--rsync` option of `import.pl` (option 1). This will try to import every file in the PDB archive, but will crash. Then, get a list of cluster representatives from the PDB website (option 2). Use this list to run `import.pl` for the PDB archive.

In the future, try just getting a copy of the PDB archive without running `import.pl --rsync`. 

### Option 1

The first option is to run the `import.pl` script with `--rsync`, which downloads the entire PDB. For each PDB entry, it gets a `*.ent.gz` file and then converts every single entry to the internal data format. I'll try this option first:
>import.pl --rsync --pdbmirrordir dbs/pdb/ --dat dbs/dali/dali_pdb/DAT/ --clean

However, I get this error:
```
Reading DSSP file
At line 288 of file ../src/util.f
Fortran runtime error: Bad integer for item 1 in list input
Error in puu: /usr/local/DALI/5.0/bin/puu 5a1vU 32240.tmp 5a1v.dssp
```

There is something wrong with one of the PDB chains that is breaking the automatic import. Once `import.pl` breaks, it creates a `dali.lock` file that prevents subsequent imports. The second problem is that it appears that `250000` files is the max that can be put into a folder on our system, and there are ~450000 PDB entries. 





### option 2

Rather than doing the whole PDB via `import.pl --rsync`, just do a subset. 

PDB regularly makes a subset clustered with MMSeqs2 at various thresholds, available [here](https://www.rcsb.org/docs/programmatic-access/file-download-services#:~:text=4HHB.A/download-,Sequence%20clusters%20data,-Results%20of%20the). I can download the PDB files using the PDB script `batch_download.sh`, available [here](https://www.rcsb.org/docs/programmatic-access/batch-downloads-with-shell-script). The problem with this approach is that **some pdb entries are too large to download the .pdb.gz file and would take ~1 day to complete**


In [None]:
#There are ~57k lines in bc-70.out
#each line is a cluster. I am assuming the first entry is the representative
#batch_download requires a file with a comma-separated list of IDs:

pdb_ids = []
seen_ids = set()
with open('dbs/pdb70/bc-70.out') as f:
    for line in f:

        members = line.split()
        if not members:
            # skip blank lines instead of failing on members[0]
            continue

        #get the first listed entry
        rep = members[0]

        #get the base PDB id: everything before the first underscore.
        #(re.sub("_.", "") only removes one character after the underscore,
        # so a multi-character chain such as 1ABC_AA would become 1ABCA)
        pdb_id = rep.split("_", 1)[0]

        #several representatives can come from the same entry; list it once
        if pdb_id not in seen_ids:
            seen_ids.add(pdb_id)
            pdb_ids.append(pdb_id)

with open('dbs/pdb70/bc-70.csv', 'w') as o:
    print(",".join(pdb_ids), file = o)


Download the PDB files. This will take ~1day
>./batch_download.sh -f pdb70/bc-70.csv -o pdb70/ -p

Import the files **TBD**. Again, some files crash `import.pl` and create a `dali.lock` file, so i have to import them individually.

### Option 3

Option 2 is too slow. PDB provides an rsync script that mirrors the archive, available [here](https://www.rcsb.org/docs/programmatic-access/file-download-services#:~:text=the%20ftp%20protocol.-,Automated%20download%20of%20data,-The%20URLs%20in). My guess is this is what `import.pl --rsync` uses.

So, use the `*.ent` files generated in option 1, and the list of representatives in `option 2`, to import the files.

Again, If I use `import.pl --pdblist pdb70_subset.list`, where `pdb70_subset.list` is a list of file paths to `*.ent.gz` files, I run into the same issue where a bad file crashes the program. So, I import individually.

In [61]:
# First listed entry of every cluster line, reduced to its 4-character
# PDB ID and upper-cased.
with open('dbs/dali/dali_pdb/bc-70.out') as cluster_file:
    mmreps = [cluster_line.strip().split()[0][0:4].upper()
              for cluster_line in cluster_file]
len(mmreps)

57256

Loop through the PDB directory containing all PDB files. If the file is a representative, run `import.pl` to extract all the chains.

A better way would be to loop through the representatives, find the file using the middle two characters of the PDB ID, and run import.pl. However, I already did it this way, so whatever.

In [None]:
# Set for constant-time membership tests (mmreps is a ~57k-element list).
mmrep_ids = set(mmreps)
dat_dir = Path('dbs/dali/dali_pdb/pdb70/')

i = 0
for file in Path('dbs/pdb/').rglob('*.ent.gz'):

    # File names look like pdb1abc.ent.gz. str.strip() removes *characters*,
    # not a prefix/suffix, so .strip('pdb').strip('.ent') truncates IDs that
    # end in 'e', 'n' or 't' (e.g. 1ten -> 1); match the name explicitly.
    name_match = re.fullmatch(r'pdb(\w{4})\.ent\.gz', file.name)
    if name_match is None:
        continue
    pdb_id = name_match.group(1)

    if pdb_id.upper() in mmrep_ids:
        i += 1

        #check if the file has been imported already
        already_imported = any(dat_dir.glob(f'{pdb_id}*.dat'))
        if not already_imported:

            print(f"importing {pdb_id}")
            # No check=True on purpose: a failing import.pl is detected
            # through the dali.lock file it leaves behind.
            subprocess.run(f'import.pl --pdbfile {file} --pdbid {pdb_id} --dat dbs/dali/dali_pdb/pdb70/ --clean',
                           shell = True,
                          )

            if Path('dali.lock').exists():
                print(f"{pdb_id} crashed import.pl, skipping")
                Path('dali.lock').unlink()

Import.pl extracts all chains from a PDB file, but only some of them are actually representatives. Remove the non-representative DAT files.

In [89]:
mmreps = []
with open('dbs/dali/dali_pdb/bc-70.out') as f:
    for line in f:

        members = line.split()
        if not members:
            # skip blank lines instead of failing on members[0]
            continue

        #get the first listed entry
        rep = members[0]

        #Double check everything is uppercase
        # NOTE(review): this also upper-cases the chain ID, so chains 'a' and
        # 'A' of one entry are treated as the same; confirm this is intended.
        mmreps.append(rep.upper())

# Set for constant-time lookups; mmreps stays a list so that the count
# printed below is unchanged.
mmreps_lookup = set(mmreps)

i=0
files_to_delete = []
for file in Path('dbs/dali/dali_pdb/pdb70/').rglob('*.dat'):
    # DAT file stems are the 4-character PDB ID followed by the chain ID
    pdb_id = file.stem[0:4]
    chain = file.stem[4:]
    pdb_id_with_chain = (pdb_id + "_" + chain).upper()

    if pdb_id_with_chain not in mmreps_lookup:
        files_to_delete.append(file)
    else:
        i+=1
print(f'there are {len(mmreps)} representatives')
print(f'Of those, there {i} that have a matching .DAT file')
print(f'will delete {len(files_to_delete)} files')

there are 57256 representatives
Of those, there 47893 that have a matching .DAT file
will delete 94855 files


In [90]:
# Remove the DAT files of the non-representative chains collected in files_to_delete.
for stale_dat in files_to_delete:
    stale_dat.unlink()

Note the discrepancy between the number of MMseqs2 cluster representatives and the number of matching DAT files.

I checked a couple of PDB IDs. It appears that `import.pl` doesn't produce a DATfile for some of them (E.g., 1a11). On the dali website, chains shorter than 30 AA are excluded, so it might be because of this. 

Also, if I use the PDB `batch_download.sh` script to download PDB files individually, I end up with `46828` files, not `~57000`.

So, I don't think there is something seriously wrong.

Make a tarball of the directory
>tar -czvf pdb70datfiles.tar.gz pdb70/ 

Make a list of the DAT files to constrain the search

In [92]:
# One DAT file stem (PDB ID + chain) per line.
pdb70_dat_dir = Path('dbs/dali/dali_pdb/pdb70/')
with open('dbs/dali/dali_pdb/pdb70.list', 'w') as list_handle:
    for dat_path in pdb70_dat_dir.rglob('*.dat'):
        list_handle.write(f'{dat_path.stem}\n')

## Run

### Using openmpi

```
#make a swarmfile
echo DaliLite.v5/bin/dali.pl --cd1 T43NA --dat1 output/exp1/models10/dat/ --dat2 dbs/dali/dali_pdb/pdb70/ --clean --oneway --hierarchical --repset small.list --db small.list --np 10 --MPIRUN_EXE /usr/local/openmpi/1.4.1/bin/mpirun > test.swarm

#make a SGEfarm submit .sh script


#edit the .sh file, and add the following line
#$ -pe openmpi 10

#submit
qsub job.submitSGEfarm.68050.1.csh
```

### Using the command line

An example command:
>dali.pl --cd1 T43NA --dat1 output/exp1/models10/dat/ --dat2 dbs/dali/dali_pdb/pdb70/ --clean --oneway --hierarchical --repset pdb70.list --db pdb70.list --outfmt "summary,alignments"

- `cd1` : Name of the (fake) PDB ID of the .DAT file
- `dat1` : location of the .DAT file. If using a PDB file as a query instead, this is where a temporary .DAT file is made (defaults to `./DAT`, throws an error if this dir isn't present)
- `dat2` : location of the database .DAT files
- `repset` : only consider a subset of files
- `db` : I'm not sure if this is needed or not when using `repset`.

Other params

-`pdbid1` : name of the output file (defaults to mol1A)

The program makes a lot of temporary files, so I am not sure I can parallelize using a swarmfile. And, given the current problems with `openmpi`, I can only use 1 thread. In a test run, it took ~50 mins for 1 query against the pdb70 subset.

I think one workaround is to use BioWulf


### Using BioWulf

Make a mini sbatch command on CBBDev machine

In [7]:
#Get the alphafold scores
colabfold_logfile_to_table('output/exp1/models10/log.txt', 'output/exp1/models10/log.csv')
df_model10 = colabfold_logtable_to_df('output/exp1/models10/log.csv')
            
df_model10.sample()

Unnamed: 0,profile,model,time,score
37,VP00399,model_5,0,77.9


In [88]:
#Make DAT files for the top structures
top_models = get_top_ranked_models('output/exp1/models10/')
dali_import_private_structure(top_models, 'output/exp1/models10/dat/')

In [8]:
# Lookup table written by dali_import_private_structure (no header row).
pdbid_columns = ["profile", "modelfile", "pdbid"]
df_dali = pd.read_csv('output/exp1/models10/dat/pdbids.csv',
                      header = None,
                      names = pdbid_columns)
df_dali.sample()

Unnamed: 0,profile,modelfile,pdbid
65,VP02626,VP02626_unrelaxed_model_4_rank_1,BNUW


In [9]:
# Attach the scores of all models of a profile to its imported structure.
df_dali2 = df_dali.merge(df_model10, how = 'left', on = 'profile')
df_dali2.head(6)

Unnamed: 0,profile,modelfile,pdbid,model,time,score
0,VP02761,VP02761_unrelaxed_model_3_rank_1,X7WL,model_3,0,94.5
1,VP02761,VP02761_unrelaxed_model_3_rank_1,X7WL,model_4,0,94.3
2,VP02761,VP02761_unrelaxed_model_3_rank_1,X7WL,model_5,0,94.0
3,VP02761,VP02761_unrelaxed_model_3_rank_1,X7WL,model_1,0,92.4
4,VP02761,VP02761_unrelaxed_model_3_rank_1,X7WL,model_2,0,92.9
5,VP02726,VP02726_unrelaxed_model_3_rank_1,A9HD,model_3,0,91.3


In [10]:
# Best-scoring row per profile, restricted to scores above 70.
best_first = df_dali2.sort_values('score', ascending = False)
best_per_profile = best_first.drop_duplicates('profile')
df_dali3 = best_per_profile.query('score > 70')
df_dali3.shape

(76, 6)

Make a batch file

In [169]:
with open('dali.sh', 'w') as o:
    # sbatch only accepts a batch script whose first line is an
    # interpreter ("#!") line.
    print('#!/bin/bash', file = o)

    # One dali.pl search per selected structure; DALI names the chain
    # <pdbid>A, hence the appended "A".
    for pdbid in df_dali3.pdbid.tolist():
        print(f'dali.pl --cd1 {pdbid}A --dat1 output/exp1/models10/dat/ --dat2 dbs/dali/dali_pdb/pdb70/ --clean --oneway --hierarchical --repset pdb70.list --db pdb70.list --outfmt "summary,alignments" --np $SLURM_NTASKS --MPIRUN_EXE /usr/local/OpenMPI/4.1.1/gcc-9.2.0/bin/mpirun',
              file = o)

sbatch --partition=multinode --constraint=x2650 --ntasks=64 --ntasks-per-core=1 --time=08:00:00 --qos=turbo --exclusive dali.sh

It took 20 mins to run one protein

## Results 

In [6]:
# Parse every DALI summary and stack the hits into one table.
df_list = [parse_dali(summary_path)
           for summary_path in Path('output_dali/').rglob('*.txt')]
df2 = pd.concat(df_list).reset_index(drop = True)

In [13]:
# The Z-scores are parsed from text. Make them numeric before sorting,
# otherwise the order is lexicographic ("9.5" would rank above "10.2").
df3 = (df2.assign(z = pd.to_numeric(df2["z"], errors = 'coerce'))
          .sort_values('z', ascending = False)
      )

#add A to the PDB ID chain so it matches DALI output
#(.assign builds a new frame instead of writing a column into the
# filtered df_dali3, which can trigger SettingWithCopyWarning)
df_dali3 = df_dali3.assign(pdbid2 = df_dali3["pdbid"].astype('str') + "A")

df4 = pd.merge(df3,
               df_dali3,
               how = 'left',
               left_on = 'query_id',
               right_on = 'pdbid2')

In [None]:
# Spot-check one random row of the profile metadata.
df_metadata.sample(n = 1)

In [22]:
# Export the DALI hits of the Cas profiles, best Z-score first within each
# profile. z is made numeric first so the ordering is not lexicographic.
cas_columns = ["profile", "score", "query_id", "chain", "z", "rmsd", "lali", "nres", "identity", "description"]

(df4.assign(z = pd.to_numeric(df4["z"], errors = 'coerce'))
     .query('profile.str.contains("cas") or profile.str.contains("Cas")')
     .sort_values(["profile",
                   "z"],
                  ascending = [False,False])
     .loc[:, cas_columns]
     .to_csv('Cas8.csv', index = False)
)

# Structure metadata

It would be highly desirable to have domains mapped onto known/predicted structures. There are a couple of strategies I can think of:

1. The Pfam FTP offers a mapping of Pfam domains --> known PDB structures [here](http://ftp.ebi.ac.uk/pub/databases/Pfam/mappings/pdb_pfam_mapping.txt). There are `~75700` mappings. The drawback is this is only pfam domains and known structures.
2. The Pfam FTP has a file for RoseTTAFold [here](http://ftp.ebi.ac.uk/pub/databases/Pfam/RoseTTAfold_aln/Pfam35.0.tar.gz). However, it is just a3m-formatted MSAs of each Pfam entry.
3. On the 'AlphaFold structures' tab of a given Pfam, there is a list of proteins "that match this family and have AlphaFold structures". This is nice, because it includes **predicted** structures in the AlphaFold database. However, there is no mapping of the coordinates of the Pfam domain onto the predicted structure and I don't know how to download this data. I think I would first have to get a table of Pfam-->UniProt ProteinID mappings, then get ProteinID mappings --> AlphaFold structure. To do the former, there is a [SPARQL](https://sparql.uniprot.org/sparql) query interface, but I don't know how to construct the query, so I contacted the helpdesk.
4. Entrez offers a [linkname](https://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html) to go from CDD-->known PDB structure. However, I can't find the coordinates of the domain on the structure, and it only includes known PDB structures.
