In [76]:
# Generate filtered subset of file in FASTA format.
#
# A record is a header line starting with ">" followed by one or more sequence
# lines. Records whose header contains `filterstr` (case-insensitive) are copied
# to `outstr` in full, so wrapped (multi-line) sequences are kept intact.
filestr = "pdb70_full.fasta"
filterstr = "g-protein"
outstr = "pdb70_filter_" + filterstr.replace(" ", "") + ".fasta"

# Read up to (roughly) 1GB of data
bytes_max = 1000000000

# Stop after this many matching records
seq_max = 4000
seq_count = 0

needle = filterstr.lower()   # hoisted: the filter text never changes inside the loop
kept_lines = []              # lines of every matching record, in file order
in_match = False             # True while we are inside a matching record
chars_read = 0

# Stream the file instead of loading up to 1GB into memory at once.
with open(filestr, "r") as f:
    for line in f:
        if line.startswith(">"):
            # A new record starts here; stop only once the previous matching
            # record has been copied completely.
            if seq_count == seq_max:
                break
            # Only header lines are tested against the filter.
            in_match = needle in line.lower()
            if in_match:
                seq_count += 1
        if in_match:
            kept_lines.append(line)
        # Same kind of cap as readlines(bytes_max): stop once the line that
        # crosses the limit has been consumed.
        chars_read += len(line)
        if chars_read >= bytes_max:
            break

# Join once rather than growing a string with += for every record.
outfile = "".join(kept_lines)

with open(outstr, "w") as f:
    f.write(outfile)

print("File with", seq_count, "sequences written to", outstr)

File with 181 sequences written to pdb70_filter_g-protein.fasta


In [None]:
# To convert FASTA to A3M, use the HHblits tool in the MPI Bioinformatics Toolkit: https://toolkit.tuebingen.mpg.de/tools/hhblits
# Make sure you remove the "#A3M#" header line from the resulting file, either before or right after loading it into ColabFold; otherwise ColabFold will break!

In [75]:
# Calculate average pLDDT from zip file
import zipfile
import json
import numpy as np

# Result archive to summarise (presumably a ColabFold result zip; confirm the path before running).
zipstr = "C:\\Users\\rainh\\Downloads\\transport_protein_mafft_ca73a.result.zip"

outscores = []  # mean pLDDT of each scores file found in the archive
outstd = []     # standard deviation of pLDDT for each scores file

# The context manager closes the archive even if reading or parsing fails.
with zipfile.ZipFile(zipstr) as z:
    for scores in (name for name in z.namelist() if "scores.json" in name):
        with z.open(scores) as file:
            # json.load parses the whole member, so the file may span
            # several lines (readline() would only see the first one).
            j = json.load(file)
        outscores.append(np.mean(j['plddt']))
        outstd.append(np.std(j['plddt']))

if outscores:
    print("Model averages:", outscores)
    print("Model stdevs:", outstd)
    # The value after "±" is the mean of the per-file standard deviations,
    # not the spread of the per-file averages.
    print("Mean pLDDT:", "{:.2f}".format(np.mean(outscores)), "±", "{:.2f}".format(np.mean(outstd)))
else:
    # Avoid printing "nan ± nan" (plus a NumPy warning) when nothing matched.
    print("No scores.json files found in", zipstr)

Model averages: [86.38197860962566, 86.20048128342245, 85.53844919786097, 84.9022459893048, 82.88689839572193]
Model stdevs: [18.40006954298565, 18.42004814231379, 16.65895969851431, 16.869512362897428, 17.46827287469974]
Mean pLDDT: 85.18 ± 17.56


In [72]:
# Calculate average PAE from zip file
import zipfile
import json
import numpy as np

# Result archive to summarise (presumably a ColabFold result zip; confirm the path before running).
zipstr = "C:\\Users\\rainh\\Downloads\\test4_kalign_3e556.result.zip"

pae_means = []  # mean PAE of each matching file in the archive
pae_stds = []   # standard deviation of PAE for each matching file

# The context manager closes the archive even if reading or parsing fails.
with zipfile.ZipFile(zipstr) as z:
    for scores in (name for name in z.namelist() if "predicted_aligned_error_v1.json" in name):
        with z.open(scores) as file:
            # json.load parses the whole member, so the file may span
            # several lines (readline() would only see the first one).
            j = json.load(file)
        # The values are read from the 'distance' entry of the first element.
        pae_means.append(np.mean(j[0]['distance']))
        pae_stds.append(np.std(j[0]['distance']))

if pae_means:
    # Average over every matching file instead of keeping only the last one;
    # with a single file this is exactly that file's mean and stdev.
    outscore = np.mean(pae_means)
    outstd = np.mean(pae_stds)
    print("Mean PAE:", "{:.2f}".format(outscore), "±", "{:.2f}".format(outstd))
else:
    # Keep the names defined, but do not report a misleading "0.00 ± 0.00".
    outscore = 0
    outstd = 0
    print("No predicted_aligned_error_v1.json files found in", zipstr)

Mean PAE: 12.08 ± 6.92
