In [58]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
import glob
import re

## Inspect TEMPURA dataset

In [53]:
tempura = pd.read_csv("../../data/thermo/200617_TEMPURA.csv")

In [54]:
tempura.head()

Unnamed: 0,genus_and_species,taxonomy_id,strain,superkingdom,phylum,class,order,family,genus,assembly_or_accession,Genome_GC,Genome_size,16S_accssion,16S_GC,Tmin,Topt_ave,Topt_low,Topt_high,Tmax,Tmax_Tmin
0,Methanopyrus kandleri,2320,116,Archaea,Euryarchaeota,Methanopyri,Methanopyrales,Methanopyraceae,Methanopyrus,,,,AB301476,67.7,85.0,100.0,,,122.0,37.0
1,"""Geogemma barossii""",1927912,121,Archaea,Crenarchaeota,Thermoprotei,Desulfurococcales,Pyrodictiaceae,Geogemma,,,,,,85.0,106.0,,,121.0,36.0
2,Pyrolobus fumarii,54252,1A,Archaea,Crenarchaeota,Thermoprotei,Desulfurococcales,Pyrodictiaceae,Pyrolobus,GCA_000223395.1,54.9,1.8,X99555,68.4,90.0,106.0,,,113.0,23.0
3,Pyrococcus kukulkanii,1609559,NCB100,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Pyrococcus,GCA_001577775.1,44.6,2.0,CP010835,66.5,70.0,105.0,,,112.0,42.0
4,Methanopyrus kandleri,2320,AV19,Archaea,Euryarchaeota,Methanopyri,Methanopyrales,Methanopyraceae,Methanopyrus,GCA_000007185.1,61.2,1.7,NR_074539,68.1,84.0,98.0,,,110.0,38.0


In [55]:
tempura.groupby("superkingdom").count().iloc[:,0:2]

Unnamed: 0_level_0,genus_and_species,taxonomy_id
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1
Archaea,549,549
Bacteria,8090,8090


In [56]:
tempura.loc[:,["superkingdom","Genome_GC","16S_GC","Tmin","Tmax","Tmax_Tmin"]].groupby("superkingdom").agg(['mean', 'max','min'])

Unnamed: 0_level_0,Genome_GC,Genome_GC,Genome_GC,16S_GC,16S_GC,16S_GC,Tmin,Tmin,Tmin,Tmax,Tmax,Tmax,Tmax_Tmin,Tmax_Tmin,Tmax_Tmin
Unnamed: 0_level_1,mean,max,min,mean,max,min,mean,max,min,mean,max,min,mean,max,min
superkingdom,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Archaea,53.742007,71.2,25.7,59.073905,69.1,50.6,32.590528,90.0,-2.5,60.095993,122.0,17.0,27.490893,54.0,8.0
Bacteria,53.830217,77.8,23.9,55.123957,68.3,45.5,14.651916,70.0,-20.0,40.629419,100.0,10.0,25.978541,60.0,4.0


In [57]:
tempura.loc[tempura["taxonomy_id"]==760142,:]

Unnamed: 0,genus_and_species,taxonomy_id,strain,superkingdom,phylum,class,order,family,genus,assembly_or_accession,Genome_GC,Genome_size,16S_accssion,16S_GC,Tmin,Topt_ave,Topt_low,Topt_high,Tmax,Tmax_Tmin


## Inspect istable2.0 data

In [23]:
istable_train = pd.read_csv("../../data/thermo/istable_S3568_training.txt",sep = '\t')

In [24]:
istable_train.head()

Unnamed: 0,PDB,CHAIN,WDTYPE,PDBPOSITION,SEQPOSITION,MUTANT,TEMPERATURE,PH,DDG
0,1e+65,A,V,5,5,A,25.0,7.0,0.0
1,1e+65,A,I,7,7,A,25.0,7.0,-3.11
2,1e+65,A,I,7,7,S,26.5,7.3,-3.44
3,1e+65,A,I,20,20,A,25.0,7.0,-1.56
4,1e+65,A,I,20,20,T,25.0,7.0,-2.39


In [25]:
istable_test = pd.read_csv("../../data/thermo/istable_S630.txt",sep = '\t')
istable_test.head()

Unnamed: 0,PDB,chain,WT,Position,Seq_position,Mutation,Temperature,pH,actual_ddG,actual_lable
0,1AG2,A,M,129,6,V,25.0,7.0,-0.33,Decrease
1,1AG2,A,V,180,57,I,25.0,7.0,-0.5,Decrease
2,1AG2,A,T,183,60,A,25.0,7.0,-4.62,Decrease
3,1AG2,A,T,190,67,V,25.0,7.0,-0.17,Decrease
4,1AG2,A,F,198,75,S,25.0,7.0,-2.46,Decrease


## Inspect Thermo Prot dataset
- The temperature of thermophilic proteins in this dataset was set to above 60°C and the temperature of non-thermophilic proteins was set to be less than 30°C

In [33]:
records = SeqIO.parse("../../data/thermo/thermo_prot_nonthermophilic_2.fasta", "fasta")
count = SeqIO.write(records, "../../data/thermo/thermo_prot_nonthermophilic_2.tab", "tab")
print("Converted %i records" % count)
thermo_prot_nonthermophilic_2 = pd.read_csv("../../data/thermo/thermo_prot_nonthermophilic_2.tab",sep = '\t',header=None)
thermo_prot_nonthermophilic_2.columns = ["id", "seq"]
thermo_prot_nonthermophilic_2.head()

Converted 101 records


Unnamed: 0,id,seq
0,low1-A0A0L6TX14,KKIHLYTDGGCRGNGKETENLGAIGGVLIYPEKNIEKEYKRAYENT...
1,low2-A0A015MM15,MKDVQVYTDGACSGNPGPGGWGAVLFYGTSRKEISGGEPMTTNNRM...
2,low3-B6EJV2,MITEIMKQVEIFTDGSCLGNPGPGGYGIVMRYKGTEKTFSGGFNQT...
3,low4-A0A090IPY5,MAQKYYVVWEGREIGIFTTWAQCKAQVDKFAGARYKSFPSLVEAES...
4,low5-A0A090KKN9,MITEIMKQVEIFTDGSCLGNPGPGGYGIVMRYKGTEKTFAEGFNQT...


In [32]:
records = SeqIO.parse("../../data/thermo/thermo_prot_thermophilic_2.fasta", "fasta")
count = SeqIO.write(records, "../../data/thermo/thermo_prot_thermophilic_2.tab", "tab")
print("Converted %i records" % count)
thermo_prot_thermophilic_2 = pd.read_csv("../../data/thermo/thermo_prot_thermophilic_2.tab",sep = '\t',header=None)
thermo_prot_thermophilic_2.columns = ["id", "seq"]
thermo_prot_thermophilic_2.head()

Converted 106 records


Unnamed: 0,id,seq
0,Th1:PF00075-D5WTG9,MKEVTIYTDGACSGNPGPGGWAAVLLYGDHVREMAGGEEHTTNQRM...
1,Th2:PF00075-A0A0K8QLT7,MSDEPRVVHAFTDGACLGNPGPGGWAALLRWNGHERLLRGAEAATT...
2,Th3:PF00075-F2NQD4,MTRPYVEVYTDGSADRFGRGGWAALLRYEGREKLLSGGEAATTNNR...
3,Th4:PF00075-H2J660,MFIDFLYDFDLTKDQSFVAEELDKFLYNQKDFLIVQGSAGTGKTFL...
4,Th5:PF00075-R7RR27,MKEVTIYTDGGCRGNGKEAAVGGYGCVLIYKDKIKEIKKGFENTTN...


## Inspect ProThermDat dataset
- for pdt_y: 1 is thermophilic, 0 is otherwise

In [3]:
pdt_X = np.load("../../data/thermo/pdt_X_unique_huge.npy")
pdt_y = np.load("../../data/thermo/pdt_y_unique_huge.npy",allow_pickle=True)
pdt_header = np.load("../../data/thermo/pdt_header_unique_huge.npy",allow_pickle=True)

In [23]:
pdt_X[1]

array([11,  1, 15, 12,  5, 10, 17,  8,  1, 10, 13,  9,  6,  9, 10, 10, 17,
        3, 16, 10,  4,  1, 10, 17,  9,  8,  6,  8,  4,  2, 14,  4, 18, 16,
        4,  4, 16, 15,  9, 10, 18,  5, 13, 10,  4,  9,  1, 14,  1, 14,  8,
        8,  8,  2, 15, 13, 17,  3,  8, 13, 17,  5, 18,  4, 20,  6,  1,  1,
        3,  8,  6,  5, 18,  6,  9,  3, 17, 10, 10,  4,  4, 12,  9,  3, 18,
       18,  4, 10, 10,  3, 10,  6,  5,  6, 20,  2, 15,  5, 18, 18,  1, 11,
       13,  4,  4,  9, 18, 13, 13, 15, 10, 13,  3,  6, 15,  5,  3, 10, 16,
        6, 10, 12,  7, 14, 15, 18,  1, 17,  9,  5, 13, 15, 18,  1,  4,  3,
        5,  5, 15,  4, 14,  6, 11, 14, 18, 10, 13,  8,  9, 10,  7,  6, 12,
        8,  4, 10,  1, 13, 15, 18,  6, 10,  1,  4, 11,  8, 18,  3,  8, 18,
       16, 17,  6, 17, 17, 10, 15, 14, 12,  9, 10, 18,  4,  8,  1, 13,  8,
       10,  4,  1, 17, 17, 15, 10,  8,  1, 12, 15, 18,  1, 20, 15, 11,  9,
        7,  4, 15,  8, 12,  4, 10, 17,  4,  9, 10, 15, 14, 10, 10,  6,  9,
        1, 13,  0,  0,  0

In [4]:
print(pdt_X.shape)
print(pdt_y.shape)
print(pdt_header)

(8185520, 600)
(8185520,)
[['A0A0H2LRH6' 'PF00027']
 ['Q24QI9' 'PF01634']
 ['A0A0R1V9Z9' 'PF00925']
 ...
 ['S7UC58' 'PF02653']
 ['F2NQC7' 'PF03167']
 ['A0A0H5RJV2' 'PF01202']]


In [5]:
IUPAC_Extended_Dic_Transf = {"A":1,"C":2,"D":3,"E":4,"F":5,"G":6,"H":7,"I":8,"K":9,"L":10,"M":11,"N":12,"P":13,"Q":14,"R":15,"S":16,"T":17,"V":18,"W":19,"Y":20,"B":21,"X":21,"Z":21,"J":21,"U":21,"O":21}

In [6]:
tok_to_aa = [(b,a) for (a,b) in IUPAC_Extended_Dic_Transf.items()]
tok_to_aa.append((0,''))
tok_to_aa = dict(tok_to_aa)

In [7]:
# convert tokenized aa sequence back to real sequence
''.join([tok_to_aa[x] for x in pdt_X[0,:]])

'MPHGNLNQHNVLGGSIKDRVRPPSAAELDGIPWLPTLTPAERRRAEAALVVGEAEVGDLVCRVGRSPTYWFGVVEGLLKMSNDNADGGSVTYTGVPPGGWFGEGTVMKREPYRYNIQALRRSVVAGLPIESFHWLLDHSIGFNRFVMNQLNERLGQFIAALEIDRLNNPDARVARNLVSLFNPVLYPGVGEVLRITQQELAYLVGLSRQRVNEALNGLSAEGLIRVEYGGLRVLDLPGLRATAMSNKKNNRSPETENP'

In [8]:
# get the domains from the superfamily belonging to the motor proteins
p_loop_gtpase =  "PF06414;PF06564;PF07015;PF02367;PF02534;PF06309;PF05621;PF00265;PF06068;PF02223;PF00685;PF00448;PF02463;PF01202;PF00158;PF10443;PF03215;PF00485;PF00519;PF06431;PF01057;PF10609;PF00931;PF05729;PF00488;PF03205;PF09140;PF01078;PF00493;PF08433;PF07693;PF01695;PF01745;PF01715;PF00693;PF00625;PF00437;PF01580;PF00142;PF01935;PF05872;PF05673;PF01712;PF06144;PF02224;PF06418;PF07931;PF02492;PF01121;PF01656;PF00308;PF03668;PF00006;PF02374;PF03308;PF01637;PF01583;PF03969;PF00406;PF00709;PF00005;PF08298;PF07728;PF07726;PF07724;PF00004;PF05707;PF11496;PF10649;PF10412;PF10236;PF09820;PF09818;PF09547;PF09439;PF09037;PF08423;PF07755;PF07652;PF07088;PF06990;PF06733;PF05970;PF05894;PF05879;PF05876;PF05783;PF05127;PF04670;PF04466;PF04257;PF03976;PF03796;PF03567;PF03354;PF02689;PF02606;PF02572;PF02499;PF02399;PF01268;PF00350;PF00225;PF00063;PF10662;PF00271;PF00910;PF05496;PF02562;PF00025;PF05049;PF03266;PF01591;PF00071;PF04548;PF00009;PF00176;PF07517;PF03237;PF00735;PF04665;PF02263;PF01926;PF00580;PF00270;PF09848;PF06745;PF04851;PF01443;PF03193;PF00503;PF06858;PF02283;PF02456;PF00154;PF03029;PF08477;PF02421;PF12696;PF07999;PF04310;PF05272;PF06048;PF12774;PF12775;PF12780;PF04317;PF12846;PF11398;PF13086;PF13087;PF13166;PF13173;PF13175;PF13177;PF13189;PF10923;PF13191;PF08303;PF13207;PF13238;PF13245;PF13304;PF13307;PF13361;PF13401;PF13469;PF12128;PF13479;PF13476;PF13481;PF13500;PF13514;PF13521;PF13538;PF13555;PF13558;PF13604;PF13614;PF13654;PF13671;PF13872;PF14516;PF14532;PF14617;PF05625;PF05179;PF16203;PF14417;PF02500;PF13337;PF11602;PF16575;PF16796;PF13871;PF02702;PF10088;PF03846;PF17213;PF12848;PF12399;PF14396;PF10996;PF09711;PF11111;PF08351;PF03192;PF02841;PF05609;PF08438;PF12344;PF12781;PF03028;PF16813;PF16834;PF16836;PF07034;PF10483;PF09807;PF03618;PF17784;PF18128;PF18133;PF07529;PF18747;PF18748;PF18751;PF18766;PF18082;PF19044;PF19263;"
tubulin_binding = "PF10644;PF14881;PF13809;PF00091;"
tubulin_c = "PF03953;PF12327;"
actin_like = "PF06406;PF00480;PF02541;PF00814;PF06723;PF05378;PF01968;PF00012;PF03727;PF00349;PF02685;PF01150;PF03630;PF00370;PF02782;PF06277;PF02543;PF03309;PF01869;PF00022;PF00871;PF03702;PF08841;PF07318;PF05134;PF11104;PF13941;PF14450;PF09989;PF06050;PF17003;PF14574;PF17788;PF17989;"

In [9]:
p_loop_gtpase = p_loop_gtpase.split(";")[0:-1]
tubulin_binding = tubulin_binding.split(";")[0:-1]
tubulin_c = tubulin_c.split(";")[0:-1]
actin_like = actin_like.split(";")[0:-1]
motors_related = p_loop_gtpase + tubulin_binding + tubulin_c + actin_like

In [15]:
in_p_loop_gtpase = pd.Series(pdt_header[:,1]).isin(p_loop_gtpase)
in_tubulin_binding = pd.Series(pdt_header[:,1]).isin(tubulin_binding)
in_tubulin_c = pd.Series(pdt_header[:,1]).isin(tubulin_c)
in_actin_like = pd.Series(pdt_header[:,1]).isin(actin_like)

In [19]:
in_motors_related = pd.Series(pdt_header[:,1]).isin(motors_related)

In [22]:
pdt_X_motor = pdt_X[in_motors_related,:]
pdt_y_motor = pdt_y[in_motors_related]
pdt_header_motor = pdt_header[in_motors_related,:]

In [29]:
pdt_X_list = [pdt_X_motor[i] for i in range(pdt_X_motor.shape[0])]

In [30]:
# build up the dataframe 
pdt_motor = pd.DataFrame({"uniprot_id":pdt_header_motor[:,0], "pfam_id":pdt_header_motor[:,1],  'is_thermophilic': pdt_y_motor, "token":pdt_X_list})

In [31]:
pdt_motor.head()

Unnamed: 0,uniprot_id,pfam_id,is_thermophilic,token
0,K1INX9,PF00709,0,"[11, 16, 16, 8, 18, 18, 18, 6, 1, 12, 19, 6, 3..."
1,A9AVV2,PF00006,0,"[11, 1, 18, 17, 1, 4, 3, 8, 10, 16, 15, 10, 9,..."
2,L9Y3C9,PF13361,0,"[11, 8, 17, 17, 15, 2, 3, 3, 18, 18, 4, 3, 17,..."
3,A0A062ACI1,PF03618,0,"[11, 16, 4, 16, 9, 14, 5, 9, 15, 16, 18, 5, 5,..."
4,A0A102DZZ7,PF00480,0,"[11, 14, 12, 8, 18, 6, 8, 3, 8, 6, 6, 16, 7, 8..."


In [32]:
pdt_seq_motor = []
for i in range(pdt_motor.shape[0]):
    token = pdt_motor.iloc[i,3]
    if i%10000 == 0:
        print(i)
#         print(token)
    curr_seq = ''.join([tok_to_aa[x] for x in token])
    pdt_seq_motor.append(curr_seq)

0
[11 16 16  8 18 18 18  6  1 12 19  6  3  4  6  9  6 15  8 18  3 20 10  1
  6 14  1  6  1 16  8 15  5 14  6  6 12 12  1  6  7 17 18 18 12  3 10  6
 17  5  9 10  7 14 18 13 16  6 18  5 12 13  3  2 10 18 18 10  6 13  6 11
 18  8 16 13  4  9 10 17 18  4 10  4  4 18  9  1 16  6 18 17 13  9 10  1
  8 16  3 15  1 17 10  2 10 13 10  7  1 10  4  3 17 10  4  4 14 15 10  6
  3  6  1 20  6 16 17 15 14  6  8  1 13  1 20  6  3 15 18 11  9  9  1  8
 10 18  6 19 10  9 14 13  3 18 10 18  4 15  8 14  5 11 10  3 19  9 10 13
 14 11  9  1  8 20 13 16  5  4  5 17 14 17  1 14  4 11  1  3 19 10 10  4
 18 16  1 13 19  8  3  1 18  2 12 18 16 11 13 10  9  1 10 14  1  4  6  9
 17 10 10  5  4  1 14 10  6  1  6 15  3 10  8 20  6  4 20 13 19 18 17 16
 16  7 18 16  6  1 20  1  6  8  6  6  6 10 13  6 10 15 13  4 15 18  8  1
 18  1  9  1  5 16 16 16 18  6 17  6 17 10 10 17  1 11  4 12 14  3  4  5
 15  9  8 17 12  4  5  6  1 17 17  6 15 13 15  3 18  6 20  5  3  1 18  1
 17  9 12  6 18  4 10 14  1  1 17  4 18  1 10 17 

50000
[11 14  9 12  3  9  8  9 10 12 16 20 10  1 13 10 16  5 10 20  6  8  6  8
 19 10 15 12 15 10  5  3 19  6  8 10  7 16  4 14 20 16  8 13 18  8  2  8
  6 12 10 16 18  6  6 17  6  9 17 13  7 17  4 20  8  8 15 10 10  9  3  9
 20 15  8  1 18 10 16 15  6 20  9 15  9 17 16  6  5  8 10  1  3 16  4 16
 16 16 10  4  8  6  3  4 13  5 14 11  9 12  9  5 13  3  8 10 18  1 18  3
  1 12 15 15 15  6  8 14 12 10 10 16 10 13  4  9  3 15 13  4 18  8 10 10
  3  3  1 20 14  7 15 20 18  7 13 16 10 16  8 18 10 16  3 20  7 15 10  5
 20 12  3  9 10 11 13 17  6  7 10 15  4 13  8 16 12  8 12 15 17  3  8 18
 18 18 17  9  2  3  4  3 11  9 13  8  3  5 15 18  8  4  4 12 11  9 10 15
  1  7 14 10 10  5  5 17 16  8 18 20  6  4 18  9 13 18  5 13 16  4  1 15
  5 10 12  7  9 12  8  6  9  4  3  3  8 10 10  8 16  6  8  1 18 13 17 13
  5  8 15  4  1  4  9 20 16 12  9 18 10 13 18 18  5 13  3  7  7 17  5 16
  9 16  3  5  9  9 10  3 18  8  5  4  9 11 17 16 13  6  9 10  8 10 18 17
  4  9  3  1  1 15 10  9 12 16 13 10 18 13  4

100000
[11  8  8 15  3 18  4  5  8  8 16  1 18  9 13  3 14 20 13 18 17 17 11 13
  4 18  1 11  1  6 15 16 12 18  6  9 16 16 10  8 12  9 10 10 12 15 15 16
 18  1 15 18 16  1 17 13  6  9 17 14 14  8 12 20  5 10  8 12 12  3  5  7
 10 18  3 10 13  6 20  6  5  1  9 18 13 10  3 18  9 15 14 19  6  9 11 18
  4 17 20 10 16 15 15  4 12 10 15 10 18 10 10 10  8  3  8 15  7  1 13 16
  9  4  3 18  4 11 20 14 20 10  1  7 20 12 15 13  7  1  8 10 18 17  9  1
  3  9  8 16 15  6 12 20 13  9  7 10  9  8  8  9 14 17 10 12 18 10 13 12
 17 13  8  8 18 17 16 16  4 17  6 10  6 15  3  4 18 19  3 10 18  4  9  7
 10 18 13 10 13  4 13  4 10  1  1  4  1  3  8 13 16  4  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

150000
[11 16  2  8 10  7  8  4 17 16 17  3 18  2 16 18  1 18 16 12  3  1  4  2
  8  5 12  9  4  3 20 16  6 13  7  7 12  4 17 10  6  1  5 18  3  4 16 10
 16  5  8  3 16  7  6 10  9 10  3  1 18  1 18 16  2  6 13  6 16 20 17  6
 10 15  8  6 17 16  8  1  9  6  8  2 20  6 15  3 18  9 10  8  1  8 13 17
 10  4 10 10  1 18 13 18 10 10 12  4  7 13 14  4  4  3  1 10 10 18 13 11
  8  3  1 15 15 11  4 18 20  1 14 18  5  3 15  1 10  7  4 18 15 13  8 14
  1  3  8 18 17  1  4 17 20  9  4 20 10  3  9  6 18 18 20  5  5  6 12  6
  1  6  9  2 11 12  4  8 12  7 13 12  1 15 10 18  4  6  8 14 13 10  1  9
 12 11  5 13 10  1  3  9 15  5  1  4  6  9  5  4  3 18  1 20  5 18 13 10
 20 10  9  3  5 18  1  9 11 13  9  9 14 10  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

200000
[11  3  9 12  9  8  4  8  9  3 10 20 10  8  5  6 16  4 15 12  9  1  5  9
 11 10  9  9  6  9 16  9 16  4  8 10  9  4 17  6  2 17 18  1 18 12 12  1
 16 10 16 18 15  4  6  4  8  5 18  8 11  6 10 16  6 16  6  9 16 16 10 10
 15  2  8 12 15 10 12 15 13 17 16  6  4  8 10  8 12  6  4  3  8  1  6 18
 13  3  4 15 10 15 16 10 15 15  9  4 10  1 11 18  5 14  7  5  6 10 11 13
  7 20 17 18 10 14 12  8  1  5  6 10  4 10 14  6  9  4  9 15  4 15  4 14
  9  1  1  4 16 11 15 10 18  6 10  3  6 20  1  3 10 11 18 12  4 10 16  6
  6 11 14 14 15 18  6 10  1 15  1 10  1 12  3 13  4 18 10 10 11  3  4  1
  5 16  1 10  3 13 10  8 15 18 14 11 14  3  4 10 10  1 10 14 16  9 18  9
  9 17  8 18  5  8 17  7  3 10  4  4  1 18  9 10  6  3 15  8  1  8 11 15
  3  6  4  8 14 14  8  6 17 16  4  4  8 10 17  3 13  1 12  9 20 18 15  2
  5 18  4 12 18  3 15 16 15  8  8 17  1  1 16  8 11 18  3  9 13 11 18  1
 15 14  6  9  4  6 13  4 18 10  8 15  9 11 15  4 15  9  8 17 18 10 13 18
 18  3 13  3  6  8 10 18  6  4 18  7 10  9  

250000
[11  1 12 10  9 18  4 12 10  1  9 16 20  3  6  9 17 12  8 10  4 12 18 16
  5  3 18  1  3  6  4  5 18 16  8 10  6 13 16  6  2  6  9 17 17 17 10 15
  8  8  1  6 10 10  4 14 17  6  6  9  8 10 18  3  6  4  3  8 12  9 18 13
 18 20  9 15 16  5  6 11 18  5 14 16 20  1 10  5 13  7 11 17 18  5  3 12
 18  1  5  6 10  9 11 15  9 17 17 16 16  4  1  9  9 15 18 17  4  8 10  9
 17 17  6 10 12  3 10  1  3 15 20 13  1  4 10 16  6  6 14 14 14 15 18 16
 10  1 15  6 10 18  8 14 13 15  8 10 10 11  3  4 13 10 16 12 10  3  1  9
 10 15  8 14 11 15  4  4  8 15 12 10 14 14  9  5  9  8 17 18 10  5 18 17
  7  3 14  4  4  2  5  1  8 16  3  9 18 10 18 11  9 12  6 15 18  4 14  5
  3 17 13 14  9  8 20  7 14 13  1 17  9 20 18  1 14  5  8  6 20  4 12  5
  8  4 18  3  4  9 10  1 12 12 15  5  9 17  9 12  5  4  5 17 17 16  4  9
 16 17  4 17 12  5 10 17  8 15 13  4 12  8 16 10  8  1  3  3 14 17  6 12
 14 12  4 10 13  6  9  8  8 16  1  4 20 10  6 15  3 20 15 20 14  8  9 17
  3 10  6 18  5 15 18  3  4 18 16  9  5 14 1

300000
[11 16 11  8  4  5  7 12 18  4  9 20 20  6  3 20  7  1 10  7 12  8 12 10
  9  8  3  6  6  4 17 18 18 10  8  6 13 16  6 16  6  9 16 17 10  8 15 17
  8 12  6 10  4 17  8 14  9  6 14 10  8 18 12  6 14  3 10  1  4  9  9 17
  3 11 12 15  8 15  9 12 18  6 11 18  5 14  7  5 12 10 20  1 12  9 16  8
 10  4 12  8 11 10  1 13 15 11 18 10  9 15 13  4  4  4 12 15  9  8  1 11
  3 10 10  3 15 18  3 10 18 12 14  1  4  9  5 13 16 14 10 16  6  6 14 14
 14 15  8  1  8  1 15 16 10  1 11  9 13  9  1 10 10  5  3  4 13 17 16  1
 10  3 13  4 11  8  3  3 18 10 12 18 11  9  3  8  1  9  3 16  3 11 17 11
 10 18 18 17  7  4 11  6  5  1 15  4 18  1 12 15 18  8  5 11  1  3  6 14
  8 10  4  3  3 17  9  4  9  5  5  3  6 14 13 17 12  4 15  1 15 14  5 10
  6  9  8  8 17  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

350000
[11 14 17  8 14 17 17  9  8 20  1  9 18  3  4  1 18 16 14  6 20 17 17  8
 16  1 14  6 16 16 15 16 16  9 17 20 12  8 10  8 19 10  8 18 20 10 10 14
  7 13 12 17 15 10 16  8 18 15  1 17 10 13  1 10  9  6 16 18  5  8  3  5
  9  4  8  2 20  9 11  6  8  5  3  4  9  1 10 12  9 16  4 10  8 20 17  5
 13 12  6 16 19 18  4  5  5 16 17  3 16  4 14  9  8 15  6 15  9 15  3  8
 10 20  2 12  4  1 12  4 10  9  5  8  4 19 14 14 10  9 11 15 17 17 15  5
  1  8 18  3 20 12 13 16  5 16  3  3  7 19 10  2  1 10 12 15  3 13 15 17
 20  7  5 18 16 17 20  9  3 12 13  5 10  4 14 12  8  8  3  4  8  4 16 10
 15  4  9 12  1 17 10 19 14 18 20  6 10  6 14 14  1 11  8  4  6 10  8  5
 17 12 18  4 18  8  3 17  8 13  7 19 18  9  9 15  6  8  6 11  3  5  6 20
 17 12  3 13 17  1  8 11  4  2  1 18 18  3 12  3 10 20  8 12  4  8  1 20
 14 17 14 11 10 16 16  3  8  8  9  8 10  9 13 10 12 10  9 18  8 16  4 16
  1  3 13 15 10  8 14  4  8 16 12  1  6 18 10  8 20 13 18 14  9 20 14  6
 16  8 11  6  6  8 17  9 11 10  3 11 15 10  

400000
[11 11 17 18  8 18 10 16  6 13  8  6  1  6  9 16 16 10 17 16  8 10  1  4
  7 10  6 16 12  1  5 20  4  3 18 16  9 12 13 18 10 13 10 20 20  9  3 11
  9 15 20 17  5 10 10 12 17 20 10 10 12  9 15 10  1 14  8 12  4  1 10 16
  4 12 12 13 12 18 18 10  3 15 16  8 20  4  3  1 10  5  5  9 11 12  1  3
 16  6  8  1  3 13 17  4  2  3  8 20 18  3 10 18  7 12 11 11  4  3 18 13
  6 12 13  7  9  9 13  3 10 10  8 20  8 12 18 16 10  3 12 11 10 16 15  8
  9 15 15  6 15  4  5  4 14 10  4 17  3 13 16 10  9  4 20 20  1 15 10 15
 12 20 20  6 13 19 20  4 12 20 16  4 16 13  9 11 18  8  3  6  3  9 20 12
  5 18  3 12  4  3  3 15  9  1 18 10  4 10  8  3 12  9 10  9  4 10  6 12
 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

450000
[11 14  5 20  8 14 15  9  4 14  5  1  1 14 10 16 14 10  9 14 16 20 12 10
  8 16 10 10 15  1  1  1  1 18  5 18  8  6 16 18 20  5  1  8 16 16 18 17
  1 15  3 17  1 18 10  2  8 10 10  6  2  6  1  5  8 18 10 10 15 20  7 12
 17 10 16 10  9  9  9  9  1  4  1 10 10  9 18 12  4  3  4 18 12  5 10  4
 20 12  9 10 13  5  4 16  6 16  4  5  1  3 17 16  7 13 20  1  7  3 10  3
  8  5  6 12  7 16 10 20 14  7 10 12 15  1  1 17 20  9  6 15 15  9 10  1
  3 11 10 10 12 10 10 13 12 12  4  8 10  9 12 14  1  1  8  9  4 10  6 12
 10  8  4  5 15 14  4  8 10  1 14  6 15  8 12 12  3 16  9 13 18 20 14  9
  8 17  1 19 18  1  9 13  1 13 18  8 17 17  8  1 15  8 10  6 17  1  1 13
  8 10 10 18  6 10 10 10 20 20 19  8  1  6  3  3 16  8 18 13  5  8 12 20
 10 10  1  5 12 18  1  5  1 10  9 20 18  9 10  8  1 12  4 18  9  3 17 17
  4 18  1  4  8  8  9  6 20  1  1  8 10  4  8  8  4  9 14 14  5  4 16  4
  9 10 12  3 10  9  1 15  8  9  3  3 12 14 14  1 16  9 14 10  1 15 10  6
 17 10  5 16 14 10  3 16  1 16 12  8 10  1  

500000
[11 17 17 10  4  8  9  3 10  7 18  4 18  1 12 13  3  4 16  6  4 13  8  9
  8 10  9  6 18  3 10 17  8  9 16  6  4 17  7  1  8 11  6 13 12  6 16  6
  9 16 17 10 16 20  1  8  1  6  7 13  9 20 17 18 17 16  6 16  8 17 10  3
  6  4  3 18 10  1 11 16 18  3  4 15  1 15  1  6 10  5 10  1 11 14 20 13
 18  4 18 13  6 18 16 17 16 12  5 10 15 17  1  1 17  1 18 15  6  4  1 13
  9 10 15  7 19 18  9  4 18  9  4 16 11  1  4 10  4  8  3 13  1  5  1  4
 15 12 18 12  4  6  5 16  6  6  4  9  9 15  7  4  8 10 14 10 16 10 10  9
 13  9  8  1  8 10  3  4 17  3 16  6 10  3 18  3  1 10 15  8 18 16  4  6
 18 12 15 20  9  4 15  4 12  6  6 18 10 10  8 17  7 20 17 15  8 10 15 20
  8 14 13  4  5 18  7 18  5 18  6  6 15  8 18  1  4  6  6 16  4 10  1  3
  4 10  3  1 12  6 20 18 15  5 17 14 16  1 17  1  6  1  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

In [33]:
pdt_seq_motor[0]

'MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVNDLGTFKLHQVPSGVFNPDCLVVLGPGMVISPEKLTVELEEVKASGVTPKLAISDRATLCLPLHALEDTLEEQRLGDGAYGSTRQGIAPAYGDRVMKKAILVGWLKQPDVLVERIQFMLDWKLPQMKAIYPSFEFTQTAQEMADWLLEVSAPWIDAVCNVSMPLKALQAEGKTLLFEAQLGAGRDLIYGEYPWVTSSHVSGAYAGIGGGLPGLRPERVIAVAKAFSSSVGTGTLLTAMENQDEFRKITNEFGATTGRPRDVGYFDAVATKNGVELQAATEVALTKLDCLTGLPDLKICVAYEGAHTENPIWPQTAALKPVYEQMESWSEDITGCRTFEELPKAAQQYVLRIEELLGVPVPMVSVGPGRDEMILR'

In [34]:
pdt_motor["seq"] = pdt_seq_motor

In [35]:
pdt_motor.head()

Unnamed: 0,uniprot_id,pfam_id,is_thermophilic,token,seq
0,K1INX9,PF00709,0,"[11, 16, 16, 8, 18, 18, 18, 6, 1, 12, 19, 6, 3...",MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVND...
1,A9AVV2,PF00006,0,"[11, 1, 18, 17, 1, 4, 3, 8, 10, 16, 15, 10, 9,...",MAVTAEDILSRLKASIQQPVGGDPTAVNVGTVASVGDGVARISGLR...
2,L9Y3C9,PF13361,0,"[11, 8, 17, 17, 15, 2, 3, 3, 18, 18, 4, 3, 17,...",MITTRCDDVVEDTYHVRVLTIHASKGAEATDDCCFDGITGEIAREM...
3,A0A062ACI1,PF03618,0,"[11, 16, 4, 16, 9, 14, 5, 9, 15, 16, 18, 5, 5,...",MSESKQFKRSVFFISDGTAITAETLGHSLLAQFPNVDFDIHIMPYI...
4,A0A102DZZ7,PF00480,0,"[11, 14, 12, 8, 18, 6, 8, 3, 8, 6, 6, 16, 7, 8...",MQNIVGIDIGGSHITLAQVDPDKHEIITSTYVRERVDSFADPETIF...


In [40]:
pdt_motor["clan"] = "na"

In [45]:
in_p_loop_gtpase = pd.Series(pdt_header_motor[:,1]).isin(p_loop_gtpase)
in_tubulin_binding = pd.Series(pdt_header_motor[:,1]).isin(tubulin_binding)
in_tubulin_c = pd.Series(pdt_header_motor[:,1]).isin(tubulin_c)
in_actin_like = pd.Series(pdt_header_motor[:,1]).isin(actin_like)
print(sum(in_p_loop_gtpase))
print(sum(in_tubulin_binding))
print(sum(in_tubulin_c))
print(sum(in_actin_like))
pdt_motor.loc[in_p_loop_gtpase,"clan"] = "p_loop_gtpase"
pdt_motor.loc[in_tubulin_binding,"clan"] = "tubulin_binding"
pdt_motor.loc[in_tubulin_c,"clan"] = "tubulin_c"
pdt_motor.loc[in_actin_like,"clan"] = "actin_like"

443702
3963
70
53726


In [51]:
pdt_motor.groupby(["clan","is_thermophilic"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,uniprot_id,pfam_id,token,seq
clan,is_thermophilic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
actin_like,0,50106,50106,50106,50106
actin_like,1,3620,3620,3620,3620
p_loop_gtpase,0,419235,419235,419235,419235
p_loop_gtpase,1,24467,24467,24467,24467
tubulin_binding,0,3685,3685,3685,3685
tubulin_binding,1,278,278,278,278
tubulin_c,0,68,68,68,68
tubulin_c,1,2,2,2,2


In [52]:
# perform a sanity check on 10 thermophilic pfam proteins (by checking their taxonomy)
pdt_motor.loc[pdt_motor["is_thermophilic"]==1,:].iloc[0:10,:]

Unnamed: 0,uniprot_id,pfam_id,is_thermophilic,token,seq,clan
13,V9S5T6,PF00005,1,"[11, 16, 13, 10, 10, 14, 18, 9, 6, 10, 9, 18, ...",MSPLLQVKGLKVYYYTKRGIIKAVDDVSIDVEKGEVVGLAGESGSG...,p_loop_gtpase
28,F2LWD8,PF00308,1,"[11, 19, 4, 9, 18, 8, 4, 12, 10, 9, 12, 9, 18,...",MWEKVIENLKNKVETQDLEKLSLVEPIFVSEDKITLAIPGEQIKEL...,p_loop_gtpase
51,I0I5P7,PF01580,1,"[11, 6, 20, 15, 15, 15, 17, 10, 4, 11, 14, 1, ...",MGYRRRTLEMQADRIEAVLERHRVQARVTGGAITPRFVRFQLVAEG...,p_loop_gtpase
81,A0A0A8WZL0,PF02283,1,"[11, 1, 4, 14, 16, 16, 10, 8, 5, 8, 16, 6, 6, ...",MAEQSSLIFISGGARSGKSRFAEILAGKWEAEVSGQLHYVAAGQPS...,p_loop_gtpase
83,R7RT93,PF00005,1,"[11, 6, 4, 10, 10, 16, 8, 9, 12, 10, 9, 17, 20...",MGELLSIKNLKTYFYTEDGVIKAVDDVSLTIREGETLGVVGESGCG...,p_loop_gtpase
124,A5D0H6,PF01656,1,"[11, 15, 10, 19, 14, 13, 15, 17, 15, 16, 8, 20...",MRLWQPRTRSIYRGAAVKKERPAAGPRAIAVTSGKGGVGKTSLVVN...,p_loop_gtpase
125,C9R9G6,PF00485,1,"[11, 15, 15, 4, 13, 18, 10, 18, 6, 8, 1, 6, 3,...",MRREPVLVGIAGDSGAGKSTFVRKLQELLGHERVSVIELDGYHSLN...,p_loop_gtpase
132,I3EAK2,PF00005,1,"[11, 3, 17, 7, 3, 13, 8, 18, 15, 8, 9, 12, 10,...",MDTHDPIVRIKNLSYRYEKENVLENINLSIPKGSFLGIVGPNGSGK...,p_loop_gtpase
147,A0A0B0SDC6,PF13614,1,"[11, 9, 9, 17, 18, 10, 8, 13, 10, 9, 3, 19, 1,...",MKKTVLIPLKDWALEQGLSYSGALKQIQEGRLNARRLGRYWYVVKE...,p_loop_gtpase
159,Q8RCQ9,PF00005,1,"[11, 18, 6, 4, 16, 6, 16, 6, 9, 16, 17, 8, 6, ...",MVGESGSGKSTIGKMMLKLLKPSGGQILLNDKDIYKIKNNKEFYSK...,p_loop_gtpase


- F2LWD8_HIPMA: Hippea maritima is a bacterium from the genus of Hippea which has been isolated from sediments from a hydrothermal vent from Matupi Harbour in Papua New Guinea
- A0A0A8WZL0: B.selenatarsenatis is a mesophile with its optimal growth temperature between 25~40 degrees Celsius, and a pH between 7.5~9.0.
- I3EAK2: Bacillus methanolicus MGA3, was isolated from freshwater marsh soils, and grows rapidly in cultures heated to up to 60 °C using only methanol as a carbon source. 

In [62]:
# export pdt_motor to a csv file

In [61]:
pdt_motor.to_csv("../../data/thermo/pdt_motor.csv",index = False)