In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
import pandas as pd
import numpy as np

In [2]:
# Load the training table and derive two columns:
#   length - number of characters in each protein sequence
#   targ   - the `tm` column min-max rescaled to the [0, 1] interval
data = pd.read_csv('./data/train.csv')
data['length'] = data['protein_sequence'].map(len)

# minv / maxv are kept as notebook-level names: a later cell uses them to
# convert an error in `targ` units back to `tm` units.
minv = data['tm'].min()
maxv = data['tm'].max()
data['targ'] = (data['tm'] - minv) / (maxv - minv)
data.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,length,targ
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7,341,0.585496
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5,286,0.39313
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5,497,0.316794
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2,265,0.367939
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5,1451,0.385496


In [3]:
# Build one row per sequence holding how many times each character occurs
# in `protein_sequence`; rows are keyed (and indexed) by seq_id.
features = {}
for sid, seq in zip(data['seq_id'], data['protein_sequence']):
    counts = {'seq_id': sid}
    for residue in seq:
        counts[residue] = counts.get(residue, 0) + 1
    features[sid] = counts

# Characters absent from a sequence come out of from_dict as NaN; a missing
# character means a count of zero.
df = pd.DataFrame.from_dict(features, orient='index').fillna(0)
df.head()

Unnamed: 0,seq_id,A,K,L,G,E,P,V,D,I,...,R,Q,F,M,T,H,S,N,Y,C
0,0,45.0,16.0,37.0,38.0,30.0,18.0,37.0,13.0,14.0,...,25.0,6.0,13.0,8.0,14.0,3.0,11.0,5.0,3.0,1.0
1,1,28.0,19.0,23.0,18.0,52.0,8.0,13.0,10.0,13.0,...,30.0,22.0,6.0,2.0,12.0,4.0,14.0,6.0,3.0,0.0
2,2,50.0,39.0,18.0,65.0,32.0,20.0,30.0,27.0,16.0,...,31.0,25.0,21.0,6.0,30.0,11.0,33.0,15.0,16.0,9.0
3,3,20.0,17.0,28.0,16.0,29.0,16.0,14.0,19.0,10.0,...,10.0,9.0,12.0,2.0,19.0,7.0,16.0,9.0,4.0,5.0
4,4,86.0,68.0,104.0,84.0,78.0,128.0,124.0,78.0,71.0,...,63.0,54.0,32.0,31.0,120.0,40.0,148.0,65.0,47.0,14.0


In [4]:
# Attach the per-character count columns from `df` to `data`, joined on seq_id.
#
# The cell reassigns `data` from itself, so a plain merge is not idempotent:
# running it a second time would merge the count columns again and pandas would
# rename them with _x / _y suffixes, breaking the later feature selection.
# Keeping only the columns that do not come from `df` (plus the join key)
# before merging makes a re-run produce the same frame as the first run.
base_cols = [col for col in data.columns if col == 'seq_id' or col not in df.columns]
data = data[base_cols].merge(df, on='seq_id')
data.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,length,targ,A,K,L,...,R,Q,F,M,T,H,S,N,Y,C
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7,341,0.585496,45.0,16.0,37.0,...,25.0,6.0,13.0,8.0,14.0,3.0,11.0,5.0,3.0,1.0
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5,286,0.39313,28.0,19.0,23.0,...,30.0,22.0,6.0,2.0,12.0,4.0,14.0,6.0,3.0,0.0
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5,497,0.316794,50.0,39.0,18.0,...,31.0,25.0,21.0,6.0,30.0,11.0,33.0,15.0,16.0,9.0
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2,265,0.367939,20.0,17.0,28.0,...,10.0,9.0,12.0,2.0,19.0,7.0,16.0,9.0,4.0,5.0
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5,1451,0.385496,86.0,68.0,104.0,...,63.0,54.0,32.0,31.0,120.0,40.0,148.0,65.0,47.0,14.0


In [5]:

# Impute missing pH values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the data['pH'] selection: that chained in-place form
# operates on an intermediate object, emits a FutureWarning on recent pandas
# and leaves `data` unchanged when Copy-on-Write is enabled.
m_ph = data['pH'].mean()
data['pH'] = data['pH'].fillna(m_ph)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31390 entries, 0 to 31389
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            31390 non-null  int64  
 1   protein_sequence  31390 non-null  object 
 2   pH                31390 non-null  float64
 3   data_source       28043 non-null  object 
 4   tm                31390 non-null  float64
 5   length            31390 non-null  int64  
 6   targ              31390 non-null  float64
 7   A                 31390 non-null  float64
 8   K                 31390 non-null  float64
 9   L                 31390 non-null  float64
 10  G                 31390 non-null  float64
 11  E                 31390 non-null  float64
 12  P                 31390 non-null  float64
 13  V                 31390 non-null  float64
 14  D                 31390 non-null  float64
 15  I                 31390 non-null  float64
 16  W                 31390 non-null  float6

In [6]:
# Columns that must not be fed to the model: the target (`targ`), the raw
# value it is derived from (`tm`), the identifier and the two text columns.
non_feature_cols = ['targ', 'seq_id', 'protein_sequence', 'data_source', 'tm']

# Everything else (pH, length and the per-character counts) is a feature;
# column order follows the order in `data`.
x_cols = [name for name in data.columns if name not in non_feature_cols]

X = data[x_cols].values
Y = data['targ'].values

In [7]:
# Standardise the feature matrix column by column (axis=0) with
# sklearn.preprocessing.scale.
# NOTE(review): the scaling statistics are computed over every row, before the
# train/test split performed in the modelling cell, so held-out rows influence
# the scaling of the training data - confirm this is acceptable.
X = scale(X,axis=0)

In [9]:
from sklearn.ensemble import RandomForestRegressor as RFR

# Repeated random hold-out evaluation of a random forest on (X, Y).
# Each repeat draws a fresh 80/20 split, refits the forest and records the
# test-set mean squared error; the mean over all repeats is printed at the end.
# Y is the normalised `targ` column, so the errors are in normalised units.

N_REPEATS = 30   # number of independent train/test splits
SEED = 0         # base seed, so the whole experiment is reproducible

# n_jobs=-1 trains the trees on all available cores; with a fixed random_state
# the fitted forest does not depend on the number of jobs.
model = RFR(n_estimators=100, max_features=0.9, n_jobs=-1, random_state=SEED)

metrics = []
for i in range(N_REPEATS):
    # A different but deterministic split for every repeat.
    trainx, testx, trainy, testy = train_test_split(
        X, Y, test_size=.2, random_state=SEED + i
    )
    model.fit(trainx, trainy)
    preds = model.predict(testx)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    mse = mean_squared_error(testy, preds)
    metrics.append(mse)
    print(mse)

print(np.array(metrics).mean())

0.004805416334668015
0.004699995414789303
0.004798578562906236
0.00461644307108966


KeyboardInterrupt: 

In [11]:
# Convert an error expressed in normalised `targ` units back to `tm` units by
# multiplying by the min-max range used to build `targ`.
# NOTE(review): 0.047 is hard-coded and no visible cell produces it; the MSE
# values printed by the modelling cell are around 0.0047. Multiplying by
# (maxv - minv) is only valid for a linear error such as MAE or RMSE - an MSE
# would need a square root first (or the squared range). Confirm where 0.047
# comes from and compute it from a variable rather than a literal.
0.047 * (maxv - minv)

6.157