In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio import SeqIO, Phylo
%matplotlib inline

In [4]:
# read numerical responses
binary = pd.read_csv('../data/pseudo/responses-pseudo.csv')
binary.rename(columns = {'OriginalID':'id', 'LabID': 'strain'}, inplace = True)
num = pd.read_csv('../data/pseudo/Perron_phenotype-GSU-training.csv')[['strain','carb.lag.delta','toby.lag.delta']]
resp = pd.merge(binary, num, on = 'strain')
resp

Unnamed: 0,id,strain,carb,toby,carb.lag.delta,toby.lag.delta
0,TA151,210.0,True,False,-2,16
1,IC1,55.0,False,False,2,14
2,A237,14.0,True,False,-1,4
3,LiA96,175.0,False,False,0,18
4,LiA91,174.0,False,False,1,19
...,...,...,...,...,...,...
114,JD318,360.0,False,False,27,0
115,Jp238,126.0,False,False,1,21
116,Jp1303,134.0,False,False,16,9
117,JD304,351.0,False,False,6,1


In [5]:
# read gene sequences
src = SeqIO.parse('../data/pseudo/concatenated.fasta', 'fasta')
seq = [(record.id, record.seq._data) for record in src]
seq_df = pd.DataFrame(data = seq, columns = ['id', 'sequence'])
data = pd.merge(seq_df, resp, on = 'id').drop(columns=['carb', 'toby'])
data

Unnamed: 0,id,sequence,strain,carb.lag.delta,toby.lag.delta
0,TA151,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,210.0,-2,16
1,IC1,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,55.0,2,14
2,A237,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,14.0,-1,4
3,LiA96,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,175.0,0,18
4,LiA91,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,174.0,1,19
...,...,...,...,...,...
114,JD318,----------------------------------------------...,360.0,27,0
115,Jp238,----------------------------------------------...,126.0,1,21
116,Jp1303,----------------------------------------------...,134.0,16,9
117,JD304,----------------------------------------------...,351.0,6,1


# Encoding

In [6]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [7]:
# integer encoding
def base2integer(str):
    label_encoder = LabelEncoder()
    label_encoder.fit(['A','T','C','G','-'])
    for i in str.split():
        return label_encoder.transform(list(i))

def integer(series):
    label_encoded = series.apply(base2integer)
    return pd.DataFrame(label_encoded.to_dict()).transpose()

data.dropna(inplace=True)
label_encoded = integer(data['sequence'])

In [8]:
# one-hot encoding
def onehot(series):
    label_encoded = integer(series).to_numpy()   
    onehot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    onehot_encoded = onehot_encoder.fit_transform(label_encoded)
    return onehot_encoded

seq_encoded = onehot(data['sequence'])
seq_encoded

<119x814557 sparse matrix of type '<class 'numpy.float64'>'
	with 57516627 stored elements in Compressed Sparse Row format>

# Imputation

In [19]:
seq_imputed = data['sequence'].replace('-', np.NaN)
naive = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
%time naive.fit(seq_encoded)

CPU times: user 39.5 s, sys: 3.35 s, total: 42.8 s
Wall time: 45.3 s


# Training

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

## 1) Integer Encoded

In [10]:
X = label_encoded
y1, y2 = data['carb.lag.delta'], data['toby.lag.delta']
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.3)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.3)

### Linear Regression

In [11]:
lr = LinearRegression()
lr.fit(X1_train, y1_train)
y1_pred = lr.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.7327962038931339
MAE: 9.960322443254148
MSE: 160.1379121423492
RMSE: 12.654560922542876


In [12]:
lr.fit(X2_train, y2_train)
y2_pred = lr.predict(X2_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: -0.10449908479344239
MAE: 8.192469108206023
MSE: 107.80030380756737
RMSE: 10.38269251242506


### Support Vector Machine

In [13]:
svm = svm.SVR(kernel = "linear", C=0.01)
svm.fit(X1_train, y1_train)
y1_pred = svm.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.7287787293003767
MAE: 9.94802681973126
MSE: 159.76663363197179
RMSE: 12.639882658947897


In [14]:
svm.fit(X2_train, y2_train)
y2_pred = svm.predict(X1_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: -1.1365290070672298
MAE: 11.596263068372004
MSE: 208.52753906862728
RMSE: 14.440482646664801


### Random Forest

In [15]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=5)
rf.fit(X1_train, y1_train)
y1_pred = rf.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.2402455519282629
MAE: 8.00138888888889
MSE: 114.61840277777777
RMSE: 10.705998448429636


In [16]:
rf.fit(X2_train, y2_train)
y2_pred = rf.predict(X2_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: 0.12069970827964038
MAE: 7.22361111111111
MSE: 85.82065833333334
RMSE: 9.263943994505436


## 2) One-hot Encoded

In [None]:
X = seq_encoded.todense()
y1, y2 = data['carb.lag.delta'], data['toby.lag.delta']
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.3)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.3)

### Linear Regression

In [53]:
lr = LinearRegression()
lr.fit(X1_train, y1_train)
y1_pred = lr.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.19706966704240303
MAE: 8.815375519133855
MSE: 142.86047391392546
RMSE: 11.952425440634443


In [54]:
lr.fit(X2_train, y2_train)
y2_pred = lr.predict(X2_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: -0.027744502449325692
MAE: 8.736252612665318
MSE: 134.82723191507094
RMSE: 11.61151290379815


### Support Vector Machine

In [55]:
svm = svm.SVR(kernel = "linear", C=0.01)
svm.fit(X1_train, y1_train)
y1_pred = svm.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.19538765965343852
MAE: 8.798782319645754
MSE: 142.65974008921174
RMSE: 11.944025288369568


In [56]:
svm.fit(X2_train, y2_train)
y2_pred = svm.predict(X1_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: -0.9287409860405842
MAE: 12.704333328296098
MSE: 253.02670810619918
RMSE: 15.906813260555968


### Random Forest

In [51]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=5)
rf.fit(X1_train, y1_train)
y1_pred = rf.predict(X1_test)
print('carb')
print(f'R^2: {r2_score(y1_test, y1_pred)}')
print(f'MAE: {mean_absolute_error(y1_test, y1_pred)}')
print(f'MSE: {mean_squared_error(y1_test, y1_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y1_test, y1_pred))}')

carb
R^2: -0.2879026476236042
MAE: 8.804166666666667
MSE: 153.70064722222224
RMSE: 12.397606511832123


In [52]:
rf.fit(X2_train, y2_train)
y2_pred = rf.predict(X2_test)
print('toby')
print(f'R^2: {r2_score(y2_test, y2_pred)}')
print(f'MAE: {mean_absolute_error(y2_test, y2_pred)}')
print(f'MSE: {mean_squared_error(y2_test, y2_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y2_test, y2_pred))}')

toby
R^2: 0.27553835159599827
MAE: 7.347888888888887
MSE: 95.0403125
RMSE: 9.748862113087865
