In [1]:
import sys
sys.path
sys.path.append("/usr/local/lib/python3.7/site-packages")
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.impute import SimpleImputer
%matplotlib inline

In [2]:
# read responses
resp = pd.read_csv('../data/pseudo/responses-pseudo.csv')
resp.rename(columns = {'OriginalID':'id', 'LabID': 'strain'}, inplace = True)

In [3]:
# read gene sequences
src = SeqIO.parse('../data/pseudo/concatenated.fasta', 'fasta')
seq = [(record.id, record.seq._data) for record in src]
seq_df = pd.DataFrame(data = seq, columns = ['id', 'sequence'])
data = pd.merge(seq_df, resp, on = 'id')
data

Unnamed: 0,id,sequence,strain,carb,toby
0,TA151,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,210.0,True,False
1,IC1,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,55.0,False,False
2,A237,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,14.0,True,False
3,5920,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,,,
4,LiA96,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...,175.0,False,False
...,...,...,...,...,...
117,JD318,----------------------------------------------...,360.0,False,False
118,Jp238,----------------------------------------------...,126.0,False,False
119,Jp1303,----------------------------------------------...,134.0,False,False
120,JD304,----------------------------------------------...,351.0,False,False


# Encoding

In [4]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [5]:
# integer encoding
def base2integer(str):
    label_encoder = LabelEncoder()
    label_encoder.fit(['A','T','C','G','-'])
    for i in str.split():
        return label_encoder.transform(list(i))

def integer(series):
    label_encoded = series.apply(base2integer)
    return pd.DataFrame(label_encoded.to_dict()).transpose()

data.dropna(inplace=True)
label_encoded = integer(data['sequence'])

In [6]:
# one-hot encoding
def onehot(series):
    label_encoded = integer(series).to_numpy()   
    onehot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    onehot_encoded = onehot_encoder.fit_transform(label_encoded)
    return onehot_encoded

seq_encoded = onehot(data['sequence'])
seq_encoded

<119x814557 sparse matrix of type '<class 'numpy.float64'>'
	with 57516627 stored elements in Compressed Sparse Row format>

# Imputation

In [4]:
seq_imputed = data['sequence'].replace('-', np.NaN)
naive = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
naive.fit(seq_encoded)

In [5]:
# %timeit s = pd.DataFrame.from_dict(data['sequence'].apply(list).to_dict()).transpose()
# naive = lambda column: column.where(column!='-', column.value_counts().idxmax())
# seq_imputed = s.apply(naive)

# Training

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

## 1) Integer Encoded

In [60]:
X = label_encoded
y1, y2 = data['carb'].astype('bool'), data['toby'].astype('bool')
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, stratify=y1)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, stratify=y2)

### Logistic Regression

In [13]:
lr_l1 = LogisticRegression(C=1, tol=0.01, penalty='l1', solver='saga', class_weight='balanced')
lr_l1.fit(X1_train, y1_train)
y1_pred = lr_l1.predict(X1_test)
print('carb, penalty=l1')
print(f'Accuracy Score: {lr_l1.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')

carb, penalty=l1
Accuracy Score: 0.8333333333333334
F1 score: 0.6153846153846153
Recall score: 0.6666666666666666
Precision score: 0.5714285714285714


In [14]:
lr_l1.fit(X2_train, y2_train)
y2_pred = lr_l1.predict(X2_test)
print('toby, penalty=l1')
print(f'Accuracy Score: {lr_l1.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')

toby, penalty=l1
Accuracy Score: 0.7666666666666667
F1 score: 0.22222222222222224
Recall score: 0.5
Precision score: 0.14285714285714285


In [15]:
lr_l2 = LogisticRegression(C=1, tol=0.01, penalty='l2', solver='lbfgs', class_weight='balanced')
lr_l2.fit(X1_train, y1_train)
y1_pred = lr_l2.predict(X1_test)
print('carb, penalty=l2')
print(f'Accuracy Score: {lr_l2.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')

carb, penalty=l2
Accuracy Score: 0.8333333333333334
F1 score: 0.4444444444444444
Recall score: 0.3333333333333333
Precision score: 0.6666666666666666


In [16]:
lr_l2.fit(X2_train, y2_train)
y2_pred = lr_l2.predict(X2_test)
print('toby, penalty=l2')
print(f'Accuracy Score: {lr_l2.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')

toby, penalty=l2
Accuracy Score: 0.9
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0


In [17]:
lr_en = LogisticRegression(C=1, tol=0.01, penalty='elasticnet', solver='saga', l1_ratio=0.5, class_weight='balanced')
lr_en.fit(X1_train, y1_train)
y1_pred = lr_en.predict(X1_test)
print('carb, penalty=elasticnet')
print(f'Accuracy Score: {lr_en.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')

carb, penalty=elasticnet
Accuracy Score: 0.8
F1 score: 0.5714285714285715
Recall score: 0.6666666666666666
Precision score: 0.5


In [18]:
lr_en.fit(X2_train, y2_train)
y2_pred = lr_en.predict(X2_test)
print('toby, penalty=elasticnet')
print(f'Accuracy Score: {lr_en.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')



toby, penalty=elasticnet
Accuracy Score: 0.6333333333333333
F1 score: 0.15384615384615385
Recall score: 0.5
Precision score: 0.09090909090909091


### Support Vector Machine

In [20]:
svm = svm.SVC(gamma='auto', class_weight='balanced')
svm.fit(X1_train, y1_train)
y1_pred = svm.predict(X1_test)
print('carb')
print(f'Accuracy Score: {svm.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')

carb
Accuracy Score: 0.8333333333333334
F1 score: 0.6153846153846153
Recall score: 0.6666666666666666
Precision score: 0.5714285714285714


In [21]:
svm.fit(X2_train, y2_train)
y2_pred = svm.predict(X1_test)
print('toby')
print(f'Accuracy Score: {svm.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')

toby
Accuracy Score: 0.8333333333333334
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0


### Random Forest

In [61]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=5)
rf.fit(X1_train, y1_train)
y1_pred = rf.predict(X1_test)
print('carb')
print(f'Accuracy Score: {rf.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')

carb
Accuracy Score: 0.8666666666666667
F1 score: 0.6
Recall score: 0.5
Precision score: 0.75


In [65]:
rf.fit(X2_train, y2_train)
y2_pred = rf.predict(X2_test)
print('toby')
print(f'Accuracy Score: {rf.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')

toby
Accuracy Score: 0.9
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0


## 2) One-hot Encoded

In [66]:
X = seq_encoded.todense()
y1, y2 = data['carb'].astype('bool'), data['toby'].astype('bool')
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, stratify=y1)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, stratify=y2)

### Logistic Regression

In [41]:
lr_l1 = LogisticRegression(C=1, tol=0.01, penalty='l1', solver='saga', class_weight='balanced')
lr_l1.fit(X1_train, y1_train)
y1_pred = lr_l1.predict(X1_test)
print('carb, penalty=l1')
print(f'Accuracy Score: {lr_l1.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')
print(roc_auc_score(y1_test, y1_pred))

carb, penalty=l1
Accuracy Score: 0.8
F1 score: 0.4
Recall score: 0.3333333333333333
Precision score: 0.5
0.6249999999999999


In [42]:
lr_l1 = LogisticRegression(C=1, tol=0.01, penalty='l1', solver='saga', class_weight='balanced')
lr_l1.fit(X2_train, y2_train)
y2_pred = lr_l1.predict(X2_test)
print('toby, penalty=l1')
print(f'Accuracy Score: {lr_l1.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y2_test, y2_pred))

toby, penalty=l1
Accuracy Score: 0.7333333333333333
F1 score: 0.2
Recall score: 0.5
Precision score: 0.125
0.625


In [44]:
lr_l2 = LogisticRegression(C=1, tol=0.01, penalty='l2', solver='lbfgs', class_weight='balanced')
lr_l2.fit(X1_train, y1_train)
y1_pred = lr_l2.predict(X1_test)
print('carb, penalty=l2')
print(f'Accuracy Score: {lr_l2.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')
print(roc_auc_score(y1_test, y1_pred))

carb, penalty=l2
Accuracy Score: 0.7666666666666667
F1 score: 0.2222222222222222
Recall score: 0.16666666666666666
Precision score: 0.3333333333333333
0.5416666666666666


In [45]:
lr_l2.fit(X2_train, y2_train)
y2_pred = lr_l2.predict(X2_test)
print('toby, penalty=l2')
print(f'Accuracy Score: {lr_l2.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')
print(roc_auc_score(y2_test, y2_pred))

toby, penalty=l2
Accuracy Score: 0.9
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0
0.48214285714285715


In [46]:
lr_en = LogisticRegression(C=1, tol=0.01, penalty='elasticnet', solver='saga', l1_ratio=0.5, class_weight='balanced')
lr_en.fit(X1_train, y1_train)
y1_pred = lr_en.predict(X1_test)
print('carb, penalty=elasticnet')
print(f'Accuracy Score: {lr_en.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')
print(roc_auc_score(y1_test, y1_pred))

carb, penalty=elasticnet
Accuracy Score: 0.8
F1 score: 0.4
Recall score: 0.3333333333333333
Precision score: 0.5
0.6249999999999999


In [71]:
lr_en.fit(X2_train, y2_train)
y2_pred = lr_en.predict(X2_test)
print('toby, penalty=elasticnet')
print(f'Accuracy Score: {lr_en.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')
print(roc_auc_score(y2_test, y2_pred))

toby, penalty=elasticnet
Accuracy Score: 0.9333333333333333
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0
0.5


  _warn_prf(average, modifier, msg_start, len(result))


### Support Vector Machine

In [56]:
from sklearn import svm
svm = svm.SVC(gamma='auto', class_weight='balanced')
svm.fit(X1_train, y1_train)
y1_pred = svm.predict(X1_test)
print('carb')
print(f'Accuracy Score: {svm.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')
print(roc_auc_score(y1_test, y1_pred))

carb
Accuracy Score: 0.7666666666666667
F1 score: 0.3636363636363636
Recall score: 0.3333333333333333
Precision score: 0.4
0.6041666666666666


In [50]:
svm.fit(X2_train, y2_train)
y2_pred = svm.predict(X1_test)
print('toby')
print(f'Accuracy Score: {svm.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')
print(roc_auc_score(y2_test, y2_pred))

toby
Accuracy Score: 0.7333333333333333
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0
0.44642857142857145


### Random Forest

In [58]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=5)
rf.fit(X1_train, y1_train)
y1_pred = rf.predict(X1_test)
print('carb')
print(f'Accuracy Score: {rf.score(X1_test, y1_test)}')
print(f'F1 score: {f1_score(y1_test, y1_pred)}')
print(f'Recall score: {recall_score(y1_test, y1_pred)}')
print(f'Precision score: {precision_score(y1_test, y1_pred)}')
print(roc_auc_score(y1_test, y1_pred))

carb
Accuracy Score: 0.8
F1 score: 0.25
Recall score: 0.16666666666666666
Precision score: 0.5
0.5625


In [72]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=5)
rf.fit(X2_train, y2_train)
y2_pred = rf.predict(X2_test)
print('toby')
print(f'Accuracy Score: {rf.score(X2_test, y2_test)}')
print(f'F1 score: {f1_score(y2_test, y2_pred)}')
print(f'Recall score: {recall_score(y2_test, y2_pred)}')
print(f'Precision score: {precision_score(y2_test, y2_pred)}')
print(roc_auc_score(y2_test, y2_pred))

toby
Accuracy Score: 0.9333333333333333
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0
0.5


  _warn_prf(average, modifier, msg_start, len(result))
