In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.impute import SimpleImputer
%matplotlib inline

In [2]:
resp = pd.read_csv('../data/staph/responses-staph.csv')
resp.rename(columns = {'ids': 'id', 'resp': 'response'}, inplace = True)
src = SeqIO.parse('../data/staph/core_gene_alignment-narsa.aln', 'fasta')
seq = [(record.id, record.seq._data.upper()) for record in src]
seq_df = pd.DataFrame(data = seq, columns = ['id', 'sequence'])
data = pd.merge(seq_df, resp, on = 'id')
data

Unnamed: 0,id,sequence,response
0,NRS001,ATGAACATTTATGATGAATATAGAAGTTATTTAATAGAAGAACTGG...,False
1,NRS002,----------------------------------------------...,False
2,NRS003,ATGAACATTTATGATGAATATAGAAGTTATTTAATAGAAGAACTGG...,False
3,NRS021,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,False
4,NRS022,ATGAACATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,False
...,...,...,...
120,NRS272,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,False
121,NRS275,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,False
122,NRS383,ATGAACATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,False
123,NRS386,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,True


# Encoding

In [3]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [10]:
# integer encoding
def base2integer(str):
    label_encoder = LabelEncoder()
    label_encoder.fit(['A','T','C','G','N','-'])
    for i in str.split():
        return label_encoder.transform(list(i))

def integer(series):
    label_encoded = series.apply(base2integer)
    return pd.DataFrame(label_encoded.to_dict()).transpose()

data.dropna(inplace=True)
label_encoded = integer(data['sequence'])

In [11]:
# one-hot encoding
def onehot(series):
    label_encoded = integer(series).to_numpy()   
    onehot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    onehot_encoded = onehot_encoder.fit_transform(label_encoded)
    return onehot_encoded

seq_encoded = onehot(data['sequence'])
seq_encoded

<124x1509775 sparse matrix of type '<class 'numpy.float64'>'
	with 121902912 stored elements in Compressed Sparse Row format>

# Imputation

In [12]:
seq_imputed = data['sequence'].replace('-', np.NaN)
naive = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
naive.fit(seq_encoded)

# Training

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## 1) Integer Encoded

In [15]:
X  = seq_encoded.todense()
y = data['response'].astype('bool')
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

### Logistic Regression

In [16]:
lr_l1 = LogisticRegression(C=1, tol=0.01, penalty='l1', solver='saga', class_weight='balanced')
lr_l1.fit(X_train, y_train)
y_pred = lr_l1.predict(X_test)
print(f'Accuracy Score: {lr_l1.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.7096774193548387
F1 score: 0.4
Recall score: 0.6
Precision score: 0.3


In [19]:
lr_l2 = LogisticRegression(C=1, tol=0.01, penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=2000)
lr_l2.fit(X_train, y_train)
y_pred = lr_l2.predict(X_test)
print(f'Accuracy Score: {lr_l2.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.7419354838709677
F1 score: 0.3333333333333333
Recall score: 0.4
Precision score: 0.2857142857142857


In [20]:
lr_en = LogisticRegression(C=1, tol=0.01, penalty='elasticnet', solver='saga', l1_ratio=0.5, class_weight='balanced')
lr_en.fit(X_train, y_train)
y_pred = lr_en.predict(X_test)
print(f'Accuracy Score: {lr_en.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.7419354838709677
F1 score: 0.3333333333333333
Recall score: 0.4
Precision score: 0.2857142857142857


### Support Vector Machine

In [21]:
svm = svm.SVC(gamma='auto', class_weight='balanced')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(f'Accuracy Score: {svm.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.8387096774193549
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [22]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f'Accuracy Score: {rf.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.8709677419354839
F1 score: 0.5
Recall score: 0.4
Precision score: 0.6666666666666666


## 2) One-hot Encoded

In [23]:
X = seq_encoded.todense()
y = data['response'].astype('bool')
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

### Logistic Regression

In [24]:
lr_l1 = LogisticRegression(C=1, tol=0.01, penalty='l1', solver='saga', class_weight='balanced')
lr_l1.fit(X_train, y_train)
y_pred = lr_l1.predict(X_test)
print(f'Accuracy Score: {lr_l1.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.7419354838709677
F1 score: 0.5000000000000001
Recall score: 0.8
Precision score: 0.36363636363636365


In [25]:
lr_l2 = LogisticRegression(C=1, tol=0.01, penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=2000)
lr_l2.fit(X_train, y_train)
y_pred = lr_l2.predict(X_test)
print(f'Accuracy Score: {lr_l2.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.9032258064516129
F1 score: 0.6666666666666665
Recall score: 0.6
Precision score: 0.75


In [26]:
lr_en = LogisticRegression(C=1, tol=0.01, penalty='elasticnet', solver='saga', l1_ratio=0.5, class_weight='balanced')
lr_en.fit(X_train, y_train)
y_pred = lr_en.predict(X_test)
print(f'Accuracy Score: {lr_en.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.7419354838709677
F1 score: 0.5000000000000001
Recall score: 0.8
Precision score: 0.36363636363636365


### Support Vector Machine

In [28]:
from sklearn import svm
svm = svm.SVC(gamma='auto', class_weight='balanced')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(f'Accuracy Score: {svm.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.8387096774193549
F1 score: 0.0
Recall score: 0.0
Precision score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [29]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f'Accuracy Score: {rf.score(X_test, y_test)}')
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test, y_pred)}')
print(f'Precision score: {precision_score(y_test, y_pred)}')

Accuracy Score: 0.8709677419354839
F1 score: 0.5
Recall score: 0.4
Precision score: 0.6666666666666666
