In [2]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.impute import SimpleImputer
%matplotlib inline

In [9]:
resp = pd.read_csv('../data/staph/nrs_metadata3.txt', sep='\t')
resp.rename(columns = {'sample_tag':'id', 'Total.Area':'area'}, inplace=True)
src = SeqIO.parse('../data/staph/core_gene_alignment-narsa.aln', 'fasta')
seq = [(record.id, record.seq._data.upper()) for record in src]
seq_df = pd.DataFrame(data = seq, columns = ['id', 'sequence'])
data = pd.merge(seq_df, resp, on = 'id')[['id', 'sequence', 'area']]
data

Unnamed: 0,id,sequence,area
0,NRS001,ATGAACATTTATGATGAATATAGAAGTTATTTAATAGAAGAACTGG...,0.000000
1,NRS002,----------------------------------------------...,0.000000
2,NRS003,ATGAACATTTATGATGAATATAGAAGTTATTTAATAGAAGAACTGG...,0.000000
3,NRS021,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,473.151678
4,NRS022,ATGAACATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,6686.805947
...,...,...,...
119,NRS272,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,15672.551160
120,NRS275,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,9560.675684
121,NRS383,ATGAACATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,2600.718086
122,NRS386,ATGAAAATTTATGATGAATATAGAAGTTATGTAATAGAAGAACTGG...,20275.395700


# Encoding

In [5]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [10]:
# integer encoding
def base2integer(str):
    label_encoder = LabelEncoder()
    label_encoder.fit(['A','T','C','G','N','-'])
    for i in str.split():
        return label_encoder.transform(list(i))

def integer(series):
    label_encoded = series.apply(base2integer)
    return pd.DataFrame(label_encoded.to_dict()).transpose()

data.dropna(inplace=True)
label_encoded = integer(data['sequence'])

In [11]:
# one-hot encoding
def onehot(series):
    label_encoded = integer(series).to_numpy()   
    onehot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    onehot_encoded = onehot_encoder.fit_transform(label_encoded)
    return onehot_encoded

seq_encoded = onehot(data['sequence'])
seq_encoded

<124x1509775 sparse matrix of type '<class 'numpy.float64'>'
	with 121902912 stored elements in Compressed Sparse Row format>

# Imputation

In [12]:
seq_imputed = data['sequence'].replace('-', np.NaN)
naive = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
naive.fit(seq_encoded)

# Training

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

## 1) Integer Encoded

In [15]:
X = label_encoded
y = data['area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Linear Regression

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: -0.7812493268386256
MAE: 9438.549066078487
MSE: 167417788.21735057
RMSE: 12939.002597470586


### Support Vector Machine

In [18]:
svm = svm.SVR(kernel = "linear", C=0.01)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: -0.007576700094627409
MAE: 7018.458236966264
MSE: 94701095.4881678
RMSE: 9731.448786700148


### Random Forest

In [19]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: -0.2218611771031498
MAE: 7976.63064048964
MSE: 114841472.6097411
RMSE: 10716.411368071922


## 2) One-hot Encoded

In [20]:
X = seq_encoded.todense()
y = data['area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Linear Regression

In [21]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: -0.004940734754247611
MAE: 8567.093176603788
MSE: 118900184.01109944
RMSE: 10904.136096504822


### Support Vector Machine

In [23]:
from sklearn import svm
svm = svm.SVR(kernel = "linear", C=0.01)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: 0.02134396305167041
MAE: 8249.866790727183
MSE: 115790293.74820343
RMSE: 10760.589842020901


### Random Forest

In [24]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f'R^2: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

R^2: 0.16807169661889998
MAE: 7777.79309452307
MSE: 98430111.28436741
RMSE: 9921.195053236652
