# Investigating regression - R enantiomer data

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import seaborn as sns

sns.set(rc={'figure.figsize': (5, 5)})
sns.set(font_scale=1.5)
sns.set_style('whitegrid')
sns.set_theme()
%config InlineBackend.figure_format = 'retina'

In [6]:
df = pd.read_pickle('ActiveSiteSeqs SeqVec.pkl')
df.set_index('IRED No. ', inplace=True)
df.head(2)

Unnamed: 0_level_0,Reaction 2,ee,Enantiomer,enantiomer binary,Sequence,ActiveSiteSequence,CofactorSiteSequence,ActiveSiteCharge,NumOfAcidicRes,NumOfBasicRes,NumOfHisRes,AllActiveSiteRes,embedding,em_per_protein
IRED No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,"62% (33% ee, S)",33.0,S,1.0,MSTKKVTVIGLGSLGSALAAALLRTGHDVTVWNRTPEKAEALVAQG...,WGMYASSINVALILSAVMAVPADLLLYGYL,WMASSIVAKIGLGSLGSALWNRTPEKVVCVFDTEAARELLNLTSGG...,-1.0,1.0,0.0,0.0,"[('204', 'TRP'), ('207', 'GLY'), ('208', 'MET'...","[[[0.16993958, -0.03258724, 0.05482708, -0.085...","[0.040156763, -0.117751405, -0.030865876, 0.09..."
2,"67% (46% ee, S)",46.0,S,1.0,MTDTSAKLTLLGLGAMGSALATAWLAADYDITVWNRTASRAEPLRT...,WAMYTSNMMEGNMTMTGIMAVPPMDVLLSMTF,WTSNMEGTLLGLGAMGSALWNRTASRAACLLDDASVSTLNLTTGGG...,-2.0,2.0,0.0,0.0,"[('209', 'TRP'), ('212', 'ALA'), ('213', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.05249873, -0.11437141, -0.12927642, -0.0184..."


In [7]:
r_df = df[df['Enantiomer'] == 'R']

In [8]:
r_df.head(2)

Unnamed: 0_level_0,Reaction 2,ee,Enantiomer,enantiomer binary,Sequence,ActiveSiteSequence,CofactorSiteSequence,ActiveSiteCharge,NumOfAcidicRes,NumOfBasicRes,NumOfHisRes,AllActiveSiteRes,embedding,em_per_protein
IRED No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
28,"66% (7% ee, R)",7.0,R,0.0,MTTISILGLGAMGTALANALLDAGHAVTVWNRTPGKDEALVSRGAR...,WSMLEGSNMQAAMQMSMTGIMAVPMFDILLGLDL,WGSNMAAQMLGLGAMGTALWNRTPGKPICLVDYAGVETMLNLTTGG...,-3.0,3.0,0.0,0.0,"[('205', 'TRP'), ('208', 'SER'), ('209', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.07435445, -0.10566954, -0.17261764, -0.0395..."
33,">99% (21% ee, R)",21.0,R,0.0,MENSPVTVFGLGAMGTALATQFLRKHHRTTVWNRTAAKAKSLIASG...,WAMHGSSMQHAVNVMSNGIMAVPMDLLLSMYF,WGSSMHANFGLGAMGTALWNRTAAKAIICQSNKDSVLQTLDLTNGH...,-1.0,1.0,0.0,2.0,"[('208', 'TRP'), ('211', 'ALA'), ('212', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.07825775, -0.08525519, -0.14122419, 0.00613..."


In [9]:
X = list(r_df['em_per_protein'])
y = r_df['ee']

SEED=25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)

In [12]:
reg = LinearRegression()
reg.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = reg.predict(X_test)

# The coefficients
reg_mse = mean_squared_error(y_test, y_pred)
# The coefficient of determination: 1 is perfect prediction
reg_r2_score = r2_score(y_test, y_pred)

reg_metrics = {'Linear regression':[reg_mse, reg_r2_score]}

full_sequence = pd.DataFrame(reg_metrics, index=['Mean squared error', 'r2 score'])

## Ridge

In [14]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)

ridge.fit(X_train, y_train)

# Make predictions using the testing set
y_pred_ridge = ridge.predict(X_test)

# The mean squared error
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
# The coefficient of determination: 1 is perfect prediction
ridge_r2_score = r2_score(y_test, y_pred_ridge)

ridge_scores = [ridge_mse, ridge_r2_score]

full_sequence['Ridge'] = ridge_scores

## Lasso

In [28]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.2)

lasso.fit(X_train, y_train)

# Make predictions using the testing set
y_pred_lasso = lasso.predict(X_test)

# The mean squared error
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
# The coefficient of determination: 1 is perfect prediction
lasso_r2_score = r2_score(y_test, y_pred_lasso)

lasso_scores = [lasso_mse, lasso_r2_score]

full_sequence['Lasso'] = lasso_scores

In [29]:
full_sequence

Unnamed: 0,Linear regression,Ridge,Lasso
Mean squared error,1170.719142,1162.052983,806.319993
r2 score,-0.050067,-0.042294,0.276778


## Using active site sequence data

In [32]:
as_df = pd.read_pickle('Active site and cofactor seqs.pkl')
as_df.set_index('IRED No. ', inplace=True)
as_df.head(2)

Unnamed: 0_level_0,Reaction 2,ee,Enantiomer,enantiomer binary,Sequence,ActiveSiteSequence,CofactorSiteSequence,ActiveSiteCharge,NumOfAcidicRes,NumOfBasicRes,NumOfHisRes,AllActiveSiteRes,as_embedding,as_em_per_protein,cf_embedding,cf_em_per_protein
IRED No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,"62% (33% ee, S)",33.0,S,1.0,MSTKKVTVIGLGSLGSALAAALLRTGHDVTVWNRTPEKAEALVAQG...,WGMYASSINVALILSAVMAVPADLLLYGYL,WMASSIVAKIGLGSLGSALWNRTPEKVVCVFDTEAARELLNLTSGG...,-1.0,1.0,0.0,0.0,"[('204', 'TRP'), ('207', 'GLY'), ('208', 'MET'...","[[[0.16993958, -0.03258724, 0.05482708, -0.085...","[0.040156763, -0.117751405, -0.030865876, 0.09...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.04040188, -0.06891214, -0.039861344, 0.0456..."
2,"67% (46% ee, S)",46.0,S,1.0,MTDTSAKLTLLGLGAMGSALATAWLAADYDITVWNRTASRAEPLRT...,WAMYTSNMMEGNMTMTGIMAVPPMDVLLSMTF,WTSNMEGTLLGLGAMGSALWNRTASRAACLLDDASVSTLNLTTGGG...,-2.0,2.0,0.0,0.0,"[('209', 'TRP'), ('212', 'ALA'), ('213', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.05249873, -0.11437141, -0.12927642, -0.0184...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.0434668, -0.089176714, -0.1193058, 0.033180..."


In [33]:
r_as_df = as_df[as_df['Enantiomer'] == 'R']

In [34]:
r_as_df.head(2)

Unnamed: 0_level_0,Reaction 2,ee,Enantiomer,enantiomer binary,Sequence,ActiveSiteSequence,CofactorSiteSequence,ActiveSiteCharge,NumOfAcidicRes,NumOfBasicRes,NumOfHisRes,AllActiveSiteRes,as_embedding,as_em_per_protein,cf_embedding,cf_em_per_protein
IRED No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
28,"66% (7% ee, R)",7.0,R,0.0,MTTISILGLGAMGTALANALLDAGHAVTVWNRTPGKDEALVSRGAR...,WSMLEGSNMQAAMQMSMTGIMAVPMFDILLGLDL,WGSNMAAQMLGLGAMGTALWNRTPGKPICLVDYAGVETMLNLTTGG...,-3.0,3.0,0.0,0.0,"[('205', 'TRP'), ('208', 'SER'), ('209', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.07435445, -0.10566954, -0.17261764, -0.0395...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.036015548, -0.1774074, -0.08800504, 0.05545..."
33,">99% (21% ee, R)",21.0,R,0.0,MENSPVTVFGLGAMGTALATQFLRKHHRTTVWNRTAAKAKSLIASG...,WAMHGSSMQHAVNVMSNGIMAVPMDLLLSMYF,WGSSMHANFGLGAMGTALWNRTAAKAIICQSNKDSVLQTLDLTNGH...,-1.0,1.0,0.0,2.0,"[('208', 'TRP'), ('211', 'ALA'), ('212', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.07825775, -0.08525519, -0.14122419, 0.00613...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[-0.003708722, -0.12719315, -0.08284199, 0.059..."


In [40]:
X = list(as_df['as_em_per_protein'])
y = as_df['ee']

SEED=25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)

# Linear regression model
reg_as = LinearRegression()
reg_as.fit(X_train, y_train)

y_pred_as = reg_as.predict(X_test)
reg_as_mse = mean_squared_error(y_test, y_pred_as)
reg_as_r2 = r2_score(y_test, y_pred_as)

reg_as_metrics = {'Linear regression':[reg_as_mse, reg_as_r2]}
active_site = pd.DataFrame(reg_as_metrics, index=['Mean squared error', 'r2 score'])

# Ridge regression model
ridge_as = Ridge(alpha=1.0)

ridge_as.fit(X_train, y_train)

y_pred_ridge_as = ridge_as.predict(X_test)
ridge_as_mse = mean_squared_error(y_test, y_pred_ridge_as)
ridge_as_r2 = r2_score(y_test, y_pred_ridge_as)

ridge_as_scores = [ridge_as_mse, ridge_as_r2]
active_site['Ridge'] = ridge_as_scores

# Lasso regression model
lasso_as = Lasso(alpha=0.2)

lasso_as.fit(X_train, y_train)

# Make predictions using the testing set
y_pred_lasso_as = lasso_as.predict(X_test)

# The mean squared error
lasso_as_mse = mean_squared_error(y_test, y_pred_lasso_as)
# The coefficient of determination: 1 is perfect prediction
lasso_as_r2 = r2_score(y_test, y_pred_lasso_as)

lasso_as_scores = [lasso_as_mse, lasso_as_r2]
active_site['Lasso'] = lasso_as_scores

In [41]:
r enantiomer active_site

Unnamed: 0,Linear regression,Ridge,Lasso
Mean squared error,2040.012649,1015.904459,975.006625
r2 score,-1.037238,-0.014523,0.02632


In [39]:
r enantiomer full_sequence

Unnamed: 0,Linear regression,Ridge,Lasso
Mean squared error,1170.719142,1162.052983,806.319993
r2 score,-0.050067,-0.042294,0.276778
