# Investigating regression - ZScales featurisation data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import AlignIO
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Seed passed as random_state to train_test_split so the train/test split is reproducible.
SEED = 25

In [2]:
# Load the pickled dataset and turn its existing index into a regular column,
# leaving a fresh 0..n-1 index on the frame.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('ActiveSiteSeqs SeqVec.pkl').reset_index()

In [3]:
# Sign the e.e. values before regression modelling: rows whose enantiomer label
# is 'R' get the negated e.e., every other row keeps its value.
# The enantiomer label is read by position (column 4 of the frame), exactly as
# the original loop did. The number of rows now comes from the frame itself
# rather than a hard-coded 205, so the cell keeps working when the dataset
# grows or shrinks.
enantiomer_labels = df.iloc[:, 4]
new_ee = [
    -ee_value if label == 'R' else ee_value
    for ee_value, label in zip(df['ee'], enantiomer_labels)
]

In [8]:
# Overwrite the 'ee' column with the signed values and preview the first two rows.
# NOTE(review): this mutates df in place; re-running the sign-flipping cell after
# this one would negate the already-negative R entries again, so run these cells
# once and in order (or reload df first).
df['ee'] = new_ee
df.head(2)

Unnamed: 0,index,IRED No.,Reaction 2,ee,Enantiomer,enantiomer binary,Sequence,ActiveSiteSequence,CofactorSiteSequence,ActiveSiteCharge,NumOfAcidicRes,NumOfBasicRes,NumOfHisRes,AllActiveSiteRes,embedding,em_per_protein
0,0,1,"62% (33% ee, S)",33.0,S,1.0,MSTKKVTVIGLGSLGSALAAALLRTGHDVTVWNRTPEKAEALVAQG...,WGMYASSINVALILSAVMAVPADLLLYGYL,WMASSIVAKIGLGSLGSALWNRTPEKVVCVFDTEAARELLNLTSGG...,-1.0,1.0,0.0,0.0,"[('204', 'TRP'), ('207', 'GLY'), ('208', 'MET'...","[[[0.16993958, -0.03258724, 0.05482708, -0.085...","[0.040156763, -0.117751405, -0.030865876, 0.09..."
1,1,2,"67% (46% ee, S)",46.0,S,1.0,MTDTSAKLTLLGLGAMGSALATAWLAADYDITVWNRTASRAEPLRT...,WAMYTSNMMEGNMTMTGIMAVPPMDVLLSMTF,WTSNMEGTLLGLGAMGSALWNRTASRAACLLDDASVSTLNLTTGGG...,-2.0,2.0,0.0,0.0,"[('209', 'TRP'), ('212', 'ALA'), ('213', 'MET'...","[[[0.16993958, -0.032587238, 0.05482708, -0.08...","[0.05249873, -0.11437141, -0.12927642, -0.0184..."


In [9]:
# Read the Clustal multiple alignment and lay it out as a 2-D numpy array of
# single characters: one row per aligned record, one column per alignment position.
alignment = AlignIO.read('aligned ireds.clustal', 'clustal')
aligned_rows = [list(record) for record in alignment]
align_array = np.array(aligned_rows, dtype=np.str_)
print('Array shape %i by %i' % align_array.shape)

Array shape 383 by 1000


In [10]:
# Collect the record ids from the alignment file, in file order; these ids are
# used later as the keys that tie each aligned sequence to a dataframe entry.
align_index = AlignIO.read('aligned ireds.clustal', 'clustal')
ired_names = [record.id for record in align_index]

In [11]:
# Lookup table of z5 scores per amino acid, plus entries for the gap symbols.
# Every entry has six values: the five z-scores followed by a binary gap flag
# (0.0 for an amino acid, 1 for a gap), so gaps in the aligned sequences can be
# told apart from residues.
z5_dict = {
    residue: [*scores, 0.0]
    for residue, scores in {
        'A': (0.24, -2.32, 0.60, -0.14, 1.30),
        'R': (3.52, 2.50, -3.50, 1.99, -0.17),
        'N': (3.05, 1.62, 1.04, -1.15, 1.61),
        'D': (3.98, 0.93, 1.93, -2.46, 0.75),
        'C': (0.84, -1.67, 3.71, 0.18, -2.65),
        'Q': (1.75, 0.50, -1.44, -1.34, 0.66),
        'E': (3.11, 0.26, -0.11, -3.04, -0.25),
        'G': (2.05, -4.06, 0.36, -0.82, -0.38),
        'H': (2.47, 1.95, 0.26, 3.90, 0.09),
        'I': (-3.89, -1.73, -1.71, -0.84, 0.26),
        'L': (-4.28, -1.30, -1.49, -0.72, 0.84),
        'K': (2.29, 0.89, -2.49, 1.49, 0.31),
        'M': (-2.85, -0.22, 0.47, 1.94, -0.98),
        'F': (-4.22, 1.94, 1.06, 0.54, -0.62),
        'P': (-1.66, 0.27, 1.84, 0.70, 2.00),
        'S': (2.39, -1.07, 1.15, -1.39, 0.67),
        'T': (0.75, -2.18, -1.12, -1.46, -0.40),
        'W': (-4.36, 3.94, 0.59, 3.44, -1.59),
        'Y': (-2.54, 2.44, 0.43, 0.04, -1.47),
        'V': (-2.59, -2.64, -1.54, -0.85, -0.02),
    }.items()
}
# Gap symbols: all five scores are zero and the gap flag is set.
z5_dict.update({gap_symbol: [0.0, 0.0, 0.0, 0.0, 0.0, 1] for gap_symbol in ('-', '*')})

In [12]:
# Featurise every aligned sequence: each alignment position contributes its
# six-value entry from z5_dict, and the entries are concatenated into one flat
# feature vector per record, stored under that record's id.
if len(ired_names) != len(align_array):
    raise ValueError(
        'ired_names has %d entries but align_array has %d rows'
        % (len(ired_names), len(align_array))
    )

feat_seqs = {}
for ired_name, sequence in zip(ired_names, align_array):
    single_feat = []
    for position, aa in enumerate(sequence):
        # A symbol missing from z5_dict used to surface as an opaque
        # "'NoneType' object is not iterable"; report what and where instead.
        if aa not in z5_dict:
            raise ValueError(
                'Sequence %r has symbol %r at alignment column %d, '
                'which has no entry in z5_dict' % (ired_name, str(aa), position)
            )
        single_feat.extend(z5_dict[aa])
    feat_seqs[ired_name] = single_feat

In [13]:
# Map each IRED number (converted to a string) to its signed e.e. value.
# If an IRED number appears on several rows, the value from the last such row
# is the one that remains in the dictionary.
labels = {
    str(ired_no): ee_value
    for ired_no, ee_value in zip(df['IRED No. '], df['ee'])
}

In [14]:
# Build the target list (e.e.) and the descriptor list (flattened z-scale
# features) in the same order, one entry per labelled IRED.
# A labelled IRED with no featurised sequence is reported here, instead of
# silently placing None into the descriptor list and failing later at fit time.
missing_ireds = [ired_name for ired_name in labels if ired_name not in feat_seqs]
if missing_ireds:
    raise KeyError(
        'No aligned sequence features for IRED(s): %s' % ', '.join(missing_ireds)
    )

ee_list = [labels[ired_name] for ired_name in labels]
descrip_list = [feat_seqs[ired_name] for ired_name in labels]

In [15]:
# Descriptors are the model inputs, signed e.e. values the targets; hold out a
# quarter of the items as the test set, with a fixed seed for reproducibility.
X, y = descrip_list, ee_list

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=SEED,
)

In [16]:
# Inspect the signed e.e. targets (this displays the entire list).
ee_list

[33.0,
 46.0,
 35.0,
 35.0,
 75.0,
 90.0,
 87.0,
 83.0,
 99.0,
 45.0,
 95.0,
 43.0,
 74.0,
 89.0,
 99.0,
 96.0,
 0.69,
 -7.0,
 79.0,
 -21.0,
 90.0,
 75.0,
 89.0,
 95.0,
 -93.0,
 93.0,
 87.0,
 93.0,
 89.0,
 95.0,
 59.0,
 93.0,
 -27.0,
 90.0,
 -90.0,
 70.0,
 79.0,
 99.0,
 98.0,
 98.0,
 59.0,
 90.0,
 98.0,
 90.0,
 90.0,
 90.0,
 95.0,
 83.0,
 70.0,
 91.0,
 89.0,
 79.0,
 87.0,
 -75.0,
 -40.0,
 87.0,
 81.0,
 79.0,
 99.0,
 99.0,
 99.0,
 87.0,
 99.0,
 99.0,
 99.0,
 99.0,
 99.0,
 99.0,
 90.0,
 99.0,
 99.0,
 96.0,
 99.0,
 -97.0,
 99.0,
 -65.0,
 85.0,
 99.0,
 99.0,
 -40.0,
 99.0,
 89.0,
 99.0,
 91.0,
 99.0,
 98.0,
 91.0,
 98.0,
 91.0,
 99.0,
 98.0,
 83.0,
 -1.0,
 99.0,
 85.0,
 99.0,
 99.0,
 99.0,
 99.0,
 -73.0,
 99.0,
 -93.0,
 37.0,
 1.0,
 18.0,
 1.0,
 -2.0,
 -2.0,
 -95.0,
 -98.0,
 33.0,
 4.0,
 1.0,
 -81.0,
 97.0,
 -89.0,
 -73.0,
 -31.0,
 -99.0,
 43.0,
 99.0,
 99.0,
 -97.0,
 99.0,
 99.0,
 15.0,
 -97.0,
 98.0,
 99.0,
 -91.0,
 99.0,
 99.0,
 99.0,
 99.0,
 99.0,
 55.0,
 83.0,
 91.0,
 -95.0,
 -97.0,
 

## Linear, ridge and lasso regression models

In [17]:
# Fit three linear models on the same train/test split and collect their test
# metrics in results_df (rows: mean squared error and r2 score; one column per
# model). The fit/predict/score steps are shared through one helper instead of
# being repeated per model.
def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the fit data and score it on the evaluation data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_fit, y_fit : data used to fit the model
    X_eval, y_eval : data used to compute the metrics

    Returns
    -------
    tuple
        ``(predictions, [mse, r2])`` where ``predictions`` is
        ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    scores = [
        mean_squared_error(y_eval, predictions),
        r2_score(y_eval, predictions),
    ]
    return predictions, scores


# Linear regression model
reg = LinearRegression()
y_pred, (reg_mse, reg_r2_score) = _fit_and_score(reg, X_train, y_train, X_test, y_test)

reg_metrics = {'Linear regression': [reg_mse, reg_r2_score]}

results_df = pd.DataFrame(reg_metrics, index=['Mean squared error', 'r2 score'])

# Ridge regression model
ridge = Ridge(alpha=1.0)
y_pred_ridge, ridge_scores = _fit_and_score(ridge, X_train, y_train, X_test, y_test)
ridge_mse, ridge_r2_score = ridge_scores
results_df['Ridge'] = ridge_scores

# Lasso regression model
# NOTE(review): the recorded output of this cell contains a message emitted from
# the coordinate-descent solver; presumably a convergence warning - confirm, and
# consider a larger max_iter if so.
lasso = Lasso(alpha=0.2)
y_pred_lasso, lasso_scores = _fit_and_score(lasso, X_train, y_train, X_test, y_test)
lasso_mse, lasso_r2_score = lasso_scores

results_df['Lasso'] = lasso_scores

  model = cd_fast.enet_coordinate_descent(


In [18]:
# Show the metric table. In the recorded output the unregularised linear model's
# error is many orders of magnitude larger than that of Ridge and Lasso.
results_df

Unnamed: 0,Linear regression,Ridge,Lasso
Mean squared error,4.394796e+25,3035.247554,2961.99769
r2 score,-8.384122e+21,0.420954,0.434928


In [84]:
# Pair actual and predicted e.e. values side by side for the lasso and ridge
# models: one row per test item, column 0 = actual, column 1 = predicted.
y_lasso = np.stack((y_test, y_pred_lasso), axis=1)
y_ridge = np.stack((y_test, y_pred_ridge), axis=1)

In [74]:
# Recover the IRED reference number of every test-set item by matching its
# feature vector against feat_seqs.
# A reverse lookup keyed on the feature vector replaces the nested scan; when
# several IREDs share a vector the first one in feat_seqs wins, which is what
# the original first-match search returned.
# NOTE(review): IREDs with identical aligned sequences have identical vectors,
# so the recovered number can belong to a different IRED with the same features.
features_to_ired = {}
for ired_no, features in feat_seqs.items():
    features_to_ired.setdefault(tuple(features), ired_no)

ired_list = []
for test_features in X_test:
    lookup_key = tuple(test_features)
    if lookup_key not in features_to_ired:
        # The old loop appended whatever key it had scanned last in this case,
        # silently attaching the wrong IRED number to the test item.
        raise ValueError(
            'A test-set feature vector does not match any entry in feat_seqs'
        )
    ired_list.append(features_to_ired[lookup_key])

In [87]:
# Squared error for each test item (squared difference of actual and predicted).
# The values are computed directly from y_test and the model predictions rather
# than by overwriting y_lasso / y_ridge with a transform of themselves: that
# form was not re-runnable, because a second execution took np.diff of an
# (n, 1) array and produced empty rows.
# The (n, 1) shape is kept so each test item still maps to a one-element array.
y_lasso = np.square(np.asarray(y_pred_lasso) - np.asarray(y_test)).reshape(-1, 1)
y_ridge = np.square(np.asarray(y_pred_ridge) - np.asarray(y_test)).reshape(-1, 1)

In [80]:
# Lasso model: dictionary with the IRED number as key and that test item's
# squared error as value.
lasso_results = {
    ired_no: squared_error
    for ired_no, squared_error in zip(ired_list, y_lasso)
}

In [93]:
# Ridge model: dictionary with the IRED number as key and that test item's
# squared error as value. Kept in its own cell because the author found
# ridge_results unpopulated when it was built alongside lasso_results.
# NOTE(review): the cause is not visible in this cell - check the contents of
# y_ridge and ired_list at the time this runs.
ridge_results = {
    ired_no: squared_error
    for ired_no, squared_error in zip(ired_list, y_ridge)
}

In [77]:
# Display the per-IRED squared errors of the lasso model (whole dictionary).
lasso_results

{'51': array([839.81380595]),
 '197': array([1607.58785981]),
 '356': array([1980.80643103]),
 '69': array([1409.66934442]),
 '106': array([3.31035322]),
 '178': array([4931.94721369]),
 '92': array([14.93002535]),
 '243': array([626.14282665]),
 '248': array([60.86258033]),
 '202': array([2270.55436801]),
 '94': array([64.91569069]),
 '330': array([552.26025036]),
 '118': array([2805.83757425]),
 '149': array([66.9573207]),
 '66': array([1014.6775107]),
 '115': array([14259.89395298]),
 '175': array([5128.3265961]),
 '208': array([14497.26577134]),
 '194': array([1743.59960057]),
 '170': array([7840.2573835]),
 '259': array([2919.99286336]),
 '271': array([2482.10646984]),
 '307': array([12.74735397]),
 '267': array([1282.40536113]),
 '13': array([514.91219739]),
 '112': array([1651.05200355]),
 '155': array([805.0171699]),
 '8': array([1034.34376402]),
 '134': array([3579.32316599]),
 '78': array([7667.16742092]),
 '162': array([8270.44556073]),
 '295': array([230.44076033]),
 '67': 

In [94]:
# Display the per-IRED squared errors of the ridge model (whole dictionary).
ridge_results

{'51': array([185.82205354]),
 '197': array([1797.414702]),
 '356': array([1852.22451679]),
 '69': array([3562.37446124]),
 '106': array([1.01052891]),
 '178': array([1617.69464281]),
 '92': array([2.47347463]),
 '243': array([214.84031313]),
 '248': array([627.17417123]),
 '202': array([3283.82995726]),
 '94': array([80.92099376]),
 '330': array([6.83217381]),
 '118': array([7274.90498845]),
 '149': array([2315.26079032]),
 '66': array([971.95667361]),
 '115': array([9790.07747232]),
 '175': array([7139.01971242]),
 '208': array([18445.46830112]),
 '194': array([22.76302101]),
 '170': array([1714.88204256]),
 '259': array([8857.92830892]),
 '271': array([589.25702957]),
 '307': array([8.15016223]),
 '267': array([44.62779851]),
 '13': array([510.96779987]),
 '112': array([4629.7599782]),
 '155': array([3.69479671]),
 '8': array([80.02360588]),
 '134': array([2190.18955605]),
 '78': array([4220.54916298]),
 '162': array([9599.75113449]),
 '295': array([451.62533676]),
 '67': array([176