####
Here I convert the protein sequences in to vectors using the sgt module. Use of sgt for feature extraction from protein sequences because it Captures Sequential Dependencies: By transforming sequences into graph-based features, SGT captures the dependencies and transitions between amino acids.

In [1]:
#import sys
#!{sys.executable} -m pip install sgt

In [2]:
import numpy as np
import pandas as pd
from sgt import SGT # sequence to graph transform

In [3]:
df = pd.read_csv('prot_lig_aff.csv')
df.head()

Unnamed: 0,protein_seq,ligand_cid,kd,ligand_smile
0,MVRSVAWAGFMVLLMIPWGSAAKLVCYFTNWAQYRQGEARFLPKDL...,46511426.0,0.0,C1=CC=C(C(=C1)C#N)C2=CC=C(C=C2)COC(=O)CN3C(=O)...
1,MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...,46511426.0,0.0,C1=CC=C(C(=C1)C#N)C2=CC=C(C=C2)COC(=O)CN3C(=O)...
2,MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...,46511426.0,0.0,C1=CC=C(C(=C1)C#N)C2=CC=C(C=C2)COC(=O)CN3C(=O)...
3,MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...,46511426.0,0.0,C1=CC=C(C(=C1)C#N)C2=CC=C(C=C2)COC(=O)CN3C(=O)...
4,MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...,46511426.0,0.0,C1=CC=C(C(=C1)C#N)C2=CC=C(C=C2)COC(=O)CN3C(=O)...


In [4]:
def split(word):
    '''convert sequences into lists of letters'''
    word = word.replace(" ", "")
    return [char for char in word]

In [5]:
X = df["protein_seq"]
X

0        MVRSVAWAGFMVLLMIPWGSAAKLVCYFTNWAQYRQGEARFLPKDL...
1        MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...
2        MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...
3        MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...
4        MRFATSTIVK VALLLSSLCV DAAVMWNRDT SSTDLEARAS SG...
                               ...                        
21874    MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...
21875    MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...
21876    MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...
21877    MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...
21878    MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...
Name: protein_seq, Length: 21879, dtype: object

In [6]:
sequences = [split(x) for x in X]
print(sequences[0])

['M', 'V', 'R', 'S', 'V', 'A', 'W', 'A', 'G', 'F', 'M', 'V', 'L', 'L', 'M', 'I', 'P', 'W', 'G', 'S', 'A', 'A', 'K', 'L', 'V', 'C', 'Y', 'F', 'T', 'N', 'W', 'A', 'Q', 'Y', 'R', 'Q', 'G', 'E', 'A', 'R', 'F', 'L', 'P', 'K', 'D', 'L', 'D', 'P', 'S', 'L', 'C', 'T', 'H', 'L', 'I', 'Y', 'A', 'F', 'A', 'G', 'M', 'T', 'N', 'H', 'Q', 'L', 'S', 'T', 'T', 'E', 'W', 'N', 'D', 'E', 'T', 'L', 'Y', 'Q', 'E', 'F', 'N', 'G', 'L', 'K', 'K', 'M', 'N', 'P', 'K', 'L', 'K', 'T', 'L', 'L', 'A', 'I', 'G', 'G', 'W', 'N', 'F', 'G', 'T', 'Q', 'K', 'F', 'T', 'D', 'M', 'V', 'A', 'T', 'A', 'N', 'N', 'R', 'Q', 'T', 'F', 'V', 'N', 'S', 'A', 'I', 'R', 'F', 'L', 'R', 'K', 'Y', 'S', 'F', 'D', 'G', 'L', 'D', 'L', 'D', 'W', 'E', 'Y', 'P', 'G', 'S', 'Q', 'G', 'S', 'P', 'A', 'V', 'D', 'K', 'E', 'R', 'F', 'T', 'T', 'L', 'V', 'Q', 'D', 'L', 'A', 'N', 'A', 'F', 'Q', 'Q', 'E', 'A', 'Q', 'T', 'S', 'G', 'K', 'E', 'R', 'L', 'L', 'L', 'S', 'A', 'A', 'V', 'P', 'A', 'G', 'Q', 'T', 'Y', 'V', 'D', 'A', 'G', 'Y', 'E', 'V', 'D', 'K', 'I',

In [7]:
corpus = pd.DataFrame([[i, sequences[i]] for i in range(len(sequences))], columns=['id', 'sequence'])
corpus.head()
# corpus dataframe

Unnamed: 0,id,sequence
0,0,"[M, V, R, S, V, A, W, A, G, F, M, V, L, L, M, ..."
1,1,"[M, R, F, A, T, S, T, I, V, K, V, A, L, L, L, ..."
2,2,"[M, R, F, A, T, S, T, I, V, K, V, A, L, L, L, ..."
3,3,"[M, R, F, A, T, S, T, I, V, K, V, A, L, L, L, ..."
4,4,"[M, R, F, A, T, S, T, I, V, K, V, A, L, L, L, ..."


In [8]:
# Learning the sgt embeddings as vector for  all sequences in a corpus.
# mode: 'default'
# sgt == sequence to graph transform

sgt = SGT(kappa=5, 
          flatten=True, 
          lengthsensitive=False, 
          mode='default')
ans = sgt.fit_transform(corpus)
ans

Unnamed: 0,id,"(#, #)","(#, &)","(#, -)","(#, 3)","(#, 5)","(#, 9)","(#, ;)","(#, A)","(#, C)",...,"(y, m)","(y, n)","(y, p)","(y, q)","(y, r)","(y, s)","(y, t)","(y, v)","(y, w)","(y, y)"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21874,21874.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21875,21875.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21876,21876.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21877,21877.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#ans.to_csv('protein_vecs.csv', index = False)

In [10]:
ans

Unnamed: 0,id,"(#, #)","(#, &)","(#, -)","(#, 3)","(#, 5)","(#, 9)","(#, ;)","(#, A)","(#, C)",...,"(y, m)","(y, n)","(y, p)","(y, q)","(y, r)","(y, s)","(y, t)","(y, v)","(y, w)","(y, y)"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21874,21874.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21875,21875.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21876,21876.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21877,21877.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#ans = sgt.fit_transform(corpus)
#ans
df_features = pd.DataFrame(ans)
df_features

Unnamed: 0,id,"(#, #)","(#, &)","(#, -)","(#, 3)","(#, 5)","(#, 9)","(#, ;)","(#, A)","(#, C)",...,"(y, m)","(y, n)","(y, p)","(y, q)","(y, r)","(y, s)","(y, t)","(y, v)","(y, w)","(y, y)"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21874,21874.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21875,21875.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21876,21876.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21877,21877.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Generate proper column names
num_features = df_features.shape[1]
column_names = [f'protein_feature_{i}' for i in range(1, num_features + 1)]
df_features.columns = column_names

# Combine the df_features with the original DataFrame 
df_final = pd.concat([df_features, df[['kd']]], axis=1)

# Display the final DataFrame with proper column names
print(df_final.tail())

       protein_feature_1  protein_feature_2  protein_feature_3   
21874            21874.0                0.0                0.0  \
21875            21875.0                0.0                0.0   
21876            21876.0                0.0                0.0   
21877            21877.0                0.0                0.0   
21878            21878.0                0.0                0.0   

       protein_feature_4  protein_feature_5  protein_feature_6   
21874                0.0                0.0                0.0  \
21875                0.0                0.0                0.0   
21876                0.0                0.0                0.0   
21877                0.0                0.0                0.0   
21878                0.0                0.0                0.0   

       protein_feature_7  protein_feature_8  protein_feature_9   
21874                0.0                0.0                0.0  \
21875                0.0                0.0                0.0   
21876   

In [13]:
df_final.to_csv('protein_vecs.csv', index = False)