In [1]:
# Dataset: http://plmd.biocuckoo.org/download.php (Glycation)
# Generates list of patterns that cause (1) and don't cause (0) Glycation
# 2019-07
#
# Done by Team LegIt
# Rifaz Nahiyan, Sourav Saha
# Rafid Uddin Bhuiyan, Nadira Anjum
#
# Find the whole repo at: https://github.com/rifazn/bioinformatics-uni

import pandas as pd
import numpy as np
import os

df = pd.read_csv('assets/Glycation.elm', sep='\t')

In [2]:
df.head()

Unnamed: 0,PLMD ID,Uniprot Accession,Position,Type,Sequence,Species,PMIDs
0,PLMD-5,O00141,41,Glycation,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,Homo sapiens,21612289
1,PLMD-17,O00186,19,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
2,PLMD-17,O00186,577,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
3,PLMD-27,O00220,370,Glycation,MAPPPARVHLGAFLAVTPNPGSAASGTEAAAATPSKVWGSSAGRIE...,Homo sapiens,21612289
4,PLMD-34,O00238,202,Glycation,MLLRSAGKLNVGTKKEDGESTAPTPRPKVLRCKCHHHCPEDSVNNI...,Homo sapiens,21612289


In [3]:
data= np.array(df)
data = data[:100]

In [4]:
def get_pattern(prot, center, radius):
    if center < radius:
        pattern = (radius - center) * 'X' + prot[0:center + radius + 1]
    elif len(prot)- center <= radius:
        pattern = prot[center - radius : len(prot)] + (radius - (len(prot)-center-1)) * 'X'
    else:
        pattern = prot[center-radius:center+radius+1]
    return pattern

In [5]:
processed_ones = set()
radius=7

for row in data:
    center = row[2] - 1
    pattern = get_pattern(row[4], center, radius)
    #processed_ones.append([row[0], pattern, 1])
    processed_ones.add(pattern)
print(list(processed_ones)[:5])

['DPSISFSKPKKKKSF', 'AHSLLYKKVNEAQYR', 'ALVKVLDKWPLRSGG', 'GILPILVKCLERDDN', 'FSSFLKEKLNDTYVN']


In [6]:
processed_zeros = set()

for row in data:
    pos = -1
    
    while True:
        pos = row[4].find('K', pos + 1)
        if pos == -1:
            break
        pattern = get_pattern(row[4], pos, radius)
        if pattern not in processed_ones:
            #processed_zeros.append([row[0], pattern, 0])
            processed_zeros.add(pattern)
print(list(processed_zeros))

['VEAQKLMKYFNEFLD', 'WILTRNYKALSKGSK', 'VIQSADSKTKENVNA', 'IFRPNQQKPLPVSRW', 'DILAEYLKYRQFPFQ', 'AVPPQACKAPSSNTD', 'GYSALEIKSKMLALE', 'LFIDVDCKHPEAILT', 'FSHAQVAKKNQLVIM', 'PAFLEFLKQEQEASP', 'MWFKNLDKLIRLVNA', 'KHLHEGAKSETAEEL', 'KRIHSGEKPYECDEC', 'YKTAQAVKDFNREKL', 'PPPGKSGKYYYQLNS', 'AILIAFMKQRRMGLN', 'TPKAMATKDKIDKWD', 'YKHLYANKLENLEEM', 'GTNIAAGKALGIVAT', 'APQWCQGKLQAHLVA', 'IDEKSLIKGKTHSQL', 'RKLHKLYKMAHKKRS', 'TFMDSKMKPLWIMYS', 'ESIEIDQKLQEIMKQ', 'XXXXMTVKTEAAKGT', 'RGNAERVKEGRLSFY', 'INERYLNKLLSSGSR', 'REAIISQKRLGCNGL', 'GSNTFTVKAQPSDNA', 'YRSFHTDKLGEYKQP', 'SSSEVKRKKHKDEDW', 'VDPKLLNKLHEYFHE', 'VGKSSSKKKKKFHKA', 'AHTDDDSKPEADGDS', 'VLVTASVKEAAEAFL', 'AVGKSSSKKKKKFHK', 'YYKATVTKTAWYWYQ', 'EDIYAAKKHMKKCSS', 'QKVKGLKKLENFKKK', 'YECHVCRKVLTSSRN', 'KENSAPVKLGGFGVA', 'MAEGGDLKPPTPAST', 'AESLSEFKPFFGNII', 'QAESEGDKCGLERDE', 'VGTQWRGKSSPKVGG', 'TDFEVLTKVLQEEPP', 'WEKPQELKEKEKLEE', 'VKLSISYKNNKLFIM', 'EEEKAAQKAKPVATA', 'KKTEKKKKKRQDISV', 'AVYMFSMKRCPPGIW', 'CGQKERLKFEGLCCH', 'VPQSQQGKPL

In [7]:
processed_data = [[pattern, '1'] for pattern in processed_ones]
processed_data += [[pattern, '0'] for pattern in processed_zeros]

print(f"Number of 1s: {len(processed_ones)}")
print(f"Number of 0s: {len(processed_zeros)}")

Number of 1s: 100
Number of 0s: 3588


In [8]:
from pprint import pprint
processed_data = np.array(processed_data)
pprint(processed_data)

array([['DPSISFSKPKKKKSF', '1'],
       ['AHSLLYKKVNEAQYR', '1'],
       ['ALVKVLDKWPLRSGG', '1'],
       ...,
       ['GDIEGSCKKLSPPPL', '0'],
       ['AKEKTGGKVRQGQSQ', '0'],
       ['KTAGLEEKSTACQML', '0']], dtype='<U15')


In [9]:
outdf = pd.DataFrame({"Pattern": processed_data[:, 0], "Glycation": processed_data[:,1]})

In [10]:
outdf

Unnamed: 0,Pattern,Glycation
0,DPSISFSKPKKKKSF,1
1,AHSLLYKKVNEAQYR,1
2,ALVKVLDKWPLRSGG,1
3,GILPILVKCLERDDN,1
4,FSSFLKEKLNDTYVN,1
5,ETAEELKKVAQELEE,1
6,KTALDDRKEGAVLAK,1
7,TGEGLERKDASLLDN,1
8,AKSHPPDKWAQGAGA,1
9,HSPSPSSKFSTKGLC,1


In [11]:
with open('assets/ones.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_ones))

In [12]:
with open('assets/zeros.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_zeros))

In [13]:
def get_vectors_from_pssm_files(PSSM_DIR: 'directory for the pssm files'):
    vectors = []

    total_files = len([name for name in os.listdir(PSSM_DIR) if name.startswith("PSSM")])
    for i in range(1, total_files+1):
        if i == 10:
            continue
        matrix = []
        with open(PSSM_DIR + f'/PSSM.{i}') as f:
            lines = f.readlines()
            for i in range(3, 3+15):
                line = lines[i].split()
                l = line[2:2+20]
                l = list(map(int, l))
                matrix.append(l)
            matrix = np.matrix(matrix)
            t_matrix = matrix.transpose()
            matrix = t_matrix * matrix
            matrix = matrix[np.tril_indices(len(matrix))]
            vec = matrix.tolist()[0]
            vectors.append(vec)
    return vectors

v = get_vectors_from_pssm_files('assets/outs/positives')

In [21]:
v_neg = get_vectors_from_pssm_files('assets/outs/negatives')
print(v_neg[random.randint(0, 5)])
print(len(v_neg))

[73, 23, 82, 16, 56, 67, 8, 48, 95, 212, 67, 62, 41, 17, 133, 24, 68, 61, 81, 43, 81, 19, 67, 72, 129, 21, 90, 129, 50, 29, 50, 54, 67, 32, 31, 108, -7, 39, 42, 27, 21, 36, 31, 5, 97, 34, 27, 2, -34, 97, -2, -29, 18, 1, 110, 19, 26, 1, -37, 85, 1, -26, 8, 8, 102, 100, 22, 83, 53, 54, 32, 71, 79, 21, 17, -5, -8, 109, 14, 11, -8, -41, 53, 2, -26, -8, -2, 78, 80, -12, 85, -47, -66, -89, -163, -9, -119, -151, -75, -11, 83, 83, -102, 57, 281, 54, 80, 68, 73, 104, 72, 72, 71, 32, 53, 45, 67, 20, -78, 110, 45, 45, 47, 66, 52, 51, 59, 53, 13, 9, 1, 48, -6, -90, 66, 52, 37, 41, 34, 40, 65, 36, 35, 36, 11, 42, 35, 32, 22, -33, 62, 37, 39, -31, -15, -57, -122, 44, -43, -80, -26, -26, 62, 72, -52, 33, 134, -11, -47, -11, 265, -31, -19, -38, -88, 7, -48, -70, -43, 24, 45, 50, -48, 29, 145, -29, -46, -15, 101, 92, 49, 35, 10, -20, 105, 14, -9, 28, 1, 104, 95, 6, 74, 47, 66, 23, 49, 49, 27, 105]
3550


In [14]:
from sklearn import svm

In [85]:
import random

_x = []
while len(_x) < 101:
    _x.append(v_neg[random.randint(0, len(v_neg))])
X = np.array(v + _x)

In [86]:
clf = svm.SVC(gamma='scale')
y = 100 * [1] + 100 * [0]
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [88]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [89]:
clf_predictions = clf.predict(X_test)
print("Accuracy: {}%".format(clf.score(X_test, y_test) * 100 ))

Accuracy: 47.5%
