In [1]:
# Dataset: http://plmd.biocuckoo.org/download.php (Glycation)
# Generates list of patterns that cause (1) and don't cause (0) Glycation
# 2019-07
#
# Done by Team LegIt
# Rifaz Nahiyan, Sourav Saha
# Rafid Uddin Bhuiyan, Nadira Anjum
#
# Find the whole repo at: https://github.com/rifazn/bioinformatics-uni

import pandas as pd
import numpy as np
import os

df = pd.read_csv('assets/Glycation.elm', sep='\t')

In [2]:
df.head()

Unnamed: 0,PLMD ID,Uniprot Accession,Position,Type,Sequence,Species,PMIDs
0,PLMD-5,O00141,41,Glycation,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,Homo sapiens,21612289
1,PLMD-17,O00186,19,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
2,PLMD-17,O00186,577,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
3,PLMD-27,O00220,370,Glycation,MAPPPARVHLGAFLAVTPNPGSAASGTEAAAATPSKVWGSSAGRIE...,Homo sapiens,21612289
4,PLMD-34,O00238,202,Glycation,MLLRSAGKLNVGTKKEDGESTAPTPRPKVLRCKCHHHCPEDSVNNI...,Homo sapiens,21612289


In [3]:
data= np.array(df)
data = data[:100]

In [4]:
def get_pattern(prot, center, radius):
    if center < radius:
        pattern = (radius - center) * 'X' + prot[0:center + radius + 1]
    elif len(prot)- center <= radius:
        pattern = prot[center - radius : len(prot)] + (radius - (len(prot)-center-1)) * 'X'
    else:
        pattern = prot[center-radius:center+radius+1]
    return pattern

In [5]:
processed_ones = set()
radius=7

for row in data:
    center = row[2] - 1
    pattern = get_pattern(row[4], center, radius)
    #processed_ones.append([row[0], pattern, 1])
    processed_ones.add(pattern)
print(list(processed_ones)[:5])

['SISFSKPKKKKSFSK', 'IAHSLLYKKVNEAQY', 'KWEGGVNKMFHGIQD', 'ETAEELKKVAQELEE', 'PKKKKKQKPQEVPQE']


In [6]:
processed_zeros = set()

for row in data:
    pos = -1
    
    while True:
        pos = row[4].find('K', pos + 1)
        if pos == -1:
            break
        pattern = get_pattern(row[4], pos, radius)
        if pattern not in processed_ones:
            #processed_zeros.append([row[0], pattern, 0])
            processed_zeros.add(pattern)
print(list(processed_zeros))

['ILKKKEEKHIMSERN', 'LKEHYEKKMRDLMAS', 'PGKPEMMKSPTNTTP', 'LLVTKWNKHEDVAQM', 'LRFCNPGKEYRPCDP', 'VVAPDNLKQVCSGEQ', 'SDVQTAIKSELLMII', 'QEELKWRKIQYMARG', 'HNDNIMLKTTGHMFH', 'SQKTFKNKETLIIEP', 'LERKCMEKEKCKKPS', 'KTEEEALKHFRVKFN', 'REVSDRFKLPPGEYI', 'ANNSYACKHPEVQSI', 'TTYEILLKDKTVLGS', 'LRKGLEKKGAVTGGE', 'KFSSSDRKKQREFEE', 'NGIGGKMKIVKNKNK', 'CLPVMVLKAKKPFTF', 'DVGDFVLKPCGLEEF', 'TSTLAFQKEQKLKCE', 'RDATRGWKEGRGPLD', 'PKTESKSKKSSSSTT', 'ELKLALEKEKARCAE', 'MDQPAKKKKGLFSRR', 'DPKEEPIKEIKEEPK', 'KDKNWHDKGQQYRNW', 'NAFNLAEKELGLTKL', 'ENECRLLKKVKSEKM', 'NFIPPEEKNKIMNRL', 'QVIEKTGKSYTLKSE', 'DSQDKCHKMEQEMTR', 'XXMESKYKEILLLTG', 'SDGRERSKKSSVSDA', 'FAIKSLRKLTDDELF', 'LSNQGTIKLCDFGSA', 'SALRNDPKPLPQQPP', 'GLLLILGKLILLHHK', 'KKARSTKKKSKKADC', 'RNGRGRGKRMRPNSN', 'PRGMVKVKDCTPWSD', 'TVKNTLNKVVLADYE', 'LAPSFPDKQRWVTAL', 'SELANTAKADVPYIL', 'QEVEVKTKKLKKLYA', 'LSEQGDVKLADFGVA', 'ASWPPQAKPPPKACT', 'QKLQEIMKQTGYLTI', 'GQVVEDKKSIQLKDL', 'LGQFPDIKSRIAKRG', 'RGRALPGKNLPSLAK', 'AEKVQAEKPDIMLGV', 'RLKAQLEKRG

In [7]:
processed_data = [[pattern, '1'] for pattern in processed_ones]
processed_data += [[pattern, '0'] for pattern in processed_zeros]

print(f"Number of 1s: {len(processed_ones)}")
print(f"Number of 0s: {len(processed_zeros)}")

Number of 1s: 100
Number of 0s: 3588


In [8]:
from pprint import pprint
processed_data = np.array(processed_data)
pprint(processed_data)

array([['SISFSKPKKKKSFSK', '1'],
       ['IAHSLLYKKVNEAQY', '1'],
       ['KWEGGVNKMFHGIQD', '1'],
       ...,
       ['QYMARGEKSLAYHEW', '0'],
       ['AGSTSIPKRKKSTPK', '0'],
       ['PPVTSPVKAPTPSGQ', '0']], dtype='<U15')


In [9]:
outdf = pd.DataFrame({"Pattern": processed_data[:, 0], "Glycation": processed_data[:,1]})

In [10]:
outdf

Unnamed: 0,Pattern,Glycation
0,SISFSKPKKKKSFSK,1
1,IAHSLLYKKVNEAQY,1
2,KWEGGVNKMFHGIQD,1
3,ETAEELKKVAQELEE,1
4,PKKKKKQKPQEVPQE,1
5,SKCDSHLKHSLKLSW,1
6,KIPYSFFKTALDDRK,1
7,PSSKFSTKGLCKKKL,1
8,EKILPQLKCHFTWNL,1
9,VAQELEEKLNILNNN,1


In [11]:
with open('assets/ones.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_ones))

In [12]:
with open('assets/zeros.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_zeros))

In [50]:
def get_vectors_from_pssm_files(PSSM_DIR: 'directory for the pssm files'):
    vectors = []

    total_files = len([name for name in os.listdir(PSSM_DIR) if name.startswith("PSSM")])
    for i in range(1, total_files+1):
        if i == 10:
            continue
        matrix = []
        with open(PSSM_DIR + f'/PSSM.{i}') as f:
            lines = f.readlines()
            for i in range(3, 3+15):
                line = lines[i].split()
                l = line[2:2+20]
                l = list(map(int, l))
                matrix.append(l)
            matrix = np.matrix(matrix)
            t_matrix = matrix.transpose()
            matrix = t_matrix * matrix
            matrix = matrix[np.tril_indices(len(matrix))]
            vec = matrix.tolist()[0]
            vectors.append(vec)
    return vectors

v = get_vectors_from_pssm_files('assets/outs/positives')