In [1]:
# Dataset: http://plmd.biocuckoo.org/download.php (Glycation)
# Generates list of patterns that cause (1) and don't cause (0) Glycation
# 2019-07
#
# Done by Team LegIt
# Rifaz Nahiyan, Sourav Saha
# Rafid Uddin Bhuiyan, Nadira Anjum
#
# Find the whole repo at: https://github.com/rifazn/bioinformatics-uni

import pandas as pd
import numpy as np
df = pd.read_csv('assets/Glycation.elm', sep='\t')

In [2]:
df.head()

Unnamed: 0,PLMD ID,Uniprot Accession,Position,Type,Sequence,Species,PMIDs
0,PLMD-5,O00141,41,Glycation,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,Homo sapiens,21612289
1,PLMD-17,O00186,19,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
2,PLMD-17,O00186,577,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
3,PLMD-27,O00220,370,Glycation,MAPPPARVHLGAFLAVTPNPGSAASGTEAAAATPSKVWGSSAGRIE...,Homo sapiens,21612289
4,PLMD-34,O00238,202,Glycation,MLLRSAGKLNVGTKKEDGESTAPTPRPKVLRCKCHHHCPEDSVNNI...,Homo sapiens,21612289


In [3]:
data= np.array(df)
data = data[:100]

In [4]:
def get_pattern(prot, center, radius):
    if center < radius:
        pattern = (radius - center) * 'X' + prot[0:center + radius + 1]
    elif len(prot)- center <= radius:
        pattern = prot[center - radius : len(prot)] + (radius - (len(prot)-center-1)) * 'X'
    else:
        pattern = prot[center-radius:center+radius+1]
    return pattern

In [5]:
processed_ones = set()
radius=7

for row in data:
    center = row[2] - 1
    pattern = get_pattern(row[4], center, radius)
    #processed_ones.append([row[0], pattern, 1])
    processed_ones.add(pattern)
print(list(processed_ones)[:5])

['IAHSLLYKKVNEAQY', 'QTRELKLKYLIIENF', 'VAQELEEKLNILNNN', 'GTRRMAKKNAIVRSL', 'ETHFQGDKESGGTGE']


In [6]:
processed_zeros = set()

for row in data:
    pos = -1
    
    while True:
        pos = row[4].find('K', pos + 1)
        if pos == -1:
            break
        pattern = get_pattern(row[4], pos, radius)
        if pattern not in processed_ones:
            #processed_zeros.append([row[0], pattern, 0])
            processed_zeros.add(pattern)
print(list(processed_zeros))

['YECDKCRKSFTSKRN', 'RGNAERVKEGRLSFY', 'PATKMNNKADGTPKT', 'QRIHSGEKTYECHVC', 'TQWLMVLKISLPVIG', 'DMYDQVLKFGAYIVD', 'PHTTRPPKKDEENGK', 'RDKYITAKADFRTLL', 'ALYAVIEKAKKARST', 'KKKKESSKELESPLT', 'FSSRPALKRYERLSY', 'DIRILAIKSTTLRVD', 'FSQGDLEKAMGNRPM', 'KEISSTKKATEGKTS', 'VLVSEENKDAIITAK', 'YYYQLNSKKHHPYQP', 'PVPGPQGKEGKSKSK', 'PEGTVEIKFRKKDLI', 'FRCSECGKAFRLRKQ', 'TQLIGNLKGNYQNLN', 'GGGKGIRKAESAEDF', 'NQQMVPIKEMTDVLK', 'SSEGIRKKLVEAEEL', 'GHSYSRAKVKFNVNR', 'GLTPDQVKRNLEKYG', 'HFLKVIGKGSFGKVL', 'QKIHTDEKPCECDVS', 'SRWTPFIKDIMEDAI', 'SAVQAARKLLSSDRN', 'QRVQNEVKIHCQLKH', 'LIFDKPEKNKQWGKD', 'IKDEGKDKALKSSQA', 'SIQRRHQKIVEEAPA', 'VEAVLSQKEVELKAS', 'ASQCPTEKSEVTPFP', 'VVTESTGKEREHNFQ', 'MEYYAAIKNDEFISF', 'VDPIQSVKGKVVIDA', 'IKLQSFVKEQLEECR', 'QKTFSICKERMRPVK', 'IVDDKFFKLSEMEAY', 'FMLKTLNKLGIDGMY', 'DSHKGTSKRLQGSVP', 'LQKKKDIKFKKLSLI', 'KFITRYTKKTSFLTE', 'INEDEPVKAKKRKRD', 'LSQEQKTKHRIFSLI', 'TTPFKDCKVLKQRPR', 'LEVHLKQKEQHYEEK', 'GTPKTESKSKKSSSS', 'GSYLKSTKLWIIMEY', 'WLAHSWCKNNSVILA', 'EIKEDTNKWK

In [7]:
processed_data = [[pattern, '1'] for pattern in processed_ones]
processed_data += [[pattern, '0'] for pattern in processed_zeros]

print(f"Number of 1s: {len(processed_ones)}")
print(f"Number of 0s: {len(processed_zeros)}")

Number of 1s: 100
Number of 0s: 3588


In [8]:
from pprint import pprint
processed_data = np.array(processed_data)
pprint(processed_data)

array([['IAHSLLYKKVNEAQY', '1'],
       ['QTRELKLKYLIIENF', '1'],
       ['VAQELEEKLNILNNN', '1'],
       ...,
       ['KDVYERLKDKWDELK', '0'],
       ['EVKRKKHKDEDWQMS', '0'],
       ['LVVLILVKYKGLKRV', '0']], dtype='<U15')


In [9]:
outdf = pd.DataFrame({"Pattern": processed_data[:, 0], "Glycation": processed_data[:,1]})

In [10]:
outdf

Unnamed: 0,Pattern,Glycation
0,IAHSLLYKKVNEAQY,1
1,QTRELKLKYLIIENF,1
2,VAQELEEKLNILNNN,1
3,GTRRMAKKNAIVRSL,1
4,ETHFQGDKESGGTGE,1
5,LEKELLEKKPWQLQG,1
6,AKRLQEEKEKVDKQY,1
7,AKQIQMVKQIGKGRY,1
8,EQIQILAKYSAADCP,1
9,PARSATPKVRLVEPH,1


In [11]:
with open('assets/ones.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_ones))

In [12]:
with open('assets/zeros.csv', 'w') as f:
    f.writelines(f">{idx}\n" + line + '\n' for idx, line in enumerate(processed_zeros))

In [97]:
def get_matrix_from_pssm_files(PSSM_DIR: 'directory for the pssm files'):
    
    def lower_triangle(matrix):
        return np.triu(matrix)
    total_files = len([name for name in os.listdir(PSSM_DIR) if name.startswith("PSSM")])
    for i in range(1, 1+1):
        matrix = []
        with open(PSSM_DIR + f'/PSSM.{i}') as f:
            lines = f.readlines()
            for i in range(3, 3+15):
                line = lines[i][10:71]
                l = [line[i:i+3] for i in range(0, len(line) - 1, 3)]
                l = list(map(int, l))
                matrix.append(l)
            print(matrix)
            pass
        

get_matrix_from_pssm_files('assets/outs/positives')

[[0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -3, -3, -3], [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0], [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3], [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3], [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1], [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0], [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2], [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2], [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3], [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0], [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3], [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4], [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3], [1, -1, 1, 

In [19]:
import os

print(len([name for name in os.listdir('assets/outs/positives/') if name.startswith("PSSM")]))

100


In [89]:
print(["""En empty matrix"""]+ ["et love rule"], sep='asdasdxcz')

['En empty matrix', 'et love rule']


In [87]:
?print

[1;31mDocstring:[0m
print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)

Prints the values to a stream, or to sys.stdout by default.
Optional keyword arguments:
file:  a file-like object (stream); defaults to the current sys.stdout.
sep:   string inserted between values, default a space.
end:   string appended after the last value, default a newline.
flush: whether to forcibly flush the stream.
[1;31mType:[0m      builtin_function_or_method
