In [89]:
# Dataset: http://plmd.biocuckoo.org/download.php (Glycation)
# Generates list of patterns that cause (1) and don't cause (0) Glycation
# 2019-07
#
# Done by Team LegIt
# Rifaz Nahiyan, Sourav Saha
# Rafid Uddin Bhuiyan, Nadira Anjum
#
# Find the whole repo at: https://github.com/rifazn/bioinformatics-uni

import pandas as pd
import numpy as np
df = pd.read_csv('assets/Glycation.elm', sep='\t')

In [3]:
df.head()

Unnamed: 0,PLMD ID,Uniprot Accession,Position,Type,Sequence,Species,PMIDs
0,PLMD-5,O00141,41,Glycation,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,Homo sapiens,21612289
1,PLMD-17,O00186,19,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
2,PLMD-17,O00186,577,Glycation,MAPPVAERGLKSVVWQKIKATVFDDCKKEGEWKIMLLDEFTTKLLA...,Homo sapiens,21612289
3,PLMD-27,O00220,370,Glycation,MAPPPARVHLGAFLAVTPNPGSAASGTEAAAATPSKVWGSSAGRIE...,Homo sapiens,21612289
4,PLMD-34,O00238,202,Glycation,MLLRSAGKLNVGTKKEDGESTAPTPRPKVLRCKCHHHCPEDSVNNI...,Homo sapiens,21612289


In [4]:
data= np.array(df)
data = data[:100]

In [70]:
def get_pattern(prot, center, radius):
    if center < radius:
        pattern = (radius - center) * 'X' + prot[0:center + radius + 1]
    elif len(prot)- center <= radius:
        pattern = prot[center - radius : len(prot)] + (radius - (len(prot)-center-1)) * 'X'
    else:
        pattern = prot[center-radius:center+radius+1]
    return pattern

In [72]:
processed_ones = set()
radius=7

for row in data:
    center = row[2] - 1
    pattern = get_pattern(row[4], center, radius)
    #processed_ones.append([row[0], pattern, 1])
    processed_ones.add(pattern)
print(list(processed_ones)[:5])

['AFQGAVQKELQHIVG', 'GQHSKPFKEFVEACL', 'IVSCPYVKTVATTKT', 'HSPSPSSKFSTKGLC', 'TLMLFFDKFANIVPF']


In [73]:
processed_zeros = set()

for row in data:
    pos = -1
    
    while True:
        pos = row[4].find('K', pos + 1)
        if pos == -1:
            break
        pattern = get_pattern(row[4], pos, radius)
        if pattern not in processed_ones:
            #processed_zeros.append([row[0], pattern, 0])
            processed_zeros.add(pattern)
print(list(processed_zeros))

['DLSSFVTKNVKEIEH', 'IVPVTKLKSKAPHWT', 'YFANVEEKDSRNDYC', 'VFVASFSKSMVLDAY', 'LWRRNDMKKKAKLFX', 'YQCSLCPKEFDSLPA', 'QLTDTQIKRNTFVGT', 'HIPYRDSKLTRLLQD', 'CPDNLFNKIKASCSK', 'ELETLAEKLNERKRD', 'IISIDAEKAFDKIQQ', 'KNAWTDTKVKKNSDA', 'PSEEGEVKDDGLEKS', 'ESEVNSLKEEIKMYM', 'LLKSVFVKNVGWATQ', 'TYNEHITKRVASSPA', 'LPKGRETKENYGKTL', 'RNYKALAKGTRGSTS', 'IFSSYKEKVLPWFEQ', 'KRMHSREKPYKCTEC', 'ELKLALEKEKARCAE', 'KALLISLKYGCFMWQ', 'ADFRTLLKETKFITY', 'KEISSTKKATEGKTS', 'INGPWFSKFDEDLAT', 'YMTALHSKPEIIQQE', 'RTYSSSLKRSSPRTI', 'TELIRQEKLEQLAAR', 'LQLKPYHKPLQHVRD', 'EYSDSDEKPLKGSLR', 'RSTKKKSKKADCPIA', 'MDLDEQLKILNLRFL', 'YECDECGKCFILKKS', 'IDFFRNIKFNYCILD', 'ELDLAQEKTGWFALG', 'LKEEWKQKPSIPPTL', 'VSRIQSPKRFADFVE', 'VDPEEKKKYCLTCRV', 'WTKPDGVKVIQQSEL', 'GAVSAVMKTIRIFQK', 'GTISQQTKLIDFLQA', 'QMLVCYAKELKEGFV', 'GSSSEANKQRRVPEA', 'HDDVGWLKTVDQYFY', 'KSIVPVTKLKSKAPH', 'MNFQRFLKFPDDPKV', 'EQSDSECKNGIPRSF', 'FSNPMALKSHMRTHA', 'TKEVVAIKIIDLEEA', 'YTNPEQIKQWRKNLW', 'QQLGEVVKLHPHELN', 'TGHVIAVKQMRRSGN', 'LTGEYISKDG

In [77]:
processed_data = [[pattern, '1'] for pattern in processed_ones]
processed_data += [[pattern, '0'] for pattern in processed_zeros]

print(f"Number of 1s: {len(processed_ones)}")
print(f"Number of 0s: {len(processed_zeros)}")

Number of 1s: 100
Number of 0s: 3588


In [81]:
from pprint import pprint
processed_data = np.array(processed_data)
pprint(processed_data)

array([['AFQGAVQKELQHIVG', '1'],
       ['GQHSKPFKEFVEACL', '1'],
       ['IVSCPYVKTVATTKT', '1'],
       ...,
       ['VVAYCKTKKQRKQMH', '0'],
       ['VSGGCDKKAMVWDMR', '0'],
       ['FSFQTADKLYFVLDY', '0']], dtype='<U15')


In [83]:
outdf = pd.DataFrame({"Pattern": processed_data[:, 0], "Glycation": processed_data[:,1]})

In [84]:
outdf

Unnamed: 0,Pattern,Glycation
0,AFQGAVQKELQHIVG,1
1,GQHSKPFKEFVEACL,1
2,IVSCPYVKTVATTKT,1
3,HSPSPSSKFSTKGLC,1
4,TLMLFFDKFANIVPF,1
5,SKCDSHLKHSLKLSW,1
6,VYDKGCVKDVDEGLE,1
7,RISQGPYKGYIGVVK,1
8,KLKDIMQKESLYWLT,1
9,LVNAQQAKGSSVHVL,1
