In [1]:
import tkinter as tk
import requests
import matplotlib.pyplot as plt
from tkinter import messagebox
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
from ml_models import *

# Array shapes recorded from the training run. Only train_x_shape[1],
# train_x_shape[-1] and train_y_shape[-1] are read below (to size the
# network); test_x_shape is not referenced in this cell.
train_x_shape = (18286, 21, 20)
test_x_shape  = (2372, 21, 20)
train_y_shape = (18286, 2)

# Start from the project defaults (presumably provided by ml_models — confirm)
# and override the hyper-parameters used for this model.
params = default_params.copy()
params.update({
    'window_size'  : 10,
    'rnn_layers'   : 5,
    'rnn_neurons'  : 64,
    'dnn_layers'   : 3,
    'dnn_neurons'  : 64,
    'learning_rate': 0.001})

# NOTE(review): model_name is computed but not used below; model_folder is a
# hard-coded literal (the f-string has no placeholders) and embeds the shape
# (11528, 21, 20), which differs from train_x_shape — confirm this is the
# intended checkpoint.
model_name = name_model('LSTM_UP_KFOLD', params)
model_folder = f'./models/LSTM_UP_KFOLD_5_64_3_64_Adam_0.001_None_None_None_10_(11528, 21, 20)'
model_path = f'{model_folder}/1.h5'

# D:\OneDrive\Github\glycosylation\o-linked-site-prediction-feature-augment\models\

# Build the network and load the saved weights from model_path.
model = LSTM_CLS(train_x_shape[1], train_x_shape[-1], train_y_shape[-1], params)
model.load_weights(model_path)

In [7]:
from functions import *

# One-hot vocabulary, keyed by column name; each value is the ordered list
# of classes for that column.
for_onehot = {
    # input variable: the 20 amino-acid one-letter codes
    'residue': list('ARNDCEQGHILKMFPSTWYV'),
    # output variable: binary label
    'positivity': [0, 1],
}

# Input columns: continuous, categorical, and both combined.
x_cts, x_cat = [], ['residue']
x_var = [*x_cts, *x_cat]

# Output columns: continuous, categorical, and both combined.
y_cts, y_cat = [], ['positivity']
y_var = [*y_cts, *y_cat]

In [10]:
# Main window configuration
win = tk.Tk()
win.geometry('1350x800')
win.title('O-GlcNAcylation predictor (MIT)')
# Option-database entry used as the default font for the widgets created below.
win.option_add('*Font', 'arial 24')

# Base width unit; widget widths below are expressed as multiples of it.
x_unit = 5
# Shared GUI state, written by the button callbacks and read when drawing.
PROTEIN_ID = tk.StringVar()
SEQUENCE_LEN = tk.IntVar() # holds an integer value of protein sequence length
SEQUENCE     = tk.StringVar()

# functions
def get_sequence(protein_id): # return protein sequence from UniProt database using input ID
    """Download the FASTA record for ``protein_id`` from UniProt and return its sequence.

    Parameters
    ----------
    protein_id : str
        UniProt accession typed by the user (e.g. 'O88935'). Surrounding
        whitespace is ignored.

    Returns
    -------
    str
        All sequence lines of the record joined into one string, or '' when
        the ID is blank, the request fails, the server answers with an error
        status, or the body is not a FASTA record.
    """
    protein_id = protein_id.strip()
    if protein_id == '':
        return ''

    url = f"https://rest.uniprot.org/uniprotkb/{protein_id}.fasta"
    try:
        # The timeout keeps the Tk main loop from blocking forever on a dead connection.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return ''
    if not response.ok:
        return ''

    lines = response.text.splitlines()
    # A FASTA record starts with a '>' header line; any other body
    # (e.g. an "Error messages ..." page) is treated as "not found".
    if not lines or not lines[0].startswith('>'):
        return ''
    # Join every line after the header, so the last line is kept even when
    # the body has no trailing newline.
    return ''.join(line.strip() for line in lines[1:])
    
def get_sites(protein_id): # return the O-GlcNAc sites listed for the protein in the O-GlcNAc database
    """Scrape the O-GlcNAc database search page for the sites of ``protein_id``.

    Parameters
    ----------
    protein_id : str
        Protein identifier used as the ``query_protein`` search parameter.

    Returns
    -------
    list of str
        The comma-separated entries found after the first '<br>' on the line
        containing 'GlcNAc Sites' (e.g. ['S10', 'T25']). An empty list is
        returned when the ID is empty, the request fails, the page does not
        have the expected layout, or no site is listed.
    """
    if not protein_id:
        return []

    url = f"https://www.oglcnac.mcw.edu/search/?query_protein={protein_id}"
    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept from the original — confirm it is still required for this host.
        req = requests.get(url, verify=False, timeout=30).text.split('\n')
    except requests.RequestException:
        return []

    # Result pages are recognised by a marker on the fifth line.
    if len(req) < 5 or req[4] != '<!-- BEGIN -->':
        return []

    sites_info = next((x for x in req if 'GlcNAc Sites' in x), None)
    if sites_info is None or '<br>' not in sites_info:
        return []

    info_idx = sites_info.index('<br>')
    sites = [s.strip() for s in sites_info[info_idx + 4 :].split(', ')]
    # str.split never returns [], so drop the empty entries explicitly.
    return [s for s in sites if s]
        
def display_sequence(sequence):
    """Format ``sequence`` as multi-line text for the sequence label.

    The text starts with '<length> AAs' and a blank line. Sequences longer
    than 40 residues show the first two and last two 10-residue rows around
    a '...' line; shorter ones show four rows of up to 10 residues, each
    terminated by a newline (rows may be empty).
    """
    header = f'{len(sequence)} AAs\n\n'
    if len(sequence) <= 40:
        rows = (sequence[start:start + 10] for start in range(0, 40, 10))
        return header + ''.join(f'{row}\n' for row in rows)

    first_rows = '\n'.join((sequence[:10], sequence[10:20]))
    last_rows = '\n'.join((sequence[-20:-10], sequence[-10:]))
    return f'{header}{first_rows}\n...\n\n{last_rows}'
    
def print_sequence(sequence):
    """Return ``sequence`` shortened for single-line display.

    Sequences of up to 30 characters are returned unchanged; longer ones are
    reduced to their first 15 and last 15 characters joined by '...'.
    """
    fits_on_one_line = len(sequence) <= 30
    return sequence if fits_on_one_line else f'{sequence[:15]}...{sequence[-15:]}'

def set_to_default():
    """Reset the GUI to its initial state.

    Restores the placeholder texts of the ID and sequence labels, clears all
    four result canvases (predictions and database sites) and empties the
    shared state variables.
    """
    lab_idd.config(text = default_idd)
    lab_sequence.config(text = defalut_seq)
    # Clear the database rows as well, so no stale markers remain after an error.
    for canvas in (can_prob, can_prob_t, can_data, can_data_t):
        canvas.delete('all')
    SEQUENCE.set('')
    # SEQUENCE_LEN is an IntVar: storing '' would make a later .get() raise.
    SEQUENCE_LEN.set(0)
    PROTEIN_ID.set('')


### main body ###
## Menu configuration ##
# NOTE(review): a bare integer is passed as the `font` option of the menus;
# Tk may read it as a font family name rather than a point size — confirm
# the menus render as intended.
menu_font = 10
menu = tk.Menu(win, font = menu_font)
win.config(menu=menu)

# File menu
filemenu = tk.Menu(menu, font = menu_font)
menu.add_cascade(label = 'File', menu=filemenu)
def newfile():
    # Placeholder handler: only prints to stdout.
    print('new file!')
filemenu.add_command(label = 'New', command = newfile)
def openfile():
    # Placeholder handler: only prints to stdout.
    print('open file!')
filemenu.add_command(label = 'Open', command= openfile)

# Help menu (no entries are added in this section)
helpmenu = tk.Menu(menu, font = menu_font)
menu.add_cascade(label = 'Help', menu=helpmenu)



## row 0 ##
row = 0
y_unit = 3
# Label for protein ID
lab = tk.Label(win, text = 'Protein ID', bg='#3B3838', fg='white', width = 2*x_unit, height = y_unit)
lab.grid(row = row, column = 0, padx = 5, pady = 5)

# Entry for input protein ID
# default_id is pre-filled in gray and acts as placeholder text.
ent_id = tk.Entry(win, width = 3*x_unit)
default_id = 'O88935'
ent_id.config(justify='center')
ent_id.insert(0, default_id)
ent_id.config(fg='gray')

def clear_ent_id(event):
    """On click: if the entry still holds the placeholder ID, empty it and switch to black text."""
    if ent_id.get() == default_id:
        ent_id.delete(0, len(ent_id.get()))
        ent_id.config(fg='black')

ent_id.bind("<Button-1>", clear_ent_id)

def refill_ent_id(event):
    """On focus loss: if the entry is empty, restore the gray placeholder ID."""
    if ent_id.get() == '':
        ent_id.insert(0, default_id)
        ent_id.config(fg='gray')

ent_id.bind("<FocusOut>", refill_ent_id)


ent_id.grid(row = row, column = 1, padx = 5, pady = 5)

# Button for get protein sequence from ID
btn_seq = tk.Button(win, text='get sequence \nfrom UniProt', bg='#F2F2F2', width = 4*x_unit, height = y_unit)

def err_id(sequence):
    """Show an error dialog for a failed UniProt lookup and restore the sequence label text.

    Despite its name, `sequence` receives the protein ID from the only
    caller in this file (cmd_btn_id).
    """
    messagebox.showerror("Error", f'"{sequence}" does not exist in UniProt database')
    lab_sequence.config(text = defalut_seq)

def cmd_btn_id():
    """Callback of the 'get sequence from UniProt' button.

    Reads the protein ID from the entry, fetches its sequence with
    get_sequence and, on success, stores ID/sequence/length in the shared
    variables, shows them in the labels, clears the four result canvases and
    draws the database sites via cmd_can_base. If no sequence is returned,
    an error dialog is shown and the GUI is reset.
    """
    # Pasted IDs often carry stray whitespace or a trailing newline.
    protein_id = ent_id.get().strip()
    sequence = get_sequence(protein_id)

    if sequence == '':
        err_id(protein_id)
        set_to_default()
        return

    PROTEIN_ID.set(protein_id)
    SEQUENCE.set(sequence)
    SEQUENCE_LEN.set(len(sequence))
    # Use the shared formatter: the inline slicing it replaces repeated
    # residues for sequences of 40 residues or fewer.
    lab_sequence.config(text = display_sequence(sequence))
    lab_idd.config(text = protein_id)
    for canvas in (can_prob, can_prob_t, can_data, can_data_t):
        canvas.delete('all')
    cmd_can_base()
    
# Wire the UniProt button to its callback and place it in the current row.
btn_seq.config(command=cmd_btn_id)

btn_seq.grid(row = row, column = 2, padx = 5, pady = 5)

# Label for display target sequence
# It initially shows the usage instructions and spans grid rows 0-1.
lab_sequence = tk.Label(win)
defalut_seq = 'Step 1.\nEnter protein ID or\ncustom protein sequence \n\nStep 2.\nClick "Calculate" button\nto see the result'
lab_sequence.config(text = defalut_seq, bg='#FFF2CC',
                    width = 4*x_unit, height = 12)
lab_sequence.grid(row = 0, column = 3, rowspan=2, columnspan=2, padx = 1, pady = 1)



## row 1 ##
row = 1
y_unit = 8
# Label for custom sequence
lab_cus = tk.Label(win, text = 'Custom\nsequence', bg='#3B3838', fg='white', width = 2*x_unit, height = y_unit)
lab_cus.grid(row = row, column = 0, padx = 5, pady = 5)

# Entry for input custom sequence
# default_text is pre-filled in gray and acts as placeholder text.
text_cus = tk.Text(win)
default_text = 'Enter a sequence\n\ne.g.\nMTLPHSPGSAGEPQASQTVQ...'
text_cus.config(width = 15, height=y_unit)
text_cus.insert('1.0', default_text)
text_cus.config(fg='gray')

def clear_text_seq(event):
    """On click: if the text box still holds the placeholder, empty it and switch to black text."""
    if text_cus.get('1.0', 'end-1c') == default_text:
        text_cus.delete('1.0', tk.END)
        text_cus.config(fg='black')

text_cus.bind("<Button-1>", clear_text_seq)

def refill_text_seq(event):
    """On focus loss: if the text box content is just a newline, restore the gray placeholder."""
    # 'end' (unlike 'end-1c' above) includes the final newline, hence the '\n' comparison.
    if text_cus.get('1.0', 'end') == '\n':
        text_cus.insert(1.0, default_text)
        text_cus.config(fg='gray')

text_cus.bind("<FocusOut>", refill_text_seq)

text_cus.grid(row = row, column = 1, padx = 5, pady = 5)

# Button for get the custom sequence
btn_cus = tk.Button(win, text='Get custom sequence', bg='#F2F2F2', width = 4*x_unit, height = y_unit)

def err_cus():
    """Show the error dialog for an invalid custom sequence."""
    messagebox.showerror("Error", f'Please enter valid custom sequence')
    
def cmd_btn_cus():
    """Callback of the 'Get custom sequence' button.

    Reads the text box, removes all whitespace, upper-cases the result and
    accepts it only if it is non-empty and made exclusively of the residue
    letters listed in for_onehot['residue']. An accepted sequence is stored
    in the shared variables (with an empty protein ID), shown in the labels,
    and the four result canvases are cleared. Otherwise an error dialog is
    shown and the GUI is reset.
    """
    # Dropping whitespace lets multi-line (FASTA-style) pastes through.
    sequence = ''.join(text_cus.get('1.0', 'end-1c').split()).upper()
    valid_residues = set(for_onehot['residue'])

    # Accept only the known residue letters instead of rejecting a few
    # letters: digits, punctuation and an empty input must not pass.
    if sequence == '' or not set(sequence) <= valid_residues:
        err_cus()
        set_to_default()
        return

    PROTEIN_ID.set('')
    SEQUENCE.set(sequence)
    SEQUENCE_LEN.set(len(sequence))

    lab_sequence.config(text = display_sequence(sequence))
    lab_idd.config(text = print_sequence(sequence))
    for canvas in (can_prob, can_prob_t, can_data, can_data_t):
        canvas.delete('all')
    # No database lookup here: a custom sequence has no protein ID, so the
    # query would always be sent with an empty identifier.
        
# Wire the custom-sequence button to its callback and place it in the current row.
btn_cus.config(command=cmd_btn_cus)

btn_cus.grid(row = row, column = 2, padx = 5, pady = 5)



## row 2 ##
row = 2
y_unit = 1

# Button for calculation
btn_cal = tk.Button(win, text = 'Calculate', bg='#F2F2F2', width = 2*x_unit, height=y_unit)

def err_cal():
    """Show the error dialog used when 'Calculate' is clicked without a loaded sequence."""
    messagebox.showerror("Error", f'No protein information')
    
def cmd_btn_cal():
    """Callback of the 'Calculate' button: predict and draw S/T site probabilities.

    For every serine/threonine in the stored sequence a window is built with
    get_window from the one-hot encoded sequence and passed to the model.
    Column 1 of the model output (presumably the positive class — confirm)
    is rounded to two decimals. Sites above 0.8 are drawn in blue and sites
    above 0.5 in green on the prediction canvas, with a '<residue><position>'
    label (1-based) and the probability on the canvas below it.

    Shows an error dialog when no sequence is loaded and an info dialog when
    the sequence has no S/T residue.
    """
    sequence = SEQUENCE.get()
    if sequence == '':
        err_cal()
        return

    # Start from empty canvases so repeated clicks do not stack markers.
    can_prob.delete('all')
    can_prob_t.delete('all')

    # (0-based index, residue) of every candidate site.
    st_sites = [(idx, residue) for idx, residue in enumerate(sequence) if residue in ('S', 'T')]
    if not st_sites:
        # Without this guard an empty array would be handed to model.predict.
        messagebox.showinfo("Info", 'The sequence contains no serine or threonine residues')
        return

    seq_df = pd.DataFrame(list(sequence), columns=['residue'])
    seq_onehot = get_onehots(seq_df, columns = x_cat)
    data_x = np.array([np.array(get_window(seq_onehot, idx, params['window_size']))
                       for idx, _ in st_sites])
    probabilities = model.predict(data_x, verbose=0)[:,1].round(2)

    for (idx, residue), prob in zip(st_sites, probabilities):
        if prob > 0.8:
            colour = 'Blue'
        elif prob > 0.5:
            colour = 'Green'
        else:
            continue
        # Horizontal position is proportional to the 0-based index in the sequence.
        r_location = idx / len(sequence)
        site = f'{residue}{idx + 1}'
        can_prob.create_line(can_width*r_location, 0, can_width*r_location, can_height, fill=colour, width=3)
        can_prob_t.create_text(can_width*r_location, can_height/2, text = f'{site}\n({prob:.2f})')
                
    
# Wire the Calculate button to its callback; it spans two grid rows.
btn_cal.config(command = cmd_btn_cal)

btn_cal.grid(row = row, column = 0, rowspan = 2, padx = 5, pady = 5)

# label to display protein ID or sequence
# default_idd is the placeholder shown until a protein or sequence is loaded.
default_idd = 'protein ID or custom sequence will be determined'
lab_idd = tk.Label(win, text = default_idd, justify='left', width = 10*x_unit, height=1, bg = None)
lab_idd.grid(row = row, column = 1, columnspan=3, padx = 5, pady = 0)

# label to show high probability
# Legend for the marker colours: the same thresholds and colours are used in cmd_btn_cal.
lab_hp = tk.Label(win, text = '> 0.8', bg='blue', fg='white')
lab_hp.grid(row = 2, column = 4, padx = 0)
lab_mp = tk.Label(win, text = '> 0.5', bg='green', fg='white')
lab_mp.grid(row = 3, column = 4, padx = 0)



## row 3 ##
row = 3
y_unit = 1

# image to display protein sequence with glycoslylation probability
can_width, can_height = 190*x_unit, 40

can_prob = tk.Canvas(win, bg='white')
can_prob.config(width = can_width, height = can_height)
can_prob.grid(row = row, column = 1, columnspan=3, rowspan=1, padx = 5, pady = 0)



## row 4 ##
row = 4
can_prob_t = tk.Canvas(win, bg = None)
can_prob_t.config(width = can_width, height = can_height)
can_prob_t.grid(row = row, column = 1, columnspan=3, rowspan=1, padx = 5, pady = 0)



## row 5 ##
row = 5
y_unit = 1
# Label for connecting O-GlcNAcome database 
lab_glc = tk.Label(win, text = 'GlcNAcome\nDatabase', bg='#3B3838', fg='white', width = 2*x_unit, height = 3)
import webbrowser
def cmd_lab_glc(event):
    """Click handler of the 'GlcNAcome Database' label.

    Opens the O-GlcNAc database search page for the currently loaded protein
    ID in a new browser tab. When no protein ID is loaded (nothing fetched
    yet, or a custom sequence is in use) an error dialog is shown instead.
    """
    # Query with the stored ID, not the display label: the label may hold
    # the placeholder sentence or a truncated custom sequence.
    protein_id = PROTEIN_ID.get()
    if protein_id == '':
        messagebox.showerror("Error", 'No protein ID to look up in the O-GlcNAc database')
        return
    webbrowser.open(f'https://www.oglcnac.mcw.edu/search/?query_protein={protein_id}', new = 2)
# Make the database label clickable and place it across two grid rows.
lab_glc.bind("<Button-1>", cmd_lab_glc)
lab_glc.grid(row = row, column = 0, rowspan = 2, padx = 5, pady = 5)

# Canvas for display O-GlcNacylated sites from the database
# Markers are drawn on it by cmd_can_base.
can_data = tk.Canvas(win, bg='white')
can_data.config(width = can_width, height = can_height)
can_data.grid(row = row, column = 1, columnspan=3, rowspan=1, padx = 5, pady = 0)



## row 6 ##
row = 6
# Text labels for the database markers are drawn on this canvas by cmd_can_base.
can_data_t = tk.Canvas(win, bg = None)

def cmd_can_base():
    """Draw the database O-GlcNAc sites of the current protein on the database canvases.

    Sites are obtained from get_sites for the stored protein ID. Each entry
    is expected to be one residue letter followed by its position (e.g.
    'S10'); entries whose position is not a number or lies outside the
    sequence are skipped. Nothing is drawn when no protein ID is stored or
    the stored sequence length is not positive.
    """
    protein_id = PROTEIN_ID.get()
    if protein_id == '':
        return
    seq_len = SEQUENCE_LEN.get()
    if seq_len <= 0:
        return

    for s in get_sites(protein_id):
        location = s[1:]
        if not location.isdigit():
            continue
        position = int(location)
        if not 1 <= position <= seq_len:
            continue
        # Site numbers are taken as 1-based (assumed database convention —
        # confirm), while cmd_btn_cal places markers by 0-based index;
        # subtracting 1 lines the two rows up.
        r_location = (position - 1) / seq_len
        can_data.create_line(can_width*r_location, 0, can_width*r_location, can_height, fill='Blue', width=3)
        can_data_t.create_text(can_width*r_location, can_height/2, text = f'{s}')
                
# Size and place the database text canvas (current row, same columns as the other canvases).
can_data_t.config(width = can_width, height = can_height)
can_data_t.grid(row = row, column = 1, columnspan=3, rowspan=1, padx = 5, pady = 0)


# open the window
# mainloop() blocks this cell until the window is closed.
win.mainloop()

