## MLP Classifier (Leave-One-Out)

## Import Libraries

In [0]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import random

import pandas as pd
import io
import requests
import warnings

warnings.filterwarnings("ignore")    

## Define Helper Functions

In [0]:
#These are all helper functions originally designed by Dr.Frank, and modified for this project

# Global variable 
NUMBER_CHEMICAL_SHIFT_TYPE = 19

def get_cs_all(cs_all, id = "2KOC"):
  '''    
    This function gets chemical shifts for a particular RNA. 
    Assumes each RNA has a unique id  
  '''
  return(cs_all[(cs_all.id == id)])

def get_cs_residues(cs_i, resid, dummy = 0):
  '''    
    This function return an array containing the chemical shifts for a particular residues in an RNA.    
  '''
  cs_tmp = cs_i[(cs_i.resid == resid)].drop(['id', 'resid', 'resname', 'stacking'], axis=1)
  info_tmp = cs_i[(cs_i.resid == resid)]
  if (cs_tmp.shape[0] != 1):
     return(dummy*np.ones(shape=(1, NUMBER_CHEMICAL_SHIFT_TYPE)))
  else:
     return(cs_tmp.values)
    
def get_resnames(cs_i, resid, dummy = "UNK"):
  '''    
    This function returns the residue name for specified residue (resid)
  '''
  cs_tmp = cs_i[(cs_i.resid == resid)]  
  if (cs_tmp.shape[0] != 1):
     return(dummy)
  else:
     return(cs_tmp['resname'].values[0])

def get_cs_features(cs_i, resid, neighbors=1):
  '''    
  This function chemical shifts and resnames for residue (resid) and its neighbors        

  '''
  cs = []
  resnames = []
  for i in range(resid-neighbors, resid+neighbors+1):
    cs.append(get_cs_residues(cs_i, i))
    resnames.append(get_resnames(cs_i, i))
  return(resnames, np.array(cs))

def get_columns_names(neighbors = 3, chemical_shift_types = 19):
  '''
    
    Helper function that writes out the required column names
    
  '''

  columns = ['id', 'resname', 'resid', 'Stacking']
  for i in range(0, neighbors*chemical_shift_types):
    columns.append(i)
  return(columns)

def write_out_resname(neighbors=1):
  '''
  
    Helper function that writes out the column names associated resnames for a given residue and its neighbors
    
  '''  
  colnames = []
  for i in range(1-neighbors-1, neighbors+1):
    if i < 0: 
      colnames.append('R%s'%i)
    elif i > 0: 
      colnames.append('R+%s'%i)
    else: 
      colnames.append('R')
  return(colnames)    


def get_cs_features_rna(cs, neighbors=1, retain = ['id', 'stacking', 'resid']):
  '''    
    This function generates the complete required data frame an RNA    
  '''
  all_features = []
  all_resnames = []
  for resid in cs['resid'].unique():
    resnames, features = get_cs_features(cs, resid, neighbors)
    all_features.append(features.flatten())
    all_resnames.append(resnames)

  all_resnames = pd.DataFrame(all_resnames, dtype='object', columns = write_out_resname(neighbors))
  all_features = pd.DataFrame(all_features, dtype='object')
  info = pd.DataFrame(cs[retain].values, dtype='object', columns = retain)
  return(pd.concat([info, all_resnames, all_features], axis=1))

def get_cs_features_rna_all(cs, neighbors = 2):  
  '''    
    This [should] function generate a pandas dataframe containing training data for all RNAs
    Each row in the data frame should contain the class and chemical shifts for given residue and neighbors in a given RNA.
    Use the function above to write function
    
  '''  
  # Start: your code
  ids = cs['id'].unique()
  for i,id in enumerate(ids):
    if i == 0:
      cs_new = get_cs_features_rna(get_cs_all(cs, id), neighbors)
    else:
      cs_new = cs_new.append(get_cs_features_rna(get_cs_all(cs, id), neighbors), sort = False)        
  # End: your code
  return(cs_new)

def create_training_testing(cs, leave_out = "2KOC", target_name = 'stacking', neighbors = 2, drop_names = ['id', 'stacking', 'resid']):
  '''    
    This function creates a training and testing set using leave one out    
  '''
  
  # drop extraneous data  
  drop_names = drop_names + list(write_out_resname(neighbors))  
  
  # does not contain leave_out
  train = cs[(cs.id != leave_out)]
  trainX = train.drop(drop_names, axis=1)
  trainy = train[target_name]
 
  # only contains leave_out
  test = cs[(cs.id == leave_out)]
  testX = test.drop(drop_names, axis=1)
  testy = test[target_name]
  
  # return training and testing data
  return(trainX.values, trainy.values, testX.values, testy.values)

## Define unique_rnas list and create preliminary training data and prepare testing data

In [0]:
url="https://drive.google.com/uc?id=1e-SHtWDtg4mD_th3_4Jmq9r1iiQC32wT"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')), sep = ' ')
my_list = c['id'].tolist()
#Create list of the unique RNAS in the given dataset
unique_rnas = []
for values in my_list:
  if values not in unique_rnas:
    unique_rnas.append(values)

#stack (1) and non-stack (0)
#drop columns with extraneous information
c = c.drop(columns = ['Unnamed: 0','base_pairing','orientation','sugar_puckering','pseudoknot'])

c = c.replace('stack',1).replace('non-stack',0)

In [0]:
unique_rnas

['1A60',
 '1HWQ',
 '1JO7',
 '1KKA',
 '1L1W',
 '1LC6',
 '1LDZ',
 '1MFY',
 '1NA2',
 '1NC0',
 '1OW9',
 '1PJY',
 '1Q75',
 '1R7W',
 '1R7Z',
 '1SCL',
 '1SY4',
 '1SYZ',
 '1UUU',
 '1XHP',
 '1YMO',
 '1YSV',
 '1Z2J',
 '1Z30',
 '1ZC5',
 '28SP',
 '28SR',
 '2F87',
 '2FDT',
 '2GVO',
 '2JR4',
 '2JWV',
 '2JYM',
 '2K66',
 '2KEZ',
 '2KF0',
 '2KOC',
 '2KXM',
 '2KZL',
 '2L3E',
 '2L5Z',
 '2L6I',
 '2L8H',
 '2LAC',
 '2LBJ',
 '2LBK',
 '2LBL',
 '2LDL',
 '2LDT',
 '2LHP',
 '2LI4',
 '2LJJ',
 '2LK3',
 '2LP9',
 '2LPA',
 '2LPS',
 '2LQZ',
 '2LUB',
 '2LUN',
 '2LV0',
 '2M12',
 '2M21',
 '2M22',
 '2M4W',
 '2M5U',
 '2M8K',
 '2MEQ',
 '2MFD',
 '2MHI',
 '2MIS',
 '2MNC',
 '2MTJ',
 '2MXL',
 '2N2O',
 '2N2P',
 '2N3Q',
 '2N3R',
 '2N4L',
 '2N6S',
 '2N6T',
 '2N6W',
 '2N6X',
 '2NBY',
 '2NBZ',
 '2NC0',
 '2NC1',
 '2NCI',
 '2O33',
 '2QH2',
 '2QH3',
 '2QH4',
 '2RVO',
 '2Y95',
 '4A4S',
 '4A4T',
 '4A4U',
 '5A17',
 '5A18',
 '5IEM',
 '5KQE',
 '5UF3',
 '5UZT',
 '5V16',
 '5WQ1']

## Define ML Algorithm

In [0]:
def MLnetwork(rnaid, n, cs_all, shift):  
  # load initial data
  
  id = rnaid
  trainX, trainy, testX, testy = create_training_testing(cs_all, leave_out = id, neighbors = n)

  # setup scaler
  scaler = StandardScaler()
  scaler.fit(trainX)

  # transform input
  trainX_scaled = scaler.transform(trainX)
  testX_scaled = scaler.transform(testX)
  
  clf = MLPClassifier(hidden_layer_sizes=(50,100,50), max_iter=100)
  clf.fit(trainX_scaled, np.int_(trainy))
  y_true, y_pred = np.int_(testy) , clf.predict(testX_scaled)
  report = classification_report(y_true, y_pred, output_dict=True)

  #Putting Classification Report in pandas, and keeping f1 score. I am using f1 score to represent the accuracy of the model as it is more complete than just the accuracy
  
  df = pd.pandas.DataFrame(report).transpose()
  df = df.drop([df.index[0], df.index[1], df.index[2], df.index[3]])
  df = df.rename(index={"weighted avg": f"{rnaid} weighted average: {n} neighbors"})
  df = df.drop(columns=['precision', 'recall'])
  return df

## Test ML

In [0]:
NeighborNumberResults = pd.DataFrame(columns=['Number of Neighbors', 'Weighted Average F1 score'])

#Loop over neighbors=0
for k in [0,1,2,3,4,5]:
  #Put data in a dataframe that maps the f1-score and support for every L-O-O RNA for each neighbor
  df_ = pd.DataFrame(columns = ['f1-score','support'])
  df_ = df_.fillna(0)
  
  #Create list of all rnas, and loop ML over all the RNAS in 'unique_rnas' for a defined k by the main loop
  cs_all = get_cs_features_rna_all(c, k)
  for value in unique_rnas:
    df_ = df_.append(MLnetwork (value, k, cs_all, ''))
  weighted_average = 0
  
  #Extract weighted average for each RNA (weighted by the number of residues it has)
  #weighted_average = (df_['f1-score']*df_['support']).sum()/df_['support'].sum()
  weighted_average = (df_['f1-score']*df_['support']).sum()/df_['support'].sum()
  print(f"The weighted average for {k} neighbors with support factored in is {weighted_average}")
  
  #Put data in a new dataframe for every weighted average result and the number neighbor
  temptuple = [(k, weighted_average)]
  tempframe = pd.DataFrame(temptuple, columns=['Number of Neighbors', 'Weighted Average F1 score'])
  NeighborNumberResults = NeighborNumberResults.append(tempframe) 

df_

The weighted average for 0 neighbors with support factored in is 0.8326797696398074
The weighted average for 1 neighbors with support factored in is 0.8291432955404512
The weighted average for 2 neighbors with support factored in is 0.8306033272847091
The weighted average for 3 neighbors with support factored in is 0.8318402812925615
The weighted average for 4 neighbors with support factored in is 0.83662772702664
The weighted average for 5 neighbors with support factored in is 0.8272061998272054


Unnamed: 0,f1-score,support
1A60 weighted average: 5 neighbors,0.717647,44.0
1HWQ weighted average: 5 neighbors,0.770370,30.0
1JO7 weighted average: 5 neighbors,0.733943,31.0
1KKA weighted average: 5 neighbors,0.796964,17.0
1LC6 weighted average: 5 neighbors,0.833333,24.0
...,...,...
5KQE weighted average: 5 neighbors,0.834554,36.0
5UF3 weighted average: 5 neighbors,0.820831,23.0
5UZT weighted average: 5 neighbors,0.929711,31.0
5V16 weighted average: 5 neighbors,0.938561,41.0


In [0]:
#This data is from the most recent trial in the paper. This program was run 2 more times to get a triplicate mean and standard deviation

NeighborNumberResults

Unnamed: 0,Number of Neighbors,Weighted Average F1 score
0,0,0.83268
0,1,0.829143
0,2,0.830603
0,3,0.83184
0,4,0.836628
0,5,0.827206
