#Environment Set-Up


#### Prepare Google Drive Mount + Modules

In [None]:
# If you get "Transport not found" error: 
# STEP 1: Reset runtime (i.e. terminate session)
# STEP 2: Run the following code 
!fusermount -u drive
!google-drive-ocamlfuse drive

fusermount: failed to unmount /content/drive: No such file or directory
/bin/bash: google-drive-ocamlfuse: command not found


In [None]:
from google.colab import drive
import os
drive.mount('/content/drive', force_remount=True)
my_drive_path = '/content/drive/MyDrive/'

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing, datasets    
from sklearn import metrics
import math

Mounted at /content/drive


# Data Preparation 


#### Create Pfeature and 2-gram features

In [None]:
def read_data(file, testing_bool):
  # df = pd.read_csv(shared_folder_path + 'sequences_training.txt', header=None)
  df = pd.read_csv('/content/drive/MyDrive/rams/fall 2022/CMSC 435/CMSC 435 Project/' + file, header=None)
  if (testing_bool == 0):
    df.columns = ['sequence', 'label']
  elif (testing_bool == 1): 
    df.columns = ['sequence']
  return df

# FEATURE: Pfeature -----------------------------------------------------------
def prep_for_pfeature(df, testing_bool):
  # python setup.py install --user
  if (testing_bool == 0): 
    df = df.drop(columns=['label'])
  print("INPUT DF SIZE: ", df.shape)
  df.to_csv(r'protein2.seq', header=None, index=None, sep='\t', mode='a')

  # https://wiki.thegpm.org/wiki/Amino_acid_symbols
  fh = open('protein2.seq','r').read()
  fh = fh.replace('Z', 'E')  # Glutamic acid (E) or Glutamine (Q)
  fh = fh.replace('X', '')  # Unknown amino acid
  fh = fh.replace('U', '')  # Unusual translation
  # unique_chars = set(fh)
  # print("LEN", sorted(unique_chars))

  text_file = open("protein2.seq", "w")
  n = text_file.write(fh)
  text_file.close()

#HERE SHAHAD
def create_pfeature(read_file, input_file, output_file, testing_bool): 

  # Remove previous files
  import os
  if os.path.exists(input_file):
    os.remove(input_file)

  if os.path.exists(output_file):
    os.remove(output_file)

  df2 = read_data(read_file, testing_bool)
  %cd "/content/drive/MyDrive/rams/fall 2022/CMSC 435/CMSC 435 Project/pfeature_standalone"
  prep_for_pfeature(df2, testing_bool)

  import subprocess
  os.system("python pfeature_comp.py -i " + input_file + " -o " + output_file + " -j CTC")
  # !python pfeature_comp.py -i protein2.seq -o comp.txt -j CTC
  comp_df = pd.read_csv("comp.txt")
  print("COMP DF SIZE", comp_df.shape)

  # os.system("python pfeature_bin.py -i " + input_file + " -o " + output_file + " -j CTC")
  # bin_df = pd.read_csv("bin.txt")
  # print(bin_df)
  # df2 = pd.concat([df2, comp_df, bin_df], axis=1)
  df2 = pd.concat([df2, comp_df], axis=1)

  print("FINAL DF SIZE", df2.shape)

  return df2

# FEATURE: 2 gram -----------------------------------------------------------
def two_or_three_gram(df_row, num_of_gram):
  arr = np.array(['V', 'Y', 'T', 'G', 'D', 'C', 'K', 'S', 'H', 'E', 'F', 'Q', 'R', 'A', 'N', 'P', 'I', 'W', 'L', 'M'])
  # storing the pairs at the appropriate key, will take the length of the value (array) of each key to get the count
  if num_of_gram == 2:
    list_of_potential_pairs = []
    # print(len(arr), len(df_row))
    for i in range(len(arr)):
      for j in range(len(arr)):
        list_of_potential_pairs.append([arr[i], arr[j]])

    # storing the potential pairs as keys in a dict, storing the values as 0 for now
    dict_of_pairs = {}
    for pairs in list_of_potential_pairs:
      dict_of_pairs[f"{pairs}"] = []   

    for i in range(len(df_row)-1):
        pair = f"{[df_row[i], df_row[i+1]]}"
        if pair in dict_of_pairs:
          dict_of_pairs[pair].append(pair)
  
  if num_of_gram == 3: 
    list_of_potential_pairs = []
    for i in range(len(arr)):
      for j in range(len(arr)):
        for k in range(len(arr)):
          list_of_potential_pairs.append([arr[i], arr[j], arr[k]])

    # storing the potential pairs as keys in a dict, storing the values as 0 for now
    dict_of_pairs = {}
    for pairs in list_of_potential_pairs:
      dict_of_pairs[f"{pairs}"] = []   

    for i in range(len(df_row)-2):
      pair = f"{[df_row[i], df_row[i+1], df_row[i+2]]}"
      if pair in dict_of_pairs:
          dict_of_pairs[pair].append(pair)

  for key, value in dict_of_pairs.items():
    count = len(value)
    dict_of_pairs[key] = count

  return list(dict_of_pairs.values())

# FEATURE: Creates unique_val, unique_val_count, aa_count_freq -----------------------------------------------------------
def create_count_features(df):

  sequence_length = df['sequence'].str.len() # amino acid sequence length
  df = df.assign(length = sequence_length)

  arr = df['sequence'].to_list()
  unique_values = []
  count = []
  aa_count_freq = []
  all_aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

  for a in arr:

      # Get unique amino acids
      unique_val_set = list(set(a))
      unique_values.append(sorted(unique_val_set))
      
      # Count of unique amino acids in each sequence
      count.append(len(unique_val_set))

      aa_count_totals = []

      for aa in all_aa:
       aa_count_totals.append(round(a.count(aa)/len(a),2))
      aa_count_freq.append(aa_count_totals)

  df = df.assign(unique_val = unique_values)
  df = df.assign(unique_val_count = count)
  df = df.assign(aa_count_freq = aa_count_freq)

  return df

# FEATURE: Create encoder object and fit array -----------------------------------------------------------
label_encoder = LabelEncoder()

# Parse through and encode each individual sequence 
def ordinal_encoder(my_array):
   label_encoder.fit(my_array)
   integer_encoded = label_encoder.transform(my_array)
   float_encoded = integer_encoded.astype(float)
   j = 0
   for i in range(0,20):
      j += .05
      float_encoded[float_encoded == i] = round(j, 2)
   return float_encoded

# FEATURE: Convert amino acid genome df column into encoded array
def create_encoded_genome_col(df):

  # Convert amino acid genome column into array
  arr = df['sequence'].to_list()

  feature_vectors = []
  for a in arr:
      # Get unique amino acids and count of unique amino acids in each sequence
      unique_values = set(a)
      count = len(unique_values)

      # Encode sequence
      encoding = ordinal_encoder(list(a))
      feature_vectors.append(encoding)

  # Create new column of encoded sequences
  df = df.assign(encoded = feature_vectors)

  return df

In [None]:
df2 = create_pfeature("sequences_training.txt", "protein2.seq", "comp.txt", 0)
df2 = create_count_features(df2)
df2 = create_encoded_genome_col(df2)

/content/drive/.shortcut-targets-by-id/1fuA8qot-MwgQ6H4rA5nuxA3l8Za7jqqS/rams/fall 2022/CMSC 435/CMSC 435 Project/pfeature_standalone
INPUT DF SIZE:  (8795, 1)
COMP DF SIZE (26385, 343)
FINAL DF SIZE (26385, 345)


In [None]:
# Adding 2 gram features to the dataset -------------------------------------------------

# assigning two gram features to the dataset, but grouped
df2 = df2.astype(object)
df2.assign(two_gram_features = np.zeros([len(df2),400]))

for i, row in enumerate(df2.sequence.to_numpy()):
  whole_pair_list = two_or_three_gram(row,2) 
  divide_pair_list_into_groups = np.array(whole_pair_list).reshape(20,20)
  list_of_counts = []
  for group in divide_pair_list_into_groups:
    # trying to find better ways to group these
    # 1 way: take the sum of one group and make it that value, but scale it to be between 0 and 1
    sum = np.sum(group) 
    sum = round(sum * .01, 2)
    list_of_counts.append(sum) 

  # 2nd way: take the max value of that group, and add the index to it (index of the max has importance to it, adding is one way to show it)
  #   group = np.array(group)
  #   max = np.max(group)
  #   index = group.argmax() 
  #   max_plus_index = round(max * .01, 2) + index
  #   list_of_counts.append(max_plus_index) 
  df2.at[i,"two_gram_features"] = list_of_counts

TypeError: ignored

In [None]:
print("DF COLUMNS: ", df2.columns)
print("DF SHAPE: ", df2.shape)

# Model Implementation

### Create feature vector + training dataset

In [None]:
from joblib.logger import print_function
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.utils.validation import check_array

def prep_for_model(df2, testing_bool): 
  y = 0

  print("FINAL COLUMNS PRE", df2.columns)

  # Drop features we're not looking at  
  if (testing_bool == 0):
    df_final = df2.drop(columns=['label', 'sequence','unique_val', 'unique_val_count', 'encoded'])       
    labels = df2['label'].to_numpy()
    y = labels
  elif (testing_bool == 1): 
    df_final = df2.drop(columns=['sequence','unique_val', 'unique_val_count', 'encoded'])

  print("FINAL COLUMNS POST", df_final.columns)
  print("AA COLUMNS", len(df_final.aa_count_freq[0]))

  # Combining all the features we want to use and adding it to new feature vector column
  features = df_final.to_numpy()
  feature_vector = list()
  for i in range(len(features)):
    np.set_printoptions(suppress=True)
    flattened_features = np.hstack(np.array(features[i]))
    feature_vector.append(flattened_features)
  df_final['feature_vector'] = feature_vector

  # features and lables will be used to run the model
  features = df_final['feature_vector'].to_numpy()

  # Numpy is weird, few workarounds to unpack this 2d array correctly
  X = np.asarray(features).reshape(-1,1)
  X = np.array([x[0] for x in X])

  # Resampling data (only the classes with the least data)
  sm = SMOTE(random_state=42, sampling_strategy={"DRNA":200, "RNA": 750, "DNA":550})

  # Feature Scaling for input features (if we want to scale features further down the line)
  # scaler = preprocessing.StandardScaler()
  # x_scaled = scaler.fit_transform(x)

  # Create training CSV file (501 columns appraoch)
  columns = ['sequence_length']
  feature_vector_length = len(df_final.feature_vector[0])
  print("FEATURE VECTOR LENGTH: ", feature_vector_length)
  for i in range(0,feature_vector_length - 1):
    temp = 'aa_' + str(i)
    columns.append(temp)
  # columns.append('label')

  if (testing_bool == 0):
    final_training = pd.DataFrame(list(df_final['feature_vector']), columns=columns)
    display(final_training)
    final_training['class'] = labels
    # Output CSV
    display(final_training.head(20))
    # final_training.to_csv("training2.csv")


  return X, y, sm

X, y, sm = prep_for_model(df2, 0)

### Cross Validation + Model Implementation

In [None]:
# classifier = MultinomialNB(alpha=0.1)

# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors=3)

print("Number of feature before k select: ", len(X[0]))

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(estimator=RandomForestClassifier()).fit(X, y)
X = selector.transform(X)

print("Number of feature after k select: ", len(X[0]))



Number of feature before k select:  384
Number of feature after k select:  163


In [None]:
# CROSS VALIDATION AND MODEL IMPLEMENTATION

lst_perf_metrics = []
final_y_prediction = list() # create empty list
final_y_expected = list()

# See more info on the output 5 fold validation data here: https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_index, test_index in skf.split(X, y):

  # Features for training and testing
  X_train_fold, X_test_fold = X[train_index], X[test_index]

  # Class labels for training and testing
  y_train_fold, y_test_fold = y[train_index], y[test_index]

  X_train_fold = X_train_fold.tolist()
  y_train_fold = y_train_fold.tolist()
  X_test_fold = X_test_fold.tolist()
  y_test_fold = y_test_fold.tolist()

  # Resampling in the cross validation
  X_train_fold, y_train_fold = sm.fit_resample(X_train_fold, y_train_fold)

  classifier.fit(X_train_fold, y_train_fold)
  curr_y_prediction = classifier.predict(X_test_fold) 

  final_y_prediction.append(list(curr_y_prediction))
  final_y_expected.append(list(y_test_fold))

flattened_array_pred = [item for sublist in final_y_prediction for item in sublist]
flattened_array_exp = [item for sublist in final_y_expected for item in sublist]

### Performance Metrics Method

In [None]:
# PERFORMANCE METRICS CALCULATION
def confusion_metrics(TP, TN, FP, FN, label_name):
    
    # calculate accuracy
    conf_accuracy = 100 * (float (TP+TN)) / (float(TP + FP + TN + FN))
    # calculate the sensitivity
    conf_sensitivity = 100 * (TP / float(TP + FN))
    # calculate the specificity
    conf_specificity = 100 * (TN / float(TN + FP))
    # calculate mcc
    conf_mcc = float( (TP*TN) - (FP*FN) ) / math.sqrt(float((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))

    print(label_name, '-'*50)
    print(f'Sensitivity: {round(conf_sensitivity,1)}') 
    print(f'Specificity: {round(conf_specificity,1)}') 
    print(f'Accuracy: {round(conf_accuracy,1)}') 
    #print(f'Precision: {round(conf_precision,2)}')
    print(f'MCC: {round(conf_mcc,3)}',"\n")

    return conf_mcc

def perf_metrics(y_prediction, y_test_fold):

  # print(classification_report(y_test_fold, y_prediction), "\n")

  total_num_proteins = len(y_prediction)

  # Get all unique sense tags
  indices = list(set(df2['label']))
  
  # Using sklearn.metrics, create a confusion matrix
  cm = metrics.confusion_matrix(y_test_fold, y_prediction, labels = indices)

  cm_df = pd.DataFrame(cm,
              index = ['A: ' + i for i in indices], 
              columns = ['P: ' + i for i in indices])

  print(cm_df, '\n')

  mcc = 0

  # DNA - Shahad
  DNA_TP = cm_df["P: DNA"]["A: DNA"]
  DNA_TN = cm_df["P: nonDRNA"]["A: nonDRNA"] + cm_df["P: DRNA"]["A: nonDRNA"] + cm_df["P: RNA"]["A: nonDRNA"] \
        + cm_df["P: nonDRNA"]["A: DRNA"] + cm_df["P: nonDRNA"]["A: RNA"] + cm_df["P: DRNA"]["A: DRNA"] \
      + cm_df["P: DRNA"]["A: RNA"] + cm_df["P: RNA"]["A: DRNA"] + cm_df["P: RNA"]["A: RNA"] 
  DNA_FP = cm_df["P: DNA"]["A: nonDRNA"] + cm_df["P: DNA"]["A: DRNA"] + cm_df["P: DNA"]["A: RNA"] 
  DNA_FN = cm_df["P: nonDRNA"]["A: DNA"] + cm_df["P: DRNA"]["A: DNA"] + cm_df["P: RNA"]["A: DNA"]
  mcc += confusion_metrics(DNA_TP, DNA_TN, DNA_FP, DNA_FN, "DNA")

  # RNA - Ahmad 
  RNA_TP = cm_df["P: RNA"]["A: RNA"]
  RNA_TN = cm_df["P: nonDRNA"]["A: nonDRNA"] + cm_df["P: DRNA"]["A: nonDRNA"] + cm_df["P: DNA"]["A: nonDRNA"] \
    + cm_df["P: nonDRNA"]["A: DRNA"] + cm_df["P: nonDRNA"]["A: DNA"] + cm_df["P: DRNA"]["A: DRNA"] \
    + cm_df["P: DRNA"]["A: DNA"] + cm_df["P: DNA"]["A: DRNA"] + cm_df["P: DNA"]["A: DNA"] 
  RNA_FP = cm_df["P: RNA"]["A: nonDRNA"] + cm_df["P: RNA"]["A: DRNA"] + cm_df["P: RNA"]["A: DNA"] 
  RNA_FN = cm_df["P: nonDRNA"]["A: RNA"] + cm_df["P: DRNA"]["A: RNA"] + cm_df["P: DNA"]["A: RNA"]
  mcc += confusion_metrics(RNA_TP, RNA_TN, RNA_FP, RNA_FN, "RNA")

  # DRNA - Ahmad 
  DRNA_TP = cm_df["P: DRNA"]["A: DRNA"]
  DRNA_TN = cm_df["P: nonDRNA"]["A: nonDRNA"] + cm_df["P: RNA"]["A: nonDRNA"] + cm_df["P: nonDRNA"]["A: RNA"] \
    + cm_df["P: RNA"]["A: RNA"] + cm_df["P: nonDRNA"]["A: DNA"] + cm_df["P: RNA"]["A: DNA"] \
    + cm_df["P: DNA"]["A: nonDRNA"] + cm_df["P: DNA"]["A: RNA"] + cm_df["P: DNA"]["A: DNA"]
  DRNA_FP = cm_df["P: DRNA"]["A: nonDRNA"] + cm_df["P: DRNA"]["A: RNA"] + cm_df["P: DRNA"]["A: DNA"]
  DRNA_FN = cm_df["P: nonDRNA"]["A: DRNA"] + cm_df["P: RNA"]["A: DRNA"] + cm_df["P: DNA"]["A: DRNA"]
  mcc += confusion_metrics(DRNA_TP, DRNA_TN, DRNA_FP, DRNA_FN, "DRNA")
  
  #nonDRNA - Tara
  nonDRNA_TP = cm_df["P: nonDRNA"]["A: nonDRNA"]
  nonDRNA_TN = cm_df["P: RNA"]["A: RNA"] + cm_df["P: RNA"]["A: DRNA"] + cm_df["P: RNA"]["A: DNA"] \
    + cm_df["P: DRNA"]["A: DRNA"] + cm_df["P: DRNA"]["A: RNA"] + cm_df["P: DRNA"]["A: DNA"]  \
    + cm_df["P: DNA"]["A: DNA"] + cm_df["P: DNA"]["A: DRNA"] + cm_df["P: DNA"]["A: RNA"]
  nonDRNA_FN = cm_df["P: RNA"]["A: nonDRNA"] + cm_df["P: DNA"]["A: nonDRNA"] + cm_df["P: DRNA"]["A: nonDRNA"]
  nonDRNA_FP = cm_df["P: nonDRNA"]["A: DNA"] + cm_df["P: nonDRNA"]["A: RNA"] + cm_df["P: nonDRNA"]["A: DRNA"]
  mcc += confusion_metrics(nonDRNA_TP, nonDRNA_TN, nonDRNA_FP, nonDRNA_FN, "nonDRNA")

  average_mcc = round(mcc/4,3)
  accuracy4labels = round(100*(DNA_TP + RNA_TP + DRNA_TP + nonDRNA_TP)/ total_num_proteins,1)

  print("\nAverage MCC: ", average_mcc)
  print("\nAccuracy4Labels: ", accuracy4labels)

perf_metrics(flattened_array_pred, flattened_array_exp)

            P: RNA  P: nonDRNA  P: DNA  P: DRNA
A: RNA         164         359       0        0
A: nonDRNA      25        7830       4        0
A: DNA          12         362      16        1
A: DRNA          0          20       0        2 

DNA --------------------------------------------------
Sensitivity: 4.1
Specificity: 100.0
Accuracy: 95.7
MCC: 0.175 

RNA --------------------------------------------------
Sensitivity: 31.4
Specificity: 99.6
Accuracy: 95.5
MCC: 0.489 

DRNA --------------------------------------------------
Sensitivity: 9.1
Specificity: 100.0
Accuracy: 99.8
MCC: 0.246 

nonDRNA --------------------------------------------------
Sensitivity: 99.6
Specificity: 20.8
Accuracy: 91.2
MCC: 0.401 


Average MCC:  0.328

Accuracy4Labels:  91.1


### Test Set

In [None]:
%cd "/content/drive/MyDrive/rams/fall 2022/CMSC 435/CMSC 435 Project"
df_test = create_pfeature("fake_test.txt", "protein_test.seq", "comp_test.txt", 1)
df_test = create_count_features(df_test)
df_test = create_encoded_genome_col(df_test)

/content/drive/.shortcut-targets-by-id/1fuA8qot-MwgQ6H4rA5nuxA3l8Za7jqqS/rams/fall 2022/CMSC 435/CMSC 435 Project
/content/drive/.shortcut-targets-by-id/1fuA8qot-MwgQ6H4rA5nuxA3l8Za7jqqS/rams/fall 2022/CMSC 435/CMSC 435 Project/pfeature_standalone
INPUT DF SIZE:  (8795, 1)
COMP DF SIZE (8795, 343)
FINAL DF SIZE (8795, 344)


In [None]:
# Create n gram features -------------------------------------------------------------------
df_test = df_test.astype(object)
df_test.assign(two_gram_features = np.zeros([len(df_test),400]))

for i, row in enumerate(df_test.sequence.to_numpy()):
  whole_pair_list = two_or_three_gram(row,2) 
  divide_pair_list_into_groups = np.array(whole_pair_list).reshape(20,20)
  list_of_counts = []
  for group in divide_pair_list_into_groups:
    # trying to find better ways to group these
    # 1 way: take the sum of one group and make it that value, but scale it to be between 0 and 1
    sum = np.sum(group) 
    sum = round(sum * .01, 2)
    list_of_counts.append(sum) 

  # 2nd way: take the max value of that group, and add the index to it (index of the max has importance to it, adding is one way to show it)
  #   group = np.array(group)
  #   max = np.max(group)
  #   index = group.argmax() 
  #   max_plus_index = round(max * .01, 2) + index
  #   list_of_counts.append(max_plus_index) 
  df_test.at[i,"two_gram_features"] = list_of_counts

print("DF TEST SHAPE ", df_test.shape)
print("DF TEST COLUMNS", df_test.columns)

DF TEST SHAPE  (8795, 350)
DF TEST COLUMNS Index(['sequence', 'CTC_111', 'CTC_112', 'CTC_113', 'CTC_114', 'CTC_115',
       'CTC_116', 'CTC_117', 'CTC_121', 'CTC_122',
       ...
       'CTC_774', 'CTC_775', 'CTC_776', 'CTC_777', 'length', 'unique_val',
       'unique_val_count', 'aa_count_freq', 'encoded', 'two_gram_features'],
      dtype='object', length=350)


In [None]:
# Run model
X_test, y_test, sm_test = prep_for_model(df_test, 1)
X_test = selector.transform(X_test)
curr_y_prediction = classifier.predict(X_test) 
df_test['label'] = curr_y_prediction #FIX THAT: 

%cd "/content/drive/MyDrive/rams/fall 2022/CMSC 435/CMSC 435 Project"
df_test.to_csv("test_output.csv")

FINAL COLUMNS PRE Index(['sequence', 'CTC_111', 'CTC_112', 'CTC_113', 'CTC_114', 'CTC_115',
       'CTC_116', 'CTC_117', 'CTC_121', 'CTC_122',
       ...
       'CTC_774', 'CTC_775', 'CTC_776', 'CTC_777', 'length', 'unique_val',
       'unique_val_count', 'aa_count_freq', 'encoded', 'two_gram_features'],
      dtype='object', length=350)
FINAL COLUMNS POST Index(['CTC_111', 'CTC_112', 'CTC_113', 'CTC_114', 'CTC_115', 'CTC_116',
       'CTC_117', 'CTC_121', 'CTC_122', 'CTC_123',
       ...
       'CTC_771', 'CTC_772', 'CTC_773', 'CTC_774', 'CTC_775', 'CTC_776',
       'CTC_777', 'length', 'aa_count_freq', 'two_gram_features'],
      dtype='object', length=346)
AA COLUMNS 20
FEATURE VECTOR LENGTH:  384
['sequence_length', 'aa_0', 'aa_1', 'aa_2', 'aa_3', 'aa_4', 'aa_5', 'aa_6', 'aa_7', 'aa_8', 'aa_9', 'aa_10', 'aa_11', 'aa_12', 'aa_13', 'aa_14', 'aa_15', 'aa_16', 'aa_17', 'aa_18', 'aa_19', 'aa_20', 'aa_21', 'aa_22', 'aa_23', 'aa_24', 'aa_25', 'aa_26', 'aa_27', 'aa_28', 'aa_29', 'aa_30', '