In [None]:
#lets start by importing a bunch of stuff
import tensorflow as tf
import pandas as pd
import numpy as np
import math 

In [None]:
# Downloading and separating data

#explictily setting the types and names
names_data = ['entry','entry_name','protein_name','gene_name','organism','length','sequence',
              'gene_ontology','status','organism_id','keywords']

dtypes_data = {'entry':'str','entry_name':'str','protein_name':'str','gene_name':'str',
               'organism':'str','length':'int','sequence':'str','gene_ontology':'str','status':'str',
               'organism_id':'int','keywords':'str'}


protein_data = pd.read_csv('uniprot-filtered-reviewed_yes.tab',sep='\t',names = names_data,dtype=dtypes_data,skiprows=1)


#Seeing what the data looks like
protein_data.shape

#randomly shuffling dataset 

protein_data = protein_data.sample(frac=1)



In [None]:
#Ignoring protein's with ambigous amino acid codes 

protein_data = protein_data[~protein_data.sequence.str.contains('B')]
protein_data = protein_data[~protein_data.sequence.str.contains('O')]
protein_data = protein_data[~protein_data.sequence.str.contains('J')]
protein_data = protein_data[~protein_data.sequence.str.contains('U')]
protein_data = protein_data[~protein_data.sequence.str.contains('X')]
protein_data = protein_data[~protein_data.sequence.str.contains('Z')]

protein_data.shape


In [4]:
index_data_nan= protein_data.loc[protein_data.gene_ontology.isnull()].index
protein_data = protein_data.drop(index_data_nan)

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

protein_data = protein_data.drop(columns=['gene_name', 'organism_id', 'status', 'protein_name', 'entry_name', 'organism', 'keywords'])

index_data = protein_data.loc[protein_data.length > 500].index
protein_data = protein_data.drop(index_data)


#encoding the sequence to a one-hot encoding scheme 
labels = np.array(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'])


#label encoding - integer encoding 
label_encoder = LabelEncoder()
label_encoder.fit(labels)

# onehot encoder
onehot_encoder = OneHotEncoder(sparse=False, categories = [range(20)], dtype=int)


def one_hot_encoder(my_string):
    """function to turn sequence onehot encoded"""
    my_array = np.array(list(my_string)) # converts string into array
    integer_encoded = label_encoder.transform(my_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded

#testing custom onehot_encoder
# test_sequence = protein_data.sequence[0]
# string_to_array(test_sequence)
# one_hot_encoder(test_sequence)

#applying onehot_encoder to entire dataframe(takes 6 minutes)
protein_data['sequence_encoded'] = protein_data['sequence'].apply(one_hot_encoder)


In [6]:
#filtering the length of the sequences, dropping sequences greater than 6000

max_length = int(protein_data.length.max())# 2000

def make_all_same_length(my_array):
    """function to make all the sequence encoded arrays into the same length"""
    if len(my_array) == 500:
        return my_array
    else:
        b= [[0]* 20] * (500 - len(my_array))
        same_length_array = np.concatenate((my_array,b), axis = 0)
        return same_length_array

protein_data['sequence_encoded_same_len'] = protein_data['sequence_encoded'].apply(make_all_same_length)

In [7]:
#Dividing into 80 percent training and 20 percent test
train, test = np.split(protein_data, [int(.8*len(protein_data))])

#removing GO labels and keywords from the test dataset
test = test.drop(columns=['gene_ontology'])

### Don't output the cells below

In [None]:
# example  = str(train.gene_ontology.iloc[3])
# print(example)
# example_array = np.array(example.split(";"))
# list_example = [x.strip() for x in example_array]
# list_example
# built a dictionary of unique GO MF terms , just in training dataset we have 8272 terms GO MF
max_len_GO = 0
dictionary_uniq_GO = {}
for index, row in train.iterrows():
    for x in row.gene_ontology.split(";"):
        x=x[:-12]
        x=x.strip()
        if x in dictionary_uniq_GO:
            dictionary_uniq_GO[x] += 1
        else:
            dictionary_uniq_GO[x] = 1
    else:
        pass
#print(dictionary_uniq_GO)
#getting top 50 terms for GO 


### Begining one hot encoding the GO Terms 

In [8]:
train.head(n=20)

Unnamed: 0,entry,length,sequence,gene_ontology,sequence_encoded,sequence_encoded_same_len
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,metal ion binding [GO:0046872]; UDP-3-O-[3-hyd...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,glutamyl-tRNA reductase activity [GO:0008883];...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,magnesium ion binding [GO:0000287]; RNA bindin...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,structural constituent of ribosome [GO:0003735],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,rRNA binding [GO:0019843]; structural constitu...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
254022,Q49Z04,342,MSETTLILAIESSCDETSVSIIENGKNILSNIVLSQIESHKRFGGV...,iron ion binding [GO:0005506]; metalloendopept...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
146157,B7N1S3,310,MIIVTGGAGFIGSNIVKALNDKGITDILVVDNLKDGTKFVNLVDLD...,ADP-glyceromanno-heptose 6-epimerase activity ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
209638,P0AB52,117,MQKIVIVANGAPYGSESLFNSLRLAIALREQESNLDLRLFLMSDAV...,identical protein binding [GO:0042802],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
65212,D5VGV2,129,MPKTVITPPGTGTPIAPFSPGTLADGIVYVSGTLAFDKDNNVAFPG...,oxidoreductase activity [GO:0016491],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
420563,Q3ZZJ0,284,MYKKIGIIYHPLNPAACDLAIKLTAKLDSLGIENWSDSAWQADKLT...,ATP binding [GO:0005524]; metal ion binding [G...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."


In [9]:
def clean_GO_terms(term):
    """returns array cleaned for GO sequencing, splits and removes 'GO:...' """
    term_array = term.split(';')
    cleaned_array = [x[:-12] for x in term_array]
    stripped_array = [x.strip() for x in cleaned_array]
    return cleaned_array
#string_array = train.gene_ontology.head(n=100).iloc[3].split(';')

train['gene_ontology'] = train['gene_ontology'].apply(clean_GO_terms)
train.head(n=20)

Unnamed: 0,entry,length,sequence,gene_ontology,sequence_encoded,sequence_encoded_same_len
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,"[metal ion binding , UDP-3-O-[3-hydroxymyrist...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,"[glutamyl-tRNA reductase activity , NADP bind...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,"[magnesium ion binding , RNA binding , RNA-D...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,[structural constituent of ribosome ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,"[rRNA binding , structural constituent of rib...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
254022,Q49Z04,342,MSETTLILAIESSCDETSVSIIENGKNILSNIVLSQIESHKRFGGV...,"[iron ion binding , metalloendopeptidase acti...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
146157,B7N1S3,310,MIIVTGGAGFIGSNIVKALNDKGITDILVVDNLKDGTKFVNLVDLD...,[ADP-glyceromanno-heptose 6-epimerase activity...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
209638,P0AB52,117,MQKIVIVANGAPYGSESLFNSLRLAIALREQESNLDLRLFLMSDAV...,[identical protein binding ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
65212,D5VGV2,129,MPKTVITPPGTGTPIAPFSPGTLADGIVYVSGTLAFDKDNNVAFPG...,[oxidoreductase activity ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
420563,Q3ZZJ0,284,MYKKIGIIYHPLNPAACDLAIKLTAKLDSLGIENWSDSAWQADKLT...,"[ATP binding , metal ion binding , NAD+ kina...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."


In [None]:
# #just strip before checkiong top 50
# cool= np.array(['4 iron, 4 sulfur cluster binding',
#  "GTP 3',8'-cyclase activity",
#  'GTP binding',
#  'metal ion binding',
#  'S-adenosyl-L-methionine binding'])
# for x in train.gene_ontology.iloc[1]:
#     if x.strip() in cool:
#         print("cool")
#     else:
#         print('the spaces matter!')
array = [1,2,3,4,5,6]
array.remove(1)
print(array)
train.head()

In [10]:
#have only applied this to train, not test!!!!!!
#dropping sequence encoded column
#train = train.drop(columns=['sequence_encoded'])
listofpopularGO_terms = np.array(['ATP binding','structural constituent of ribosome','metal ion binding','rRNA binding',
                         'DNA binding','magnesium ion binding','zinc ion binding','RNA binding','tRNA binding', 
                         'GTP binding','4 iron, 4 sulfur cluster binding','iron ion binding', 
                         'DNA-binding transcription factor activity','toxin activity', 'rotational mechanism', 
                         'GTPase activity','NAD binding','pyridoxal phosphate binding','heme binding','NADP binding', 
                         'quinone binding','nucleic acid binding','sequence-specific DNA binding','electron transfer activity', 
                         'protein homodimerization activity','protein dimerization activity', 
                         'manganese ion binding','unfolded protein binding','translation elongation factor activity', 
                         'identical protein binding', 'nucleotide binding','NADH dehydrogenase (ubiquinone) activity', 
                         'transferase activity','calcium ion binding','serine-type endopeptidase activity', 
                         'metalloendopeptidase activity','structural molecule activity','oxidoreductase activity', 
                         'translation initiation factor activity',
                         'ATPase activity','protein heterodimerization activity','sulfurtransferase activity', 
                         'ribosome binding','2 iron, 2 sulfur cluster binding','FMN binding','nickel cation binding',
                         'single-stranded DNA binding','large ribosomal subunit rRNA binding', 
                         'protein serine/threonine kinase activity'])

#top 50 popular GO terms the one with the most annotations 
def removeGO_term_not_50(array):
    """removes all the GO terms that are not in the top 50 most annotated list"""
    new_array = [x.strip() for x in array]
    return_array = []
    for x in range(len(new_array)):
        if new_array[x] in listofpopularGO_terms:
            return_array.append(new_array[x])
        else:
            pass
    return return_array
#array = ['ATP binding ','NANA ',' NANA', 'NANA']
#print(removeGO_term_not_50(array))
train['new_gene_ontology'] = train['gene_ontology'].apply(removeGO_term_not_50)
train.head(n=25)

Unnamed: 0,entry,length,sequence,gene_ontology,sequence_encoded,sequence_encoded_same_len,new_gene_ontology
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,"[metal ion binding , UDP-3-O-[3-hydroxymyrist...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[metal ion binding]
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,"[glutamyl-tRNA reductase activity , NADP bind...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding]
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,"[magnesium ion binding , RNA binding , RNA-D...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[magnesium ion binding, RNA binding]"
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,[structural constituent of ribosome ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[structural constituent of ribosome]
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,"[rRNA binding , structural constituent of rib...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[rRNA binding, structural constituent of ribos..."
254022,Q49Z04,342,MSETTLILAIESSCDETSVSIIENGKNILSNIVLSQIESHKRFGGV...,"[iron ion binding , metalloendopeptidase acti...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[iron ion binding, metalloendopeptidase activity]"
146157,B7N1S3,310,MIIVTGGAGFIGSNIVKALNDKGITDILVVDNLKDGTKFVNLVDLD...,[ADP-glyceromanno-heptose 6-epimerase activity...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding]
209638,P0AB52,117,MQKIVIVANGAPYGSESLFNSLRLAIALREQESNLDLRLFLMSDAV...,[identical protein binding ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[identical protein binding]
65212,D5VGV2,129,MPKTVITPPGTGTPIAPFSPGTLADGIVYVSGTLAFDKDNNVAFPG...,[oxidoreductase activity ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[oxidoreductase activity]
420563,Q3ZZJ0,284,MYKKIGIIYHPLNPAACDLAIKLTAKLDSLGIENWSDSAWQADKLT...,"[ATP binding , metal ion binding , NAD+ kina...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[ATP binding, metal ion binding]"


### Removing empty GO rows in TRAIN!

In [11]:
train = train.drop(columns=['sequence_encoded'])
train = train.drop(columns=['gene_ontology'])
new_train = train[train.astype(str)['new_gene_ontology'] != '[]']
print(new_train.shape)
print(train.shape)

(202526, 5)
(299436, 5)


In [12]:
new_train.head(n=20)

Unnamed: 0,entry,length,sequence,sequence_encoded_same_len,new_gene_ontology
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[metal ion binding]
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding]
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[magnesium ion binding, RNA binding]"
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[structural constituent of ribosome]
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[rRNA binding, structural constituent of ribos..."
254022,Q49Z04,342,MSETTLILAIESSCDETSVSIIENGKNILSNIVLSQIESHKRFGGV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[iron ion binding, metalloendopeptidase activity]"
146157,B7N1S3,310,MIIVTGGAGFIGSNIVKALNDKGITDILVVDNLKDGTKFVNLVDLD...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding]
209638,P0AB52,117,MQKIVIVANGAPYGSESLFNSLRLAIALREQESNLDLRLFLMSDAV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[identical protein binding]
65212,D5VGV2,129,MPKTVITPPGTGTPIAPFSPGTLADGIVYVSGTLAFDKDNNVAFPG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[oxidoreductase activity]
420563,Q3ZZJ0,284,MYKKIGIIYHPLNPAACDLAIKLTAKLDSLGIENWSDSAWQADKLT...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[ATP binding, metal ion binding]"


In [13]:
#actually encoding now-- only new train_new gene_ontology 
listofpopularGO_terms_labels = np.array(['ATP binding','structural constituent of ribosome','metal ion binding','rRNA binding',
                         'DNA binding','magnesium ion binding','zinc ion binding','RNA binding','tRNA binding', 
                         'GTP binding','4 iron, 4 sulfur cluster binding','iron ion binding', 
                         'DNA-binding transcription factor activity','toxin activity', 'rotational mechanism', 
                         'GTPase activity','NAD binding','pyridoxal phosphate binding','heme binding','NADP binding', 
                         'quinone binding','nucleic acid binding','sequence-specific DNA binding','electron transfer activity', 
                         'protein homodimerization activity','protein dimerization activity', 
                         'manganese ion binding','unfolded protein binding','translation elongation factor activity', 
                         'identical protein binding', 'nucleotide binding','NADH dehydrogenase (ubiquinone) activity', 
                         'transferase activity','calcium ion binding','serine-type endopeptidase activity', 
                         'metalloendopeptidase activity','structural molecule activity','oxidoreductase activity', 
                         'translation initiation factor activity',
                         'ATPase activity','protein heterodimerization activity','sulfurtransferase activity', 
                         'ribosome binding','2 iron, 2 sulfur cluster binding','FMN binding','nickel cation binding',
                         'single-stranded DNA binding','large ribosomal subunit rRNA binding', 
                         'protein serine/threonine kinase activity'])

#label encoding, just changes it to numbers
label_encoder = LabelEncoder()
label_encoder.fit(listofpopularGO_terms_labels)

# onehot encoder
onehot_encoder = OneHotEncoder(sparse=False, categories = [range(49)], dtype=int)

def one_hot_encoder(my_array):
    """function to turn sequence onehot encoded"""
    integer_encoded = label_encoder.transform(my_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded
new_train['GO_encoded'] = new_train['new_gene_ontology'].apply(one_hot_encoder)
new_train.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,entry,length,sequence,sequence_encoded_same_len,new_gene_ontology,GO_encoded
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[metal ion binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,..."
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[magnesium ion binding, RNA binding]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[structural constituent of ribosome],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[rRNA binding, structural constituent of ribos...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [33]:
#finding the max number of functions 
#don't need to output
def find_len(df):
    abs_max = 0
    for index, row in df.iterrows():
        max_num = len(row.GO_encoded)
        if max_num > abs_max:
            abs_max = max_num
        else:
            pass
    return abs_max
#print(find_len(new_train))
array= [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]



49


### making GO encoded the same length (making it a 8 X 49 matrix)

In [34]:
def make_all_GO_same_length(my_array):
    """function to make all the sequence encoded arrays into the same length"""
    if len(my_array) == 8:
        return my_array
    else:
        b= [[0]* 49] * (8 - len(my_array))
        same_length_array = np.concatenate((my_array,b), axis = 0)
        return same_length_array
new_train['GO_encoded'] = new_train['GO_encoded'].apply(make_all_GO_same_length)
new_train.head(n=25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,entry,length,sequence,sequence_encoded_same_len,new_gene_ontology,GO_encoded
527365,Q5NI92,286,MMKQKTIAKEFSVTGVGLHSGVDVSMTVKPADIDSGIVFRRADLTP...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[metal ion binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
519549,Q8DI53,426,MNIAVIGLSHKTAPVDVREKLSVPEDVRERALQHLCGYAHIQEATI...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,..."
82351,B1AJ36,316,MAIVSVKLTSNQVDKFLILWRPYLVHNNHNYAKYTFKIDGTLIIIY...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[magnesium ion binding, RNA binding]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
80192,B0S048,115,MDIIKTLEDEQLRENKFDFHVGDTVKVDYLIKEGNKERVQVYEGTV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[structural constituent of ribosome],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
33360,B5R0H7,103,MYAVFQSGGKQHRVSEGQTVRLEKLDIATGETIEFAEVLMIANGEE...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[rRNA binding, structural constituent of ribos...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
254022,Q49Z04,342,MSETTLILAIESSCDETSVSIIENGKNILSNIVLSQIESHKRFGGV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[iron ion binding, metalloendopeptidase activity]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
146157,B7N1S3,310,MIIVTGGAGFIGSNIVKALNDKGITDILVVDNLKDGTKFVNLVDLD...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[NADP binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,..."
209638,P0AB52,117,MQKIVIVANGAPYGSESLFNSLRLAIALREQESNLDLRLFLMSDAV...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[identical protein binding],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
65212,D5VGV2,129,MPKTVITPPGTGTPIAPFSPGTLADGIVYVSGTLAFDKDNNVAFPG...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[oxidoreductase activity],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
420563,Q3ZZJ0,284,MYKKIGIIYHPLNPAACDLAIKLTAKLDSLGIENWSDSAWQADKLT...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[ATP binding, metal ion binding]","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


## Starting the actual CNN now!!

In [37]:
# define parameters 