In [1]:
#lets start by importing a bunch of stuff
import tensorflow as tf
import pandas as pd
import numpy as np
import math 

In [2]:
# Downloading and separating data

#explictily setting the types and names
names_data = ['entry','entry_name','protein_name','gene_name','organism','length','sequence',
              'gene_ontology','status','organism_id','keywords']

dtypes_data = {'entry':'str','entry_name':'str','protein_name':'str','gene_name':'str',
               'organism':'str','length':'int','sequence':'str','gene_ontology':'str','status':'str',
               'organism_id':'int','keywords':'str'}


protein_data = pd.read_csv('uniprot-filtered-reviewed_yes.tab',sep='\t',names = names_data,dtype=dtypes_data,skiprows=1)


#Seeing what the data looks like
protein_data.shape

#randomly shuffling dataset 

protein_data = protein_data.sample(frac=1)



In [3]:
#Ignoring protein's with ambigous amino acid codes 

protein_data = protein_data[~protein_data.sequence.str.contains('B')]
protein_data = protein_data[~protein_data.sequence.str.contains('O')]
protein_data = protein_data[~protein_data.sequence.str.contains('J')]
protein_data = protein_data[~protein_data.sequence.str.contains('U')]
protein_data = protein_data[~protein_data.sequence.str.contains('X')]
protein_data = protein_data[~protein_data.sequence.str.contains('Z')]

protein_data.shape


(556604, 11)

In [4]:
index_data_nan= protein_data.loc[protein_data.gene_ontology.isnull()].index
protein_data = protein_data.drop(index_data_nan)

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

protein_data = protein_data.drop(columns=['gene_name', 'organism_id', 'status', 'protein_name', 'entry_name', 'organism', 'keywords'])

index_data = protein_data.loc[protein_data.length > 500].index
protein_data = protein_data.drop(index_data)


#encoding the sequence to a one-hot encoding scheme 
labels = np.array(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'])


#label encoding - integer encoding 
label_encoder = LabelEncoder()
label_encoder.fit(labels)

# onehot encoder
onehot_encoder = OneHotEncoder(sparse=False, categories = [range(20)], dtype=int)


def one_hot_encoder(my_string):
    """function to turn sequence onehot encoded"""
    my_array = np.array(list(my_string)) # converts string into array
    integer_encoded = label_encoder.transform(my_array)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded

#testing custom onehot_encoder
# test_sequence = protein_data.sequence[0]
# string_to_array(test_sequence)
# one_hot_encoder(test_sequence)

#applying onehot_encoder to entire dataframe(takes 6 minutes)
protein_data['sequence_encoded'] = protein_data['sequence'].apply(one_hot_encoder)


In [6]:
#filtering the length of the sequences, dropping sequences greater than 6000

max_length = int(protein_data.length.max())# 2000

def make_all_same_length(my_array):
    """function to make all the sequence encoded arrays into the same length"""
    if len(my_array) == 500:
        return my_array
    else:
        b= [[0]* 20] * (500 - len(my_array))
        same_length_array = np.concatenate((my_array,b), axis = 0)
        return same_length_array

protein_data['sequence_encoded_same_len'] = protein_data['sequence_encoded'].apply(make_all_same_length)

In [7]:
#Dividing into 80 percent training and 20 percent test
train, test = np.split(protein_data, [int(.8*len(protein_data))])

#removing GO labels and keywords from the test dataset
test = test.drop(columns=['gene_ontology'])

### Don't output the cells below

In [None]:
# example  = str(train.gene_ontology.iloc[3])
# print(example)
# example_array = np.array(example.split(";"))
# list_example = [x.strip() for x in example_array]
# list_example
# built a dictionary of unique GO MF terms , just in training dataset we have 8272 terms GO MF
max_len_GO = 0
dictionary_uniq_GO = {}
for index, row in train.iterrows():
    for x in row.gene_ontology.split(";"):
        x=x[:-12]
        x=x.strip()
        if x in dictionary_uniq_GO:
            dictionary_uniq_GO[x] += 1
        else:
            dictionary_uniq_GO[x] = 1
    else:
        pass
#print(dictionary_uniq_GO)
#getting top 50 terms for GO 


### Begining one hot encoding the GO Terms 

In [24]:
def clean_GO_terms(term):
    """returns array cleaned for GO sequencing, splits and removes 'GO:...' """
    term_array = term.split(';')
    cleaned_array = [x[:-12] for x in term_array]
    stripped_array = [x.strip() for x in cleaned_array]
    return cleaned_array
string_array = train.gene_ontology.head(n=100).iloc[3].split(';')

train['gene_ontology'] = train['gene_ontology'].apply(clean_GO_terms)
train.head()

['ATP binding [GO:0005524]', ' GMP synthase (glutamine-hydrolyzing) activity [GO:0003922]']
['ATP binding', 'GMP synthase (glutamine-hydrolyzing) activity']


Unnamed: 0,entry,length,sequence,gene_ontology,sequence_encoded,sequence_encoded_same_len
43262,B1L9P8,70,MAKRCEVCGKAPRSGNTVSHSDKKSGRWFRPNLQKVRVVLPDGTIK...,[structural constituent of ribosome ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
505187,E0UR96,368,MSLNLQTYTEHFTNPLYLESGRILEPYDITYETYGTMNEDKSNVVV...,[homoserine O-acetyltransferase activity ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
531560,Q8VD31,451,MGLEPSWYLLLCLAVSGAAGTDPPTAPTTAERQRQPTDIILDCFLV...,[protein-containing complex binding ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
493478,O26805,186,MFMILIINNHGQYNHRIHRTLRYLQIPSELVPNTTPLDEIISRDPE...,"[ATP binding , GMP synthase (glutamine-hydrol...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."
306310,Q2RUI7,176,MVNSMPKPFLTPGTLYCGAAPAVISTVLGSCVAVCLIDRHNRAAGM...,[protein-glutamine glutaminase activity ],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,..."


In [27]:
listofpopularGO_terms = np.array(['ATP binding','structural constituent of ribosome','metal ion binding','rRNA binding',
                         'DNA binding','magnesium ion binding','zinc ion binding','RNA binding,tRNA binding', 
                         'GTP binding','4 iron, 4 sulfur cluster binding','iron ion binding', 
                         'DNA-binding transcription factor activity','toxin activity, rotational mechanism', 
                         'GTPase activity','NAD binding','pyridoxal phosphate binding','heme binding','NADP binding', 
                         'quinone binding','nucleic acid binding','sequence-specific DNA binding','electron transfer activity', 
                         'protein homodimerization activity','protein dimerization activity', 
                         'manganese ion binding','unfolded protein binding','translation elongation factor activity', 
                         'identical protein binding', 'nucleotide binding','NADH dehydrogenase (ubiquinone) activity', 
                         'transferase activity','calcium ion binding','serine-type endopeptidase activity', 
                         'metalloendopeptidase activity','structural molecule activity','oxidoreductase activity', 
                         'translation initiation factor activity',
                         'ATPase activity,protein heterodimerization activity','sulfurtransferase activity', 
                         'ribosome binding','2 iron, 2 sulfur cluster binding','FMN binding','nickel cation binding',
                         'single-stranded DNA binding','large ribosomal subunit rRNA binding', 
                         'protein serine/threonine kinase activity'])

#top 50 popular GO terms the one with the most annotations 
def removeGO_term_not_50(array):
    """removes all the GO terms that are not in the top 50 most annotated list"""
    for x in array:
        if x in listofpopularGO_terms:
            pass
        else:
            array.loc(x)
    return array
pd_test = train.head(50)
pd_test['new_gene_ontology'] = pd_test['gene_ontology'].apply(removeGO_term_not_50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,entry,length,sequence,gene_ontology,sequence_encoded,sequence_encoded_same_len,new_gene_ontology
43262,B1L9P8,70,MAKRCEVCGKAPRSGNTVSHSDKKSGRWFRPNLQKVRVVLPDGTIK...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
505187,E0UR96,368,MSLNLQTYTEHFTNPLYLESGRILEPYDITYETYGTMNEDKSNVVV...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
531560,Q8VD31,451,MGLEPSWYLLLCLAVSGAAGTDPPTAPTTAERQRQPTDIILDCFLV...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
493478,O26805,186,MFMILIINNHGQYNHRIHRTLRYLQIPSELVPNTTPLDEIISRDPE...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
306310,Q2RUI7,176,MVNSMPKPFLTPGTLYCGAAPAVISTVLGSCVAVCLIDRHNRAAGM...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
56009,Q2IJ92,156,MPRRREVEKRKILPDPKFQDRIVAKFVNNLMRKGKKSTGERIIYGA...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
134332,P39075,278,MKESYYSIGEVSKLANVSIKALRYYDKIDLFKPAYVDPDTSYRYYT...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
558768,B0V529,267,MSRLATRFEKLQSQQRKALVSYVMAGDPQPQVTVPLLHKMVAAGVD...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
185591,Q6LZF1,326,MREVLLSECIDLLYESHFVISKPFGRSCFDLIAKKGNLRLLIKILK...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
45516,Q1CVF3,156,MQIIEGKLQLQGNERVAILTSRFNHIITDRLKEGAMDCFKRHGGDE...,[],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",[]
