In [38]:
#lets start by importing a bunch of stuff
import tensorflow as tf
import pandas as pd
import numpy as np
import math 

In [39]:
# Load the reviewed UniProt export from a local tab-separated file and shuffle it.

# Explicitly set the column names and dtypes instead of relying on inference.
names_data = ['entry', 'entry_name', 'protein_name', 'gene_name', 'organism', 'length', 'sequence',
              'gene_ontology', 'status', 'organism_id', 'keywords']

dtypes_data = {'entry': 'str', 'entry_name': 'str', 'protein_name': 'str', 'gene_name': 'str',
               'organism': 'str', 'length': 'int', 'sequence': 'str', 'gene_ontology': 'str',
               'status': 'str', 'organism_id': 'int', 'keywords': 'str'}

# skiprows=1 skips the first line of the file; the column names come from `names_data`.
protein_data = pd.read_csv('uniprot-filtered-reviewed_yes.tab', sep='\t', names=names_data,
                           dtype=dtypes_data, skiprows=1)

# Randomly shuffle the dataset. The seed is fixed so that the positional
# train/test split performed later is identical on every Restart & Run All.
SHUFFLE_SEED = 42
protein_data = protein_data.sample(frac=1, random_state=SHUFFLE_SEED)

# See what the data looks like (last expression of the cell, so it is displayed).
protein_data.shape



In [40]:
# Ignore proteins whose sequence contains any of the letters B, J, O, U, X or Z,
# which are not part of the 20-letter alphabet used for the one-hot encoding.
# A single character-class regex does this in one pass over the column instead
# of six separate passes that each build an intermediate dataframe.
protein_data = protein_data[~protein_data.sequence.str.contains('[BJOUXZ]', regex=True)]

protein_data.shape


(556604, 11)

In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Alphabet for the one-hot encoding scheme: one array element per letter.
labels = np.array(list('ACDEFGHIKLMNPQRSTVWY'))

# Integer encoding: LabelEncoder.fit returns the fitted encoder itself.
label_encoder = LabelEncoder().fit(labels)

# One-hot encoder over the fixed integer categories 0..19, producing a dense
# integer array.
onehot_encoder = OneHotEncoder(categories=[range(20)], sparse=False, dtype=int)


def one_hot_encoder(my_string):
    """Return the one-hot encoding of a sequence string.

    Parameters
    ----------
    my_string : str
        Sequence made of letters known to the module-level ``label_encoder``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(my_string), n_classes)`` with
        ``n_classes = len(label_encoder.classes_)``. Row ``i`` contains a
        single 1, in the column given by the integer code of character ``i``.
        An empty string yields an array of shape ``(0, n_classes)``.

    Raises
    ------
    ValueError
        Propagated from ``label_encoder.transform`` when the string contains a
        letter the encoder was not fitted on.
    """
    n_classes = len(label_encoder.classes_)
    residues = np.array(list(my_string))  # one array element per character
    if residues.size == 0:
        # Nothing to encode: return an empty array of the right width
        # instead of failing on a zero-sample input.
        return np.zeros((0, n_classes), dtype=int)
    integer_encoded = label_encoder.transform(residues)
    # Indexing an identity matrix with the integer codes gives the one-hot
    # rows directly, without re-fitting a OneHotEncoder on every call.
    return np.eye(n_classes, dtype=int)[integer_encoded]

# Smoke-test the custom one-hot encoder on a single sequence.
# .iloc[0] selects by position: after shuffling and filtering, a row with
# index label 0 is not guaranteed to exist, so a label lookup can raise KeyError.
test_sequence = protein_data.sequence.iloc[0]
# One row per character, one column per encoded category.
assert one_hot_encoder(test_sequence).shape == (len(test_sequence), 20)

# Apply the encoder to the entire dataframe (takes about 6 minutes).
protein_data['sequence_encoded'] = protein_data['sequence'].apply(one_hot_encoder)


In [86]:
#print(protein_data.head(1))

# Filter on sequence length: collect the index labels of every row whose
# length is greater than 6000, then drop those rows.
too_long = protein_data.length > 6000
index_data = protein_data[too_long].index
protein_data = protein_data.drop(index=index_data)


def make_all_same_length(my_array, max_length=None):
    """Zero-pad an encoded sequence array so all arrays share the same length.

    Parameters
    ----------
    my_array : array-like
        Encoded sequence; its first axis is the one that gets padded.
    max_length : int, optional
        Target size of the first axis. Defaults to
        ``protein_data.length.max()``. Pass it explicitly when calling this
        for many arrays so the maximum is not recomputed on every call.

    Returns
    -------
    numpy.ndarray
        ``my_array`` followed by all-zero entries along the first axis, with
        ``max_length`` entries in total. All other axes are unchanged.

    Raises
    ------
    ValueError
        If ``my_array`` is already longer than ``max_length``.
    """
    if max_length is None:
        max_length = protein_data.length.max()  # 5900 in the recorded run
    my_array = np.asarray(my_array)
    missing = int(max_length) - my_array.shape[0]
    if missing < 0:
        raise ValueError(
            "array of length %d is longer than max_length=%d"
            % (my_array.shape[0], int(max_length))
        )
    # Pad only at the end of the first axis; leave every other axis untouched.
    pad_width = [(0, missing)] + [(0, 0)] * (my_array.ndim - 1)
    return np.pad(my_array, pad_width, mode='constant', constant_values=0)
    
# Divide positionally into 80 percent training and 20 percent test rows.
# Plain .iloc slicing gives the same two frames as np.split on the DataFrame,
# without going through the deprecated DataFrame.swapaxes.
split_at = int(.8 * len(protein_data))
train = protein_data.iloc[:split_at]
test = protein_data.iloc[split_at:]

# Remove GO labels and keywords from the test dataset.
test = test.drop(columns=['gene_ontology', 'keywords'])

(556511, 12)
5900
