In [4]:
import numpy as np
import pandas as pd
import re

## Initial Data Import

In [5]:
train_variant = pd.read_csv("training_variants.txt")
test_variant = pd.read_csv("test_variants.txt")
train_text = pd.read_csv("training_text.txt", sep="\|\|", engine='python', header=None, skiprows=1, 
                         names=["ID","Text"])
test_text = pd.read_csv("test_text.txt", sep="\|\|", engine='python', header=None, skiprows=1, 
                        names=["ID","Text"])

train = pd.merge(train_variant, train_text, how='left', on='ID')
train_y = train['Class'].values
train_x = train.drop('Class', axis=1)

test_x = pd.merge(test_variant, test_text, how='left', on='ID')

all_data = pd.DataFrame(np.concatenate((train_x, test_x), axis=0))
all_data.columns = ["ID", "Gene", "Variation", "Text"]

## Corpus Tokenization and Vectorization

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#from nltk.stem import porter
from gensim.models.doc2vec import TaggedDocument
from gensim import utils

stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
#ps = PorterStemmer()

def split_stop_stem(text):
    text = tokenizer.tokenize(text)
    text = [w for w in text if not w in stops]
    #text = list(map(lambda x: ps.stem(x), text))
    return text

def tagDocs(text):
    sentences=[]
    for index, row in text.iteritems():
        sentences.append(TaggedDocument(row, ['Text' + '_%s' % str(index)]))
    return sentences


In [7]:
words = all_data['Text'].apply(split_stop_stem)

In [8]:
words = tagDocs(words)

In [9]:
words[0]

TaggedDocument(words=['Cyclin', 'dependent', 'kinases', 'CDKs', 'regulate', 'variety', 'fundamental', 'cellular', 'processes', 'CDK10', 'stands', 'one', 'last', 'orphan', 'CDKs', 'activating', 'cyclin', 'identified', 'kinase', 'activity', 'revealed', 'Previous', 'work', 'shown', 'CDK10', 'silencing', 'increases', 'ETS2', 'v', 'ets', 'erythroblastosis', 'virus', 'E26', 'oncogene', 'homolog', '2', 'driven', 'activation', 'MAPK', 'pathway', 'confers', 'tamoxifen', 'resistance', 'breast', 'cancer', 'cells', 'The', 'precise', 'mechanisms', 'CDK10', 'modulates', 'ETS2', 'activity', 'generally', 'functions', 'CDK10', 'remain', 'elusive', 'Here', 'demonstrate', 'CDK10', 'cyclin', 'dependent', 'kinase', 'identifying', 'cyclin', 'M', 'activating', 'cyclin', 'Cyclin', 'M', 'orphan', 'cyclin', 'product', 'FAM58A', 'whose', 'mutations', 'cause', 'STAR', 'syndrome', 'human', 'developmental', 'anomaly', 'whose', 'features', 'include', 'toe', 'syndactyly', 'telecanthus', 'anogenital', 'renal', 'malfor

In [None]:
%%time
import os
from gensim.models import Doc2Vec
import multiprocessing
import Cython

Text_INPUT_DIM=300

text_model=None
filename='doc2vecNoStemSkipGram'
#if os.path.isfile(filename):
text_model = Doc2Vec.load(filename)
#else:
    #text_model = Doc2Vec(min_count=1, window=6, size=Text_INPUT_DIM, sample=1e-5, negative=5,
    #                     workers=7, dbow_words=1, iter=5,seed=1)
    #text_model.build_vocab(words)
    #text_model.train(words, total_examples=text_model.corpus_count, epochs=text_model.iter)
    #text_model.save(filename)

## Gene and Variation Featurization

In [None]:
def orig_amino(text):
    if text.upper() == text:
        return text[0]
    else:
        return 0

def mutated_amino(text):
    if text.upper() == text:
        return text[-1]
    else:
        return 0

In [None]:
all_data['Original_Amino'] = all_data['Variation'].apply(orig_amino)
all_data.Original_Amino.head()

In [None]:
all_data['Mutated_Amino'] = all_data['Variation'].apply(mutated_amino)
all_data.Mutated_Amino.head()

In [None]:
all_data

In [None]:
amino = pd.read_csv('Amino_Acids.csv')
amino

In [None]:
aminoFeatures = pd.merge(all_data[['ID', 'Original_Amino', 'Mutated_Amino']],
         amino[['Letter', 'Isoelectric_P', 'Hydrophobicity']],
         how = 'left', left_on= 'Original_Amino', right_on = 'Letter')

aminoFeatures = aminoFeatures.rename(columns = {'Isoelectric_P': 'orig_elec', 'Hydrophobicity': 'orig_hydro'})

aminoFeatures = pd.merge(aminoFeatures,amino[['Letter', 'Isoelectric_P', 'Hydrophobicity']],
         how = 'left', left_on= 'Mutated_Amino', right_on = 'Letter')

In [None]:
aminoFeatures['Elec_Diff'] = abs(aminoFeatures['orig_elec'].subtract(aminoFeatures['Isoelectric_P']))
aminoFeatures['Hydro_Diff'] = abs(aminoFeatures['orig_hydro'].subtract(aminoFeatures['Hydrophobicity']))
aminoFeatures = aminoFeatures.drop(['Original_Amino', 'Mutated_Amino', 'orig_elec', 'orig_hydro', 
                                    'Isoelectric_P','Hydrophobicity', 'ID'], axis=1).fillna(0)\
            .rename(columns = {'Letter_x': 'Orig_Amino', 'Letter_y': 'Mut_Amino'})
                            

In [None]:
aminoFeatures.head()

In [None]:
aminoFeatures = pd.get_dummies(aminoFeatures)
aminoFeatures.head()

## Joining Features and Doc2Vec Arrays

In [None]:
train_size=len(train_x)
test_size=len(test_x)

text_train_arrays = np.zeros((train_size, Text_INPUT_DIM))
text_test_arrays = np.zeros((test_size, Text_INPUT_DIM))

for i in range(train_size):
    text_train_arrays[i] = text_model.docvecs['Text_'+str(i)]

j=0
for i in range(train_size,train_size+test_size):
    text_test_arrays[j] = text_model.docvecs['Text_'+str(i)]
    j=j+1
    
text_train_arrays.shape

In [None]:
train_set=np.hstack((aminoFeatures[:train_size], text_train_arrays))
test_set=np.hstack((aminoFeatures[train_size:], text_test_arrays))

In [None]:
train_set.shape

In [None]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(train_y)
encoded_y = np_utils.to_categorical((label_encoder.transform(train_y)))
encoded_y

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector

def baseline_model():
    model = Sequential()
    model.add(Dense(512, input_dim=train_set.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, kernel_initializer='normal', activation='relu'))
    model.add(Dense(9, kernel_initializer='normal', activation="softmax"))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = baseline_model()

In [None]:
estimator=model.fit(train_set, encoded_y, validation_split=0.2, epochs=10, batch_size=64)

In [None]:
print("Training accuracy: %.2f%% / Validation accuracy: %.2f%%" % \
      (100*estimator.history['acc'][-1], 100*estimator.history['val_acc'][-1]))

In [65]:
y_pred = model.predict_proba(text_test_arrays)



In [66]:
test_index = test_x['ID'].values

submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_all.csv",index=False)