In [1]:
import numpy as np
import pandas as pd
import re

In [1]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7839835278651976814
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 3239362560
locality {
  bus_id: 1
}
incarnation: 1407110988627921285
physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0"
]


In [1]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13784806051234217679
]


## Initial Data Import

In [2]:
train_variant = pd.read_csv("training_variants.txt")
test_variant = pd.read_csv("test_variants.txt")
train_text = pd.read_csv("training_text.txt", sep="\|\|", engine='python', header=None, skiprows=1, 
                         names=["ID","Text"])
test_text = pd.read_csv("test_text.txt", sep="\|\|", engine='python', header=None, skiprows=1, 
                        names=["ID","Text"])

train = pd.merge(train_variant, train_text, how='left', on='ID')
train_y = train['Class'].values
train_x = train.drop('Class', axis=1)

test_x = pd.merge(test_variant, test_text, how='left', on='ID')

all_data = pd.DataFrame(np.concatenate((train_x, test_x), axis=0))
all_data.columns = ["ID", "Gene", "Variation", "Text"]

In [78]:
len(set(all_data.Gene))

1507

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#from nltk.stem import porter
from gensim.models.doc2vec import TaggedDocument
from gensim import utils

stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
#ps = PorterStemmer()

def split_stop_stem(text):
    text = tokenizer.tokenize(text)
    text = [w for w in text if not w in stops]
    #text = list(map(lambda x: ps.stem(x), text))
    return text

def tagDocs(text):
    sentences=[]
    for index, row in text.iteritems():
        sentences.append(TaggedDocument(row, ['Text' + '_%s' % str(index)]))
    return sentences


Using TensorFlow backend.


In [None]:
words = all_data['Text'].apply(split_stop_stem)

In [None]:
words = tagDocs(words)

In [None]:
words[0]

In [3]:
import os
from gensim.models import Doc2Vec

Text_INPUT_DIM=500

text_model=None
filename='doc2vecv1'
if os.path.isfile(filename):
    text_model = Doc2Vec.load(filename)
else:
    text_model = Doc2Vec(min_count=1, window=5, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=4, iter=5,seed=1)
    text_model.build_vocab(tagged_words)
    text_model.train(tagged_words, total_examples=text_model.corpus_count, epochs=text_model.iter)
    text_model.save(filename)

Using TensorFlow backend.


In [79]:
text_model.create_binary_tree()

In [4]:
train_size=len(train_x)
test_size=len(test_x)

text_train_arrays = np.zeros((train_size, Text_INPUT_DIM))
text_test_arrays = np.zeros((test_size, Text_INPUT_DIM))

for i in range(train_size):
    text_train_arrays[i] = text_model.docvecs['Text_'+str(i)]

j=0
for i in range(train_size,train_size+test_size):
    text_test_arrays[j] = text_model.docvecs['Text_'+str(i)]
    j=j+1
    
print(text_train_arrays[0][:50])

[-0.17183802  0.09460621  0.37183738 -0.22421546  0.10470325 -0.32804403
 -0.15180039  0.65924126 -0.45024005 -0.5853194  -0.1135942   0.41056389
 -0.0533164  -0.33100063  0.17151758 -0.39235571 -0.27376065  0.20376949
  0.45964777  0.14374289  0.23027714 -0.31553769 -0.56025499 -0.43051165
 -0.11980943 -0.24646237  0.28339759 -0.37458807 -0.71096194  0.29309732
 -0.19722864  1.16739643  0.05945602  0.55526525 -0.53061008 -0.05811989
 -0.78908908 -0.21285698  0.0707021   0.03890851 -0.18036473 -0.04776842
  0.55835503  0.20685136  0.37613997  0.41337186  0.29474714 -0.06769666
 -0.13391761  0.08491859]


In [50]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(train_y)
encoded_y = np_utils.to_categorical((label_encoder.transform(train_y)))
encoded_y

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import SGD

def baseline_model():
    model = Sequential()
    model.add(Dense(512, input_dim=Text_INPUT_DIM, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(9, kernel_initializer='normal', activation="softmax"))
    
    #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)  
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = baseline_model()

In [69]:
estimator=model.fit(text_train_arrays, encoded_y, validation_split=0.2, epochs=10, batch_size=64)

Train on 2656 samples, validate on 665 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [75]:
print("Training accuracy: %.2f%% / Validation accuracy: %.2f%%" % \
      (100*estimator.history['acc'][-1], 100*estimator.history['val_acc'][-1]))

Training accuracy: 81.10% / Validation accuracy: 35.34%


In [67]:
y_pred = model.predict_proba(text_test_arrays)



In [31]:
test_index = test_x['ID'].values

submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_all.csv",index=False)

## Gene and Variation Featurization

In [87]:
np.where(all_data['Variation'].upper() == all_data['Variation'], all_data['Variation'][-1:], 0)

AttributeError: 'Series' object has no attribute 'upper'

In [88]:
def check_var(text):
    if text.upper() == 

Truncating Mutations
W802*
Q249E
N454D
L399V
V391I
V430M
Deletion
Y371H
C384R
P395A
K382E
R420Q
C381A
P428L
D390Y
Truncating Mutations
Q367P
M374V
Y371S
H94Y
C396R
G375P
S376F
P417A
H398Y
S2G
Y846C
C228T
H412Y
H876Q
Promoter Mutations
P704S
Amplification
C250T
G1809R
G1809K
D1709E
D1709A
E1705A
D1810A
Truncating Mutations
E1705K
Deletion
T1365M
V648G
T844M
A707T
Promoter Hypermethylation
R1343L
A209T
Y280H
D927G
N510K
F248S
L708P
V995M
Y412F
F74S
R1040L
R453C
R1209W
A1022E
Q984K
T605M
K218T
N1125I
Y35C
Y35N
Y35H
Deletion
Amplification
L234fs
L1273F
Deletion
Amplification
P284L
Q276*
T286A
T283A
I290A
I290R
A77S
A77P
A77T
M90I
Amplification
Overexpression
Truncating Mutations
N45S
R487Q
C41Y
G52R
P83L
S241L
S387Y
TGFBR1*6A
R537P
D404G
P525L
Truncating Mutations
Deletion
R922*
R976H
E1322*
T1219I
G39E
Deletion
R1076C
R1095H
Truncating Mutations
V509A
E946*
F1088Lfs*5
F1088Sfs*2
Deletion
Truncating Mutations
Truncating Mutations
Truncating Mutations
Deletion
N1333Gfs*
K700R
R625C
E622D
H6

T58I
Q61K
G13V
Q61R
G60E
G13D
G12V
Q61H
T50I
G12D
Q61L
L481F
Y472H
Truncating Mutations
Deletion
Truncating Mutations
Deletion
L607I
R698W
R661W
W563L
C712R
N480del
S567L
C706F
Truncating Mutations
Deletion
Amplification
TPR-NTRK1 Fusion
V710A
R342W
Fusions
BCAN-NTRK1 Fusion
NFASC-NTRK1 Fusion
TRKAIII Splice Variant
R342Q
Delta-NTRK1 Fusion
CD74-NTRK1 Fusion
TPM3-NTRK1 Fusion
CHTOP-NTRK1 Fusion
Amplification
TFG-NTRK1 Fusion
LMNA-NTRK1 Fusion
MPRIP-NTRK1 Fusion
M713I
Fusions
R715G
ETV6-NTRK3 Fusion
BTBD1-NTRK3 Fusion
Fusions
CPEB1-NTRK3 Fusion
G623R
Deletion
L239R
T654M
G253C
L63V
G505S
G774V
I638F
Amplification
T654I
Fusions
NSD1-NUP98 Fusion
G325A
Truncating Mutations
Deletion
CASP8L
C248T
Promoter Hypermethylation
RET-CCDC6 Fusion
A883T
C609Y
T338I
C634R
S891A
E632_L633del
C618R
C611Y
K603Q
D631G
E921K
Fusions
S904F
L790F
R873Q
Y806C
S765P
V804L
R897Q
D631A
A883F
C634S
A919V
C634W
M980T
E768D
KIF5B-RET Fusion
R833C
V804M
M918V
C630R
V648I
I852M
C620R
C634Y
V804G
R886W
F893L
Y791F
R1

R354P
D211G
I268T
F2120L
W2065R
M407V
M453T
F458V
R286C
G363V
R1443G
P217A
W960R
G247R
V752M
RABEP1-PDGFRB Fusion
R849W
L397P
S141R
P284R
E167D
N641Y
G322D
L467P
N913S
Y44C
F647S
S110L
A20P
R237W
E402Q
V66G
W36R
E132D
N47H
I221T
V2050L
R182H
K18E
R124H
Y491C
L233P
I222N
E608D
Y277D
P63H
T59A
R277W
I477S
R243H
Q230K
L539S
T303R
R98W
K1103R
L389S
D1595N
R386P
I198T
S487P
R76Q
T339M
P569R
L329R
G259R
G885R
G446E
T411I
R84C
A280V
G302R
I445M
R53C
A484E
I322M
R161P
R569H
H119L
R66C
L111P
R280C
C236F
Y133C
L152M
V54L
G327R
Y37C
Q660E
S106P
D86G
R895C
I344T
R283W
M539I
V572L
C177Y
G162D
C118R
M152T
C561G
L48P
P549S
E390G
R2310K
S2138Y
L79R
N229S
R337Q
F1286S
K89R
G40R
R1341Q
D351G
C1149R
R1042W
P388T
V235A
L171R
E476K
Y276C
R92W
A328P
P317R
D374Y
H1464P
N314D
D444N
F252S
G569R
R263L
I182T
G330D
R196W
P90L
Y134H
G81A
V239M
E251G
A603fs
C234Y
A153V
H310Q
G95R
V235F
Y222D
R348C
C109S
G286S
L353P
F167I
D337V
G139V
Q6H
E72D
I1233N
Y71H
A140P
L131P
G375A
F60L
V102I
Y495C
A110V
C291Y
D868A
C253S
Y62