In [2]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
#data processing
path = 'data_rct/'
df_train_txt = pd.read_csv(path + 'training_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"])
df_train_var = pd.read_csv(path + 'training_variants')
df_test_txt = pd.read_csv(path + 'stage2_test_text.csv', sep='\|\|', header=None, skiprows=1, names=["ID","Text"])
df_test_var = pd.read_csv(path + 'stage2_test_variants.csv')
df_train = pd.merge(df_train_var, df_train_txt, how='left', on='ID')
df_test = pd.merge(df_test_var, df_test_txt, how='left', on='ID')
col = ['ID', 'Gene', 'Variation', 'Text', 'Class']
df_train = df_train.loc[:, col]
df_train.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  """


Unnamed: 0,ID,Gene,Variation,Text,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,4


In [5]:
title = df_train['Gene'].values + ' ' + df_train['Variation'].values
title = title.reshape(len(title), 1).tolist()

In [6]:
content = df_train['Text'].values
content = content.reshape(len(content), 1).tolist()

In [7]:
cla = df_train['Class'].values
cla = cla.reshape(len(cla), 1).tolist()

In [8]:
title_t = df_test_var['Gene'].values + ' ' + df_test_var['Variation'].values
title_t = title_t.reshape(len(title_t), 1).tolist()
content_t = df_test_txt['Text'].values
content_t = content_t.reshape(len(content_t), 1).tolist()

In [9]:
class Sentence:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 2 # Count SOS and EOS
      
    def index_words(self, sentence):
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
tit = Sentence('title')
con = Sentence('content')
for t, c in zip (title, content):
    c = ''.join(c)
#     print(c)
    t = ''.join(t)
#     print(t)
    tit.index_words(t)
    con.index_words(c)
    
for t, c in zip (title_t, content_t):
    c = ''.join(c)
#     print(c)
    t = ''.join(t)
#     print(t)
    tit.index_words(t)
    con.index_words(c)

print('tit: {}'.format(tit.n_words))
print('con: {}'.format(con.n_words))

tit: 4344
con: 548088


In [11]:
def indexes_from_sentence(lang, sentence):
    
    return [lang.word2index[word] for word in sentence.split(' ')]

def variable_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    var = Variable(torch.LongTensor(indexes).view(-1, 1)).cuda()
#     if USE_CUDA: var = var.cuda()
    return var

def variables_from_pair(t, c):
    title = variable_from_sentence(tit, t)
    content = variable_from_sentence(con, c)
    return (title, content)

In [12]:
class CNNText(nn.Module): 
    def __init__(self):
        super(CNNText, self).__init__()
        self.encoder_tit = nn.Embedding(4344, 64)
        self.encoder_con = nn.Embedding(548088, 256)
        self.test = nn.Parameter(torch.LongTensor(0))
        
        self.title_conv_1 = nn.Sequential(
            nn.Conv1d(in_channels = 64,
                      out_channels = 1,
                      kernel_size = 1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
        )
        
        self.title_conv_2 = nn.Sequential(
            nn.Conv1d(in_channels = 64,
                      out_channels = 1,
                      kernel_size = 2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=1),
        )

        self.content_conv_3 = nn.Sequential(
            nn.Conv1d(in_channels = 256,
                      out_channels = 1,
                      kernel_size = 3),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 50)
        )
        
        self.content_conv_4 = nn.Sequential(
            nn.Conv1d(in_channels = 256,
                      out_channels = 1,
                      kernel_size = 4),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 50)
        )
            
        self.content_conv_5 = nn.Sequential(
            nn.Conv1d(in_channels = 256,
                      out_channels = 1,
                      kernel_size = 5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 50)
        )
        
        
            
        self.fc = nn.Linear(32, 9)

    def forward(self, title, content):
        title = self.encoder_tit(title)
        title = title.permute(0,2,1)
#         print(title.size())
#         print(title.unsqueeze(1).size())
#         title_rehape = torch.index_select(title, 2, Variable(torch.LongTensor([0,2,1])))
#         print("title_reshape size is {}".format(title_rehape.size()))
        title_out_1 = self.title_conv_1(title)
#         print(title_out_1.size())
        title_out_2 = self.title_conv_2(title)
#         print(title_out_2.size())
        
        content = self.encoder_con(content)
        content = content.permute(0,2,1)
#         self.test = torch.LongTensor(50)
#         content_reshape = torch.index_select(content, 2, Variable(torch.LongTensor([0,2,1])))
#         print(content.size())
        content_out_3 = self.content_conv_3(content)
#         print(content_out_3.size())
        
        content_out_4 = self.content_conv_4(content)
#         print(content_out_4.size())
        
        content_out_5 = self.content_conv_5(content)
#         print(content_out_5.size())
        
            
        conv_out = torch.cat((
            title_out_1,
            title_out_2,
            content_out_3,
            content_out_4,
            content_out_5
        ),dim=2)
        
#         print(conv_out.size())
        if conv_out.size()[2] < 32:
            fix = Variable(torch.randn(conv_out.size()[0], conv_out.size()[1], 32)).cuda()
#             print(type(fix))
#             print(type(conv_out))
            conv_out = torch.cat((conv_out, fix),dim=2)
#         print(conv_out.size())
        conv_out, _ = conv_out.topk(32, dim=2)
#         print(conv_out.size())
#         kernel_size = conv_out.size()[2] - 255
#         print(kernel_size)
#         print(conv_out.size())
#         conv_out = torch.cat((content_out_3,content_out_4,content_out_5),dim=1)
#         print(conv_out.squeeze(0).size())
        logits = self.fc(conv_out.squeeze(0))
#         print(logits.size())
#         print(logits.size())
#         print(logits)
#         print(F.softmax(logits.squeeze(0)))
#         print(F.log_softmax(logits.squeeze(0)))
        return F.log_softmax(logits.squeeze(0))


In [13]:
cnnt = CNNText()
cnnt.cuda()
optimizer = optim.Adam(cnnt.parameters(), lr=.001)
Loss = nn.NLLLoss()

for epoch in range(60):
    loss = 0
    
    t = ''.join(title[epoch])
    c = ''.join(content[epoch])
    T, C = variables_from_pair(t, c)
    
    T = T.squeeze(1).unsqueeze(0)
    C = C.squeeze(1).unsqueeze(0)
    optimizer.zero_grad()
    
    out = cnnt(T, C)
#     print(out)
#     print(torch.LongTensor(cla[epoch]))
    target = Variable(torch.LongTensor(cla[epoch])).cuda()
    
    print(target.size())
#     print(out)
#     print(target)
    loss += Loss(out, target)
    
    loss.backward()
    optimizer.step()
    
    if not epoch % 10:
        print("Loss is {} at {} epoch".format(loss, epoch))

torch.Size([1])
Loss is Variable containing:
 1.7140
[torch.cuda.FloatTensor of size 1 (GPU 0)]
 at 0 epoch
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
Loss is Variable containing:
 2.6773
[torch.cuda.FloatTensor of size 1 (GPU 0)]
 at 10 epoch
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
Loss is Variable containing:
 2.3251
[torch.cuda.FloatTensor of size 1 (GPU 0)]
 at 20 epoch
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
Loss is Variable containing:
 1.4495
[torch.cuda.FloatTensor of size 1 (GPU 0)]
 at 30 epoch
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch.Size([1])
torch

In [14]:
title_t = df_test_var['Gene'].values + ' ' + df_test_var['Variation'].values
title_t = title_t.reshape(len(title_t), 1).tolist()
content_t = df_test_txt['Text'].values
content_t = content_t.reshape(len(content_t), 1).tolist()

In [47]:
for epoch in range(986):
    loss = 0
    
    t = ''.join(title_t[epoch])
    c = ''.join(content_t[epoch])
    T, C = variables_from_pair(t, c)
    
    T = T.squeeze(1).unsqueeze(0)
    C = C.squeeze(1).unsqueeze(0)
    
#     print(C.size())
#     print(C.size())
    
    out = cnnt(T, C)
    
    val, k = out.max(0)
    k = k.data[0]
    
    if epoch == 0:
        res = np.eye(9)[:, k-1]
#         print(res)
        
    else:
        k_c = np.eye(9)[:, k-1]
        res = np.vstack((res, k_c))
#         print(res)
        
        
print(res)
    
    
#     print(out)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [52]:
submission_df = pd.DataFrame(res.astype(int), columns=['class'+str(c+1) for c in range(9)])
submission = pd.concat([df_test['ID'], submission_df], axis=1)
submission

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,1,0,0,0,1,0,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0
5,6,0,0,0,1,0,0,0,0,0
6,7,0,0,0,1,0,0,0,0,0
7,8,0,0,0,1,0,0,0,0,0
8,9,0,0,0,1,0,0,0,0,0
9,10,0,0,0,1,0,0,0,0,0


In [55]:
submission.to_csv('submission2.csv', index=False)

In [54]:
submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,1,0,0,0,1,0,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0
