In [0]:
# Install PyTorch 0.3.0.post4 and torchvision for this runtime (see http://pytorch.org/).
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Build the wheel tag for this interpreter from its implementation, version and ABI tags.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Use the 'cu80' wheel directory when nvidia-smi exists at this path, otherwise 'cpu'.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision is not version-pinned;
# consider https and an explicit version.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
# Show which wheel variant was selected above ('cu80' or 'cpu').
accelerator

In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import re
from copy import deepcopy
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import PackedSequence,pack_padded_sequence
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [0]:
# Install the PyDrive client library; this only needs to run once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the user; the authorization prompt is only shown the first time.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# List every non-trashed file in the Drive root folder.
# Syntax of the "q" search query: https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s, mimeType: %s' % (file1['title'], file1['id'], file1["mimeType"]))

In [0]:
# Query pattern: '<folder id>' in parents -- lists the non-trashed files inside that folder.
file_list = drive.ListFile({'q': "'16zfnt8Cnuznzwj2RHGrxvyfyA6iQPRXx' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s, mimeType: %s' % (file1['title'], file1['id'], file1["mimeType"]))

In [0]:
# Reference the Drive file by id and save its content locally as data.csv.
file = drive.CreateFile({'id': "1py3JAFOLOH2lM-lOqYyP_LfqaImlOqL_"}) 
# This download only caches the file locally; it does not create an extra copy in your Google Drive folder.
file.GetContentFile('data.csv', "text/csv") 

In [0]:
# Load the tab-separated sentence-pair file, naming the four columns explicitly.
df = pd.read_csv(
    'data.csv',
    sep='\t',
    names=['number', "sen1", "sen2", "label"],
    skipinitialspace=True,
)

In [0]:
# Preview the first two rows of the loaded frame.
df.head(2)

In [0]:
def normalize_string(s):
    """Collapse every run of whitespace in `s` to one space and trim both ends."""
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with a single space gives the same result.
    return " ".join(s.split())

In [0]:
%%time

# Normalize whitespace in both sentence columns and pull the labels out as a plain list.
X1_r = list(map(normalize_string, df.sen1.tolist()))
X2_r = list(map(normalize_string, df.sen2.tolist()))
y_r = df.label.tolist()
# Sanity check: print the three list lengths, then the first example.
print(len(X1_r), len(X2_r), len(y_r))
print(X1_r[0], "@@@@", X2_r[0], "@@@@",y_r[0])

In [0]:
# Vocabulary of the distinct characters found in both sentence lists.
# Sorting makes the order -- and therefore every index assigned from it --
# identical across kernel restarts; iterating a bare set of strings is not
# order-stable between interpreter runs.
vocab = sorted(set(flatten(X1_r + X2_r)))
len(vocab)

In [0]:
# Reserve the first four indices for the special tokens, then number the
# vocabulary entries in order, skipping any that are already present.
source2index = {'<PAD>':0,'<UNK>':1,'<s>':2,'</s>':3}
for vo in vocab:
    source2index.setdefault(vo, len(source2index))
# Inverse lookup: index -> token.
index2source = dict((idx, token) for token, idx in source2index.items())

In [0]:
# Round-trip check: mapping "," to its index and back should display ",".
index2source[source2index[","]]

In [0]:
# Training configuration.
EPOCH = 50                     # number of passes over the training split
BATCH_SIZE = 64                # examples per mini-batch
EMBEDDING_SIZE = 128           # embedding width passed to the encoder
HIDDEN_SIZE = 256              # GRU hidden width passed to the encoder and classifier
LR = 0.01                      # initial Adam learning rate
DECODER_LEARNING_RATIO = 5.0   # not referenced by any visible cell
RESCHEDULED = False            # set to True once the learning rate has been decayed

In [0]:
# Pick the CPU or CUDA tensor constructors once, based on GPU availability.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [0]:
# Show whether CUDA was detected.
USE_CUDA

In [0]:
# Show which LongTensor class was selected.
LongTensor

In [0]:
def getBatch(batch_size,train_data):
    """Shuffle `train_data` in place and yield it as consecutive mini-batches.

    :param batch_size: maximum number of examples per batch; must be positive
    :param train_data: list of examples; NOTE: its order is shuffled in place
    :return: generator of list slices of `train_data`; the last slice may be
        shorter than `batch_size`, and nothing is yielded for an empty list
    :raises ValueError: if `batch_size` is not positive (raised when the
        generator is first advanced)
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    # Stepping the start offset by batch_size covers the trailing partial batch
    # too, and yields nothing at all when there is no data.
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

In [0]:
def prepare_sequence(seq, to_index):
    """Map every item of `seq` to its index in `to_index` and return a LongTensor.

    Items missing from `to_index` are mapped to the unknown-token index. The
    vocabulary built in this notebook registers that token as '<UNK>'; the
    lowercase '<unk>' key is still honoured for mappings that only define it.

    :param seq: iterable of tokens (iterating a string yields its characters)
    :param to_index: dict mapping token -> integer index
    :return: LongTensor holding one index per item of `seq`
    :raises KeyError: if an item is missing and `to_index` has no unknown-token entry
    """
    unk_key = '<UNK>' if '<UNK>' in to_index else '<unk>'
    idxs = [to_index[w] if w in to_index else to_index[unk_key] for w in seq]
    return LongTensor(idxs)

In [0]:
%%time
# Convert every sentence to an index tensor reshaped to (1, length) and pair it with its label.
X1_p, X2_p = [],[]
ta_p = []

for s1, s2, ta in zip(X1_r, X2_r, y_r):
    X1_p.append(prepare_sequence(s1,source2index).view(1,-1))
    X2_p.append(prepare_sequence(s2,source2index).view(1,-1))
    ta_p.append(ta)
    
data = list(zip(X1_p, X2_p, ta_p))
# NOTE(review): the shuffle is unseeded, so the split below differs between runs.
random.shuffle(data)
# The first 30000 examples are used for training; the remainder is held out
# (hard-coded split point).
train_data = data[:30000]
test_data = data[30000:]

print(train_data[0])

In [0]:
def pad_to_batch(batch,x_to_ix):
    """Pad a batch of sentence pairs to common widths and stack them.

    The batch is first ordered by decreasing first-sentence length. Each side
    is then right-padded with the '<PAD>' index to the longest sequence on
    that side.

    :param batch: non-empty sequence of (x1, x2, label) where x1 and x2 are
        index tensors of shape (1, length)
    :param x_to_ix: token -> index mapping; must contain '<PAD>'
    :return: (x1_var, x2_var, target_var, x1_len, x2_len) where the first two
        are (batch, max_len) Variables, target_var holds the labels, and the
        last two are lists with the unpadded length of every row
    """
    pad_idx = x_to_ix['<PAD>']

    def _pad_and_stack(rows):
        # Right-pad every (1, length) row to the widest row, then stack on dim 0.
        width = max(r.size(1) for r in rows)
        padded = []
        for r in rows:
            row = Variable(r)
            missing = width - r.size(1)
            if missing > 0:
                filler = Variable(LongTensor([pad_idx] * missing)).view(1, -1)
                row = torch.cat([row, filler], 1)
            padded.append(row)
        return torch.cat(padded, 0)

    sorted_batch = sorted(batch, key=lambda b: b[0].size(1), reverse=True)  # sort by len
    x1, x2, y = list(zip(*sorted_batch))

    x1_var = _pad_and_stack(x1)
    x2_var = _pad_and_stack(x2)
    target_var = Variable(LongTensor(y))

    # The true lengths are already known from the unpadded tensors, so there is
    # no need to recount non-pad entries (which also assumed the pad index is 0).
    x1_len = [s.size(1) for s in x1]
    x2_len = [s.size(1) for s in x2]

    return x1_var, x2_var, target_var, x1_len, x2_len

In [0]:
# Take one shuffled mini-batch to try the padding helper on.
batch = next(getBatch(BATCH_SIZE,train_data))

In [0]:
# Pad the sample batch; the result is (x1, x2, targets, x1 lengths, x2 lengths).
pbatch = pad_to_batch(batch,source2index)

In [0]:
# Inspect the padded first-sentence tensor.
pbatch[0]

In [0]:
class EncoderV(nn.Module):
    """Embedding + GRU encoder returning the top-layer final hidden state.

    Variable-length batches are handled by sorting on length, packing, running
    the GRU, and restoring the original batch order of the hidden state.
    """

    def __init__(self, input_size, embedding_size,hidden_size, n_layers=1,bidirec=False):
        """
        :param input_size: vocabulary size (number of embedding rows)
        :param embedding_size: width of each embedding vector
        :param hidden_size: GRU hidden width per direction
        :param n_layers: number of stacked GRU layers
        :param bidirec: run the GRU in both directions when truthy
        """
        super(EncoderV, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_direction = 2 if bidirec else 1

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                          batch_first=True, bidirectional=bool(bidirec))

    def init_hidden(self,inputs):  # inputs.size(0) = batch_size
        """Return a zero initial hidden state sized for the batch in `inputs`."""
        hidden = Variable(torch.zeros(self.n_layers*self.n_direction,inputs.size(0),self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden

    def init_weight(self):
        """Xavier-initialise the embedding and the two layer-0 GRU weight matrices.

        NOTE(review): only `weight_hh_l0` and `weight_ih_l0` are touched; any
        other GRU weights (further layers, reverse direction) keep their
        default initialisation -- confirm this is intended.
        """
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
        self.gru.weight_hh_l0 = nn.init.xavier_uniform(self.gru.weight_hh_l0)
        self.gru.weight_ih_l0 = nn.init.xavier_uniform(self.gru.weight_ih_l0)

    def forward(self, x, x_len):
        """
        sequence -> sort -> pad and pack -> process using RNN -> unsort hidden state

        :param x: Variable of token indices, shape (batch, max_len)
        :param x_len: list (or array) with the unpadded length of every row of x
        :return: (out_pack, ht) where out_pack is the packed GRU output, still in
            length-sorted order, and ht is the top-layer final hidden state in
            the original batch order with shape (batch, n_direction * hidden_size)
        """
        hidden = self.init_hidden(x)

        # Sort by decreasing length, as required for packing, and remember how
        # to undo the permutation afterwards.
        x_sort_idx = np.argsort(x_len)[::-1]
        x_unsort_idx = LongTensor(np.argsort(x_sort_idx))
        x_len = np.array(x_len)[x_sort_idx]
        x = x[LongTensor(x_sort_idx.copy())]  # copy(): the reversed view has negative strides

        embedded = self.embedding(x)

        # Pack so the GRU skips the padded positions.
        x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(embedded, x_len, batch_first=True)

        out_pack, ht = self.gru(x_emb_p, hidden)

        # Unsort h: (num_layers * num_directions, batch, hidden_size) is indexed
        # on its batch axis and then transposed back.
        ht = torch.transpose(ht, 0, 1)[x_unsort_idx]
        ht = torch.transpose(ht, 0, 1)

        # The last entries of ht belong to the top layer, whatever n_layers is.
        # (Previously both returns were nested under `n_layers > 1`, so a
        # single-layer encoder returned None.)
        if self.n_direction == 2:
            ht = ht[-2:]
            return out_pack, torch.cat((ht[0], ht[1]), 1)
        return out_pack, ht[-1]

     

In [0]:
class Model(nn.Module):
    """Sentence-pair classifier: encode both sentences, concatenate, classify.

    Both sentences are encoded with the same encoder (shared weights) and the
    two encodings are fed through a two-layer feed-forward head with 2 outputs.
    """

    def __init__(self, encoder1, encoder2, hidden_size):
        """
        :param encoder1: encoder module applied to both sentences; it must
            return (outputs, encoding) with a (batch, 2 * hidden_size) encoding
            so the concatenated pair matches the 4 * hidden_size input of fc1
        :param encoder2: accepted for interface compatibility; not used
        :param hidden_size: hidden width of the classifier head
        """
        super(Model, self).__init__()
        self.encoder1 = encoder1

        self.fc1 = nn.Linear(hidden_size*4, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 2)

    def init_weight(self):
        """Initialise the encoder weights; the linear layers keep their defaults."""
        self.encoder1.init_weight()
        #self.fc1.weight = nn.init.xavier_uniform(self.fc1.weight)
        #self.fc2.weight = nn.init.xavier_uniform(self.fc2.weight)

    def forward(self, sen1, sen2, sen1_lengths, sen2_lengths):
        """
        :param sen1: padded index batch for the first sentences
        :param sen2: padded index batch for the second sentences
        :param sen1_lengths: unpadded lengths of the rows of sen1
        :param sen2_lengths: unpadded lengths of the rows of sen2
        :return: class scores of shape (batch, 2)
        """
        # Use the encoder registered on this module rather than a notebook
        # global of the same name, so the model does not depend on kernel state.
        outputs_1, hidden_c1 = self.encoder1(sen1,sen1_lengths)
        outputs_2, hidden_c2 = self.encoder1(sen2,sen2_lengths)

        hidden = torch.cat((hidden_c1, hidden_c2), 1)  # batch * (width of both encodings)
        out = self.fc1(hidden)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [0]:
# Build the encoder over the full vocabulary with n_layers=2 and bidirec=True.
encoder1 = EncoderV(len(source2index),EMBEDDING_SIZE,HIDDEN_SIZE,2,True)
if USE_CUDA:
    encoder1 = encoder1.cuda()
    
# The same encoder instance is passed for both encoder arguments.
model = Model(encoder1, encoder1, HIDDEN_SIZE)
model.init_weight()

if USE_CUDA:
    model = model.cuda()

# Cross-entropy over the two output classes, optimised with Adam at the configured rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=LR)

In [0]:
# NOTE: there are no call parentheses, so this displays the bound method object
# rather than iterating over the parameters.
model.parameters

In [0]:
# Print the position and shape of every trainable parameter.
for i, p in enumerate(model.parameters()):
    if not p.requires_grad:
        continue
    print(i, ":", p.shape)

In [0]:
# Inspect the parameter at position 17 of model.parameters().
list(model.parameters())[17]

In [0]:
# Inspect the parameter at position 5 of model.parameters().
list(model.parameters())[5]

In [0]:
### Smoke test: run one full optimisation step on a single batch and print the tensor sizes and the loss.
### Note that this step does update the model weights.
for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
    
    # sen1 has shape (batch, longest first sentence in the batch)
    sen1, sen2, targets,sen1_lengths, sen2_lengths = pad_to_batch(batch,source2index)
    print("sen1 size: ", sen1.shape)
    #input_masks = torch.cat([Variable(ByteTensor(tuple(map(lambda s: s ==0, t.data)))) for t in sen1]).view(sen1.size(0),-1)
    model.zero_grad()
    optimizer.zero_grad()
    
    # Forward + Backward + Optimize
    outputs = model(sen1, sen2, sen1_lengths, sen2_lengths)
    print('outputs', outputs.size())
    print('targets', targets.size())
    loss = criterion(outputs, targets)
    print('loss', loss)
    loss.backward()
    optimizer.step()
    
    # Only the first batch is needed for this check.
    break

In [0]:
# Number of mini-batches per epoch. The batch generator also yields the final
# partial batch, so this has to round up; round() under-reported the total
# whenever the remainder was less than half a batch.
total_step = (len(train_data) + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(EPOCH):

    total_loss = []

    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        sen1, sen2, targets,sen1_lengths, sen2_lengths = pad_to_batch(batch,source2index)
        model.zero_grad()
        optimizer.zero_grad()

        # Forward + Backward + Optimize
        outputs = model(sen1, sen2, sen1_lengths, sen2_lengths)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss.append(loss.data[0])

        # Report progress every 20 steps.
        if (i+1) % 20 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, EPOCH, i+1, total_step, loss.data[0]))

    print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, EPOCH, np.mean(total_loss)))

    # One-off learning-rate decay halfway through training. This rebuilds the
    # optimizer (discarding its running statistics) and overwrites the global
    # LR, so re-running this cell starts from the decayed rate.
    if not RESCHEDULED and epoch == EPOCH//2:
        LR *= 0.01
        optimizer = optim.Adam(model.parameters(), lr=LR)
        RESCHEDULED = True

Epoch [13/50], Step [420/469], Loss: 0.5049
Epoch [13/50], Step [440/469], Loss: 0.5048


In [0]:
def infer(sens):
    """Predict the class index for one sentence pair with the global `model`.

    :param sens: pair (sen1, sen2) of index tensors; each is assumed to be
        shaped (1, length), since size(1) is passed as the sequence length
    :return: index of the highest-scoring class
    """
    outputs = model(sens[0], sens[1], [sens[0].size(1)], [sens[1].size(1)])
    # Copy the raw scores to host memory as a NumPy array before the argmax;
    # NumPy cannot read the autograd output (or a CUDA tensor) directly.
    scores = outputs.data.cpu().numpy()
    return np.argmax(scores)

In [0]:
# Evaluate on the held-out split. `test` was previously undefined here (both
# candidate assignments were commented out), so this cell raised NameError on
# a fresh kernel.
test = test_data

# Score in mini-batches so the whole split never has to fit in one forward pass.
predict_parts, target_parts = [], []
for batch in getBatch(BATCH_SIZE, test):
    if not batch:
        continue
    sen1, sen2, targets, sen1_lengths, sen2_lengths = pad_to_batch(batch, source2index)
    outputs = model(sen1, sen2, sen1_lengths, sen2_lengths)
    # Argmax over the class dimension, copied to host memory so this also
    # works when the model runs on CUDA.
    predict_parts.append(outputs.data.max(1)[1].cpu().numpy().reshape(-1))
    target_parts.append(targets.data.cpu().numpy().reshape(-1))

predict = np.concatenate(predict_parts)
targets = np.concatenate(target_parts)
print("Predict:", predict)
print("Truth:", targets)
print("Correct: ", np.sum(predict == targets))


In [0]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score


# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round transposes the matrix and swaps precision with recall.
print("confusion_matrix: \n", confusion_matrix(targets, predict))
print(precision_score(targets, predict))
print(recall_score(targets, predict))
print(f1_score(targets, predict))

## Confusion-matrix layout for binary labels (rows = truth, columns = prediction):
## tn fp
## fn tp