In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly
plotly.tools.set_credentials_file(username='quoniammm', api_key='IF7kV6idFRdoo7LdgGRp')
import plotly.plotly as py
import plotly.graph_objs as go
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

import nltk

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

%matplotlib inline

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

### Hierarchical Attention Networks for Document Classification

In [10]:
# bag_words
all_words = set(train['text'].str.split(expand=True).unstack())

def wordandindex(vocab):
    return {word: i + 3 for i, word in enumerate(vocab)}, {i + 3: word for i, word in enumerate(vocab)}

word2index, index2word = wordandindex(all_words)

label_encoder = LabelEncoder()
# 数据集准备
X = np.array(train.text.apply(lambda sen: [word2index[word] for word in sen.split(' ')]))
y = np.array(label_encoder.fit_transform(train.author))
assert len(X) == len(y)
print(len(all_words))
#print(len(train))
print("test length: {}".format(len(test)))
# 句子填充
X_pad = np.zeros((19579, 861))

for i in range(X_pad.shape[0]):
    for j in range(len(X[i])):
        X_pad[i, j] = X[i][j]

xtrain, xvalid, ytrain, yvalid = train_test_split(
    X_pad, y, 
    stratify=y, 
    random_state=42, 
    test_size=0.1, 
    shuffle=True
)
print("train length: {}".format(len(xtrain)))
#print(xtrain.type)
print("valid length: {}".format(len(xvalid)))
#print(xvalid.type)

# 最长句子长度设置为 input_size
max = 0
for i, x in enumerate(X):
    # print(len(x))
    if len(x) > max:
        max = len(x)
        
print(max)
# train.iloc[9215].values

epochs = 1
lr = 1e-4

47557
test length: 8392
train length: 17621
valid length: 1958
861


In [7]:
# Functions to accomplish attention
def batch_matmul_bias(seq, weight, bias):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight) 
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0,1)
        _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if(s is None):
            s = _s_bias
        else:
            s = torch.cat((s,_s_bias),0)
    return s

def batch_matmul(seq, weight):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        _s = _s.unsqueeze(0)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)
    return s.squeeze()

def attention_mul(rnn_outputs, att_weights):
    attn_vectors = None
    for i in range(rnn_outputs.size(0)):
        h_i = rnn_outputs[i]
        a_i = att_weights[i].unsqueeze(1).expand_as(h_i)
        h_i = a_i * h_i
        h_i = h_i.unsqueeze(0)
        if(attn_vectors is None):
            attn_vectors = h_i
        else:
            attn_vectors = torch.cat((attn_vectors,h_i),0)
    return torch.sum(attn_vectors, 0)

In [8]:
# Hierarchical Attention Networks for Document Classification
class AttentionWordRNN(nn.Module):
    def __init__(self, batch_size, num_tokens, embed_size, word_gru_hidden, n_classes):
        super(AttentionWordRNN, self).__init__()
        
        self.num_tokens = num_tokens
        self.embed_size = embed_size
        self.word_gru_hidden = word_gru_hidden
        self.batch_size = batch_size
        
        # (N, W) => (N, W, embed_size)
        self.lookup = nn.Embedding(num_tokens, embed_size)
        # (seq_len, batch, input_size) + (num_layers * num_directions, batch, hidden_size)
        # => (seq_len, batch, hidden_size * num_directions) + (num_layers * num_directions, batch, hidden_size)
        self.word_gru = nn.GRU(embed_size, word_gru_hidden, 2, bidirectional= True)
        self.softmax_word = nn.Softmax(dim=1)
        self.final_softmax = nn.LogSoftmax(dim=1)
        # ???
        self.final_linear = nn.Linear(2*word_gru_hidden, n_classes)
        
        self.weight_W_word = nn.Parameter(torch.Tensor(2*word_gru_hidden, 2*word_gru_hidden))
        self.bias_word = nn.Parameter(torch.Tensor(2*word_gru_hidden,1))
        self.weight_proj_word = nn.Parameter(torch.Tensor(2*word_gru_hidden, 1))
        
        self.weight_W_word.data.uniform_(-0.1, 0.1)
        self.weight_proj_word.data.uniform_(-0.1,0.1)
    
    def forward(self, x, state_word):
        # embeddings
        embedded = self.lookup(x)
        embedded_resize = embedded.view(-1, self.batch_size, self.embed_size)
        # word level gru
        output_word, state_word = self.word_gru(embedded_resize, state_word)
        # print(output_word.size())
        word_hidden = batch_matmul_bias(output_word, self.weight_W_word, self.bias_word)
        # print(word_hidden.size())
        word_similarity = batch_matmul(word_hidden, self.weight_proj_word)
        # print(word_similarity.size())
        word_weights = self.softmax_word(word_similarity.transpose(1, 0))
        # print(word_weights)
        sen_vector = attention_mul(output_word, word_weights.transpose(1, 0))
        # print(sen_vector)
        final_linear = self.final_linear(sen_vector)
        out = self.final_softmax(final_linear)
        
        return F.softmax(final_linear, dim=1), out, state_word
    
    def init_hidden(self):
        return Variable(torch.zeros(4, self.batch_size, self.word_gru_hidden)).cuda()

In [9]:
# 训练
model = AttentionWordRNN(16, 47560, 256, 128, 3)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.NLLLoss()
model.cuda()

In [None]:
epochs = 1
# 训练模型
for epoch in range(epochs):
    vx = Variable(torch.LongTensor(xtrain.astype(int))).cuda()
    vy = Variable(torch.LongTensor(ytrain)).cuda()
    optimizer.zero_grad()
    for i in range(0, len(xtrain), 16):
        if i + 16 > len(xtrain):
            vx_batch = vx[-17:-1]
            vy_batch = vy[-17:-1]
        else:
            vx_batch = vx[i:i+16]
            vy_batch = vy[i:i+16]
            
        hidden = model.init_hidden()
        results, outputs, _ = model(vx_batch, hidden)
        loss = criterion(outputs, vy_batch)
        loss.backward()
        optimizer.step()
        
        _, res = results.data.max(1)
        print("after {}% training loss is {}".format(float(i) / len(xtrain), loss.data[0]))
        #print((torch.sum(res == vy_batch.data) + 0.0) / 16.0)