In [1]:
import numpy as np
import random
import math
import collections

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

ImportError: /home/ubuntu/anaconda3/lib/python3.6/site-packages/torch/lib/libtorch.so.1: undefined symbol: nvrtcGetProgramLogSize

In [3]:
def load_data(data_path):
    """
    载入数据
    """
    data= []
    labels = []
    max_sentence_len = 0
    with open(data_path, 'r') as f:
        for line in f:
            line_list = line.split('\t')
            one_data = line_list[1].split(' ')
            tmp_len = len(one_data)
            if tmp_len > max_sentence_len:
                max_sentence_len = tmp_len
            data.append(one_data)
            labels.append(int(line_list[2]))
        f.close()
    print("max sentence length: ", max_sentence_len)
    return data, labels

data_path = '../data/seg_sample_train.txt'
data, labels = load_data(data_path)

max sentence length:  20420


In [4]:
from itertools import groupby

def show_text_len_distribution(data):
    len_list = [len(text) for text in data]
#     print(len_list[1:100])
    step = 500
    for k, g in groupby(sorted(len_list), key=lambda x: (x-1)//step):
    #    dic['{}-{}'.format(k*step+1, (k+1)*step)] = len(list(g))
        print('{}-{}'.format(k*step+1, (k+1)*step)+":"+str(len(list(g))))
show_text_len_distribution(data)

1-500:6015
501-1000:2406
1001-1500:647
1501-2000:303
2001-2500:206
2501-3000:125
3001-3500:70
3501-4000:55
4001-4500:37
4501-5000:33
5001-5500:19
5501-6000:17
6001-6500:9
6501-7000:7
7001-7500:5
7501-8000:6
8001-8500:7
8501-9000:4
9001-9500:7
9501-10000:5
10501-11000:2
11501-12000:1
12001-12500:2
12501-13000:3
14501-15000:3
15001-15500:1
16001-16500:1
17001-17500:1
17501-18000:2
20001-20500:1


In [5]:
def build_voabulary(data, vocabulary_size=50000):
    """
    基于所有数据构建词表
    """
    count = [['UNK', -1]]
    words = []
    for line in data:
        words.extend(line)
    for line in data:
        words.extend(line)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dict_word2index = dict()
    for word, _ in count:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(zip(dict_word2index.values(), dict_word2index.keys()))
    
    return  count, dict_word2index, dict_index2word

count, dict_word2index, dict_index2word = build_voabulary(data, vocabulary_size=100000)

In [6]:
def build_dataset(data, labels, dict_word2index, max_sentence_len=1000, label_size=8):
    """
    基于词表构建数据集（数值化）
    """
    dataset = []
    indices = np.arange(len(labels))
    np.random.shuffle(indices)
    new_labels = []
    for i in indices:
        new_labels.append(labels[i]-1) 
        new_line = []
        for word in data[i]:
            if word in dict_word2index:
                index = dict_word2index[word]
            else:
                index = 0    # UNK
            new_line.append(index)
        
        zero_num = max_sentence_len - len(new_line)
        while zero_num > 0:
            new_line.append(0)
            zero_num -= 1
        dataset.append(new_line[:max_sentence_len])
#     return dataset, new_labels
    return np.array(dataset, dtype=np.int64), np.array(new_labels, dtype=np.int64)

train_data, train_labels = build_dataset(data, labels, dict_word2index, max_sentence_len=1000)

In [7]:
print(train_labels[:10])

[4 2 6 6 0 4 2 6 2 3]


In [8]:
def split_data(data, radio=0.7):
    """
    将训练集分给为训练集和检验集
    """
    split_index = int(len(data) * 0.7)
    new_data1 = data[ : split_index]
    new_data2 = data[split_index : ]
    return new_data1, new_data2

train_X, valid_X = split_data(train_data)
train_y, valid_y = split_data(train_labels)
print(train_X.shape)
print(train_y.shape)

(7000, 1000)
(7000,)


In [9]:
from torch.utils import data

class MingLueData(data.Dataset):
    
    def __init__(self, X, y):
        self.len = X.shape[0]
        self.x_data = X
        self.y_data = y
    
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

batch_size = 4
num_workers = 2
dataset = MingLueData(train_X, train_y)
train_loader = data.DataLoader(dataset=dataset, 
                               batch_size=batch_size, 
                               shuffle=False,
                               num_workers=num_workers)
dataset = MingLueData(valid_X, valid_y)
valid_loader = data.DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)

In [10]:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_class):
        super(FastText, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_class = num_class
        
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size, 
                                      embedding_dim=self.embedding_size)
        self.linear = nn.Linear(in_features=self.embedding_size, 
                                out_features=self.num_class)
    
    def forward(self, text):
        embed = self.embedding(text)
        text_embed = torch.mean(embed, dim=1)
#         print(text_embed.size())
        text_embed = text_embed.view(-1, text_embed.size(2))
        logits = self.linear(text_embed)
        return logits    

vocab_size = 100000
embedding_size = 128
num_class = 8
fast_text = FastText(vocab_size=vocab_size, embedding_size=embedding_size,
                    num_class=num_class)
print(fast_text)

FastText (
  (embedding): Embedding(100000, 128)
  (linear): Linear (128 -> 8)
)


In [11]:
import torch.optim as optim

learning_rate = 0.001
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=fast_text.parameters(), lr=learning_rate)

In [12]:
epoch_num = 3
for epoch in range(epoch_num):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        texts, labels = data
#         print(labels.size())
        inputs, labels = Variable(texts), Variable(labels)
        optimizer.zero_grad()
        outputs = fast_text(inputs)
        loss = loss_fun(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.data[0]
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

[1,   100] loss: 1.964
[1,   200] loss: 1.912
[1,   300] loss: 1.881
[1,   400] loss: 1.881
[1,   500] loss: 1.882
[1,   600] loss: 1.820
[1,   700] loss: 1.886
[1,   800] loss: 1.826
[1,   900] loss: 1.810
[1,  1000] loss: 1.814
[1,  1100] loss: 1.852
[1,  1200] loss: 1.825
[1,  1300] loss: 1.789
[1,  1400] loss: 1.845
[1,  1500] loss: 1.809
[1,  1600] loss: 1.817
[1,  1700] loss: 1.775
[2,   100] loss: 1.808
[2,   200] loss: 1.788
[2,   300] loss: 1.774
[2,   400] loss: 1.804
[2,   500] loss: 1.814
[2,   600] loss: 1.734
[2,   700] loss: 1.817
[2,   800] loss: 1.737
[2,   900] loss: 1.733
[2,  1000] loss: 1.728
[2,  1100] loss: 1.750
[2,  1200] loss: 1.734
[2,  1300] loss: 1.690
[2,  1400] loss: 1.755
[2,  1500] loss: 1.726
[2,  1600] loss: 1.722
[2,  1700] loss: 1.681
[3,   100] loss: 1.719
[3,   200] loss: 1.689
[3,   300] loss: 1.670
[3,   400] loss: 1.718
[3,   500] loss: 1.719
[3,   600] loss: 1.622
[3,   700] loss: 1.720
[3,   800] loss: 1.638
[3,   900] loss: 1.643
[3,  1000] 

Process Process-6:
Process Process-5:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/llh/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 35, in _worker_loop
    r = index_queue.get()
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/home/llh/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/llh/anaco

KeyboardInterrupt: 

In [58]:
from collections import Counter


def micro_avg_f1(predict_label, true_label, num_class):
    N = len(predict_label)
    m = num_class
    w = Counter(true_label)
    print(w)
    score = 0
    for i in range(m):
        score += w[i] * f1(predict_label, true_label, i)

    return score / float(N)


def f1(predict_label, true_label, cur_label):
    true_pos, false_pos = 0, 0
    false_neg = 0
    for i in range(len(predict_label)):
        if predict_label[i] == cur_label:
            if true_label[i] == cur_label:
                true_pos += 1
            else:
                false_pos += 1
        else:  # predict_label != cur_label
            if true_label[i] == cur_label:
                false_neg += 1
    if true_pos == 0:
        precision, recall = 0, 0
    else:
        precision = true_pos / float(true_pos + false_pos)
        recall = true_pos / float(true_pos + false_neg)
    if precision == 0 or recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return f1

In [77]:
true_labels = []
predicted_labels = []

for data in valid_loader:
    texts, labels = data
    outputs = fast_text(Variable(texts))
    _, predicted = torch.max(outputs.data, 1)
    true_labels.extend(labels)
    predicted = [i[0] for i in predicted]
    predicted_labels.extend(predicted)

print(true_labels[:10])
print(predicted_labels[:10])
print("Micro-Averaged F1:",micro_avg_f1(predicted_labels, true_labels, num_class))

[0, 1, 0, 5, 6, 4, 1, 1, 5, 6]
[0, 0, 1, 5, 6, 2, 0, 2, 1, 6]
Counter({6: 714, 1: 553, 0: 461, 5: 461, 4: 352, 2: 303, 3: 144, 7: 12})
Micro-Averaged F1: 0.30705623764426043
