In [12]:
import numpy as np
import random
import math
import collections
import gensim
from gensim.models import Word2Vec

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re

In [32]:
class Config(object):
    """Hyper-parameters shared by the data-preparation cells."""

    # Upper bound on the number of sentences kept per document.
    max_sentence_num = 500
    # Upper bound on the number of tokens kept per sentence.
    max_sentence_len = 100
    # Size passed to the vocabulary builder.
    vocab_size = 100000


# Single shared configuration instance used by the cells below.
config = Config()

In [25]:
def load_data(data_path, config, encoding='utf-8'):
    """
    Load a tab-separated corpus and split each document into tokenised sentences.

    Every non-blank line must contain at least three tab-separated fields.
    Field 1 (0-based) is the document text with tokens separated by single
    spaces, field 2 is an integer label; field 0 is not used.

    The text is cut into sentences at the punctuation marks
    ``。 ； ！ ？ ， （ ）`` and each sentence is split on ``' '``.
    Documents with more than ``config.max_sentence_num`` sentences are
    dropped together with their label.

    Args:
        data_path: path of the corpus file.
        config: object exposing ``max_sentence_num``.
        encoding: text encoding used to read the file.

    Returns:
        ``(data, labels)`` where ``data[i]`` is a list of sentences (each a
        list of token strings) and ``labels[i]`` is the matching ``int`` label.

    Side effects:
        Prints the largest sentence count seen over all documents (including
        dropped ones) and the largest token count seen in kept documents.
    """
    # '|' inside a character class is a literal pipe, not an alternation, so
    # it must not appear here or plain '|' characters would split sentences.
    sentence_splitter = re.compile(r'[。；！？，（）]')

    data = []
    labels = []
    max_sentence_num = 0
    max_sentence_len = 0
    # The context manager closes the file; no explicit close() is needed.
    with open(data_path, 'r', encoding=encoding) as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no record; skip instead of failing on indexing.
                continue
            line_list = line.rstrip('\n').split('\t')
            sentences = sentence_splitter.split(line_list[1])
            tmp_num = len(sentences)
            if tmp_num > max_sentence_num:
                max_sentence_num = tmp_num
            if tmp_num > config.max_sentence_num:
                continue
            one_data = []
            for s in sentences:
                # NOTE: split(' ') keeps empty strings for leading, trailing
                # or repeated spaces, so '' can appear as a token.
                words = s.split(' ')
                tmp_len = len(words)
                if tmp_len > max_sentence_len:
                    max_sentence_len = tmp_len
                one_data.append(words)
            # The sentence list is bounded by max_sentence_num (checked above).
            # It must not be sliced by max_sentence_len, which limits tokens
            # per sentence, not sentences per document.
            data.append(one_data[:config.max_sentence_num])
            labels.append(int(line_list[2]))
    print("max sentence num:", max_sentence_num)
    print("max sentence length: ", max_sentence_len)
    return data, labels

In [28]:
# Corpus location, relative to the notebook's working directory.
data_path = '../data/seg_sample_train.txt'
# load_data prints the observed maxima and returns the tokenised documents
# together with their integer labels.
data, labels = load_data(data_path, config)

max sentence num: 2250
max sentence length:  257


In [31]:
# Inspect the first document: a list of sentences, each a list of token strings.
print(data[0])

[['公诉', '机关', '北京市', '昌平区', '人民检察院', ''], ['', '被告人', '呼', '&', 'times', ';', '&', 'times', ';', ''], ['', '男', ''], ['', '27', '岁', ''], ['', '1986', '年', '11', '月', '10', '日', '出生', ''], ['', ''], ['', '因涉嫌', '犯', '盗窃罪', '于', '2014', '年', '6', '月', '20', '日', '被', '羁押', ''], ['', '同年', '7', '月', '3', '日', '被', '逮捕', ''], ['', '现', '羁押于', '北京市', '昌平区', '看守所', ''], ['', '北京市', '昌平区', '人民检察院', '以京昌检', '公诉', '刑诉', ''], ['', '2014', ''], ['', '610', '号', '起诉书', '指控', '被告人', '呼', '&', 'times', ';', '&', 'times', ';', '犯', '盗窃罪', ''], ['', '于', '2014', '年', '7', '月', '22', '日向', '本院', '提起公诉', ''], ['', '本院', '依法', '适用', '简易程序', ''], ['', '实行', '独任', '审判', ''], ['', '公开', '开庭', '进行', '了', '审理', ''], ['', '北京市', '昌平区', '人民检察院', '指派', '检察员', '夏', '文广', '出庭', '支持', '公诉', ''], ['', '被告人', '呼', '&', 'times', ';', '&', 'times', ';', '到庭', '参加', '诉讼', ''], ['', '本案', '现已', '审理', '终结', ''], ['', '北京市', '昌平区', '人民检察院', '起诉书', '指控', '：', '被告人', '呼', '&', 'times', ';', '&', 'times', ';', '于', '2014', '

In [44]:
def build_voabulary(data, vocab_size):
    """
    Build the vocabulary from every token of every sentence in ``data``.

    Index 0 is reserved for the ``'UNK'`` placeholder; the remaining ids are
    assigned to the ``vocab_size - 1`` most frequent tokens in descending
    frequency order.

    Args:
        data: iterable of documents; each document is an iterable of
            sentences, each sentence an iterable of tokens.
        vocab_size: maximum vocabulary size including the ``'UNK'`` entry.

    Returns:
        ``(count, dict_word2index, dict_index2word)`` where ``count`` is
        ``[['UNK', -1], (token, frequency), ...]``, ``dict_word2index`` maps
        token -> id and ``dict_index2word`` is the inverse mapping.
    """
    unk_token = 'UNK'

    # Count sentence by sentence instead of first copying the whole corpus
    # into one flat list.
    counter = collections.Counter()
    for doc in data:
        for sentence in doc:
            counter.update(sentence)

    # A literal 'UNK' token in the corpus must not get a second entry.
    # Otherwise it would overwrite the reserved id 0 and the following word
    # would be assigned the same id (two words sharing one index).
    counter.pop(unk_token, None)

    count = [[unk_token, -1]]
    count.extend(counter.most_common(vocab_size - 1))

    # Tokens in `count` are now unique, so position == id.
    dict_word2index = {word: index for index, (word, _) in enumerate(count)}
    dict_index2word = {index: word for word, index in dict_word2index.items()}

    return count, dict_word2index, dict_index2word

In [45]:
# Build the token <-> id mappings from the loaded documents; `count` holds the
# (token, frequency) entries in id order.
count, dict_word2index, dict_index2word = build_voabulary(data, config.vocab_size)

In [50]:
def build_dataset(data, labels, dict_word2index, config, seed=None):
    """
    Convert tokenised documents into a fixed-size, shuffled integer array.

    Each token is replaced by its id from ``dict_word2index``; tokens that are
    not in the mapping get id 0. Sentences are truncated or right-padded with
    0 to ``config.max_sentence_len`` tokens, and documents are truncated or
    padded with all-zero sentences to ``config.max_sentence_num`` sentences.
    Labels are shifted by -1.

    NOTE: id 0 is used both for unknown tokens and for padding, so the two
    cannot be told apart in the returned array.

    Args:
        data: list of documents, each a list of sentences (lists of tokens).
        labels: sequence of integer labels aligned with ``data``.
        dict_word2index: mapping token -> integer id.
        config: object exposing ``max_sentence_num`` and ``max_sentence_len``.
        seed: optional seed for a reproducible shuffle. When ``None`` the
            global NumPy random state is used.

    Returns:
        ``(dataset, new_labels)``: an ``int64`` array of shape
        ``(len(labels), max_sentence_num, max_sentence_len)`` and an ``int64``
        array of shape ``(len(labels),)``, both in the same shuffled order.
    """
    max_num = config.max_sentence_num
    max_len = config.max_sentence_len
    n_docs = len(labels)

    indices = np.arange(n_docs)
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    # Pre-allocate the zero-padded result and fill in the real ids. This
    # avoids building a nested Python list of n_docs * max_num * max_len ints,
    # and gives a correctly shaped (0, max_num, max_len) array for empty input.
    dataset = np.zeros((n_docs, max_num, max_len), dtype=np.int64)
    new_labels = np.empty(n_docs, dtype=np.int64)

    for row, i in enumerate(indices):
        new_labels[row] = labels[i] - 1
        for sent_pos, sentence in enumerate(data[i][:max_num]):
            # Unknown tokens map to 0 (UNK).
            ids = [dict_word2index.get(word, 0) for word in sentence[:max_len]]
            if ids:
                dataset[row, sent_pos, :len(ids)] = ids

    return dataset, new_labels

In [None]:
# NOTE(review): this rebinds `labels` to the shuffled, minus-one-shifted array
# returned by build_dataset, so it no longer lines up with `data`, and
# re-running this cell without re-running the load cell shifts the labels again.
dataset, labels = build_dataset(data, labels, dict_word2index, config)