# Task description
The task here is to solve a classical supervised classification problem using Word2Vec/FastText with LSTM/GRU based neural network. There are 30 categories for different Chinese chat messages. My task is to freely explore different solutions and evaluate them using accuracies, precisions, and macro F1 score. 

In [1]:
import os
# Check if you're on Google drive or on your own machine.
# Get path to your data.
if ('google' in str(get_ipython())):
    from google.colab import drive
    drive.mount('ME')
    #predir='/content/ME/My Drive/'
    predir='ME/My Drive/'
else:
    predir='Desktop/college/'
    # Please specify your own local directory containing the data and labels
   
import torch
import numpy as np
import copy
# Torch functions 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import time
import numpy as np
import collections
import pickle
import argparse
from random import shuffle

from torch.autograd import Variable
import matplotlib.pylab  as plt
# datadir=predir+'LSDA_data/NLP/'
datadir=predir+'ideepwise/NLP/task1/'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Utility to track progress of a routine.
#from tqdm import tqdm
from tqdm.notebook import trange, tqdm

# Folder with course data
print("TORCH.cuda", torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("datadir:", datadir)
print("get_ipython:", get_ipython())

!python --version
os.getcwd()

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D



Mounted at ME
TORCH.cuda True
datadir: ME/My Drive/ideepwise/NLP/task1/
get_ipython: <google.colab._shell.Shell object at 0x7fe0758089d0>
Python 3.7.13


In [2]:
# load the pretrained word vector
def weight(vocab_set):
  #将词映射为预训练词向量
  #turn the vocabulary into pretrained embedding
  embedding = {}
  size_vocab = len(vocab_set)#字典大小 size of vocabulary
  
  with open(datadir + r'sgns.baidubaike.bigram-char','r',encoding='utf-8') as f:#读取预训练词向量文件 read the pretrained word vector file
    for line_idx, line in enumerate(f):#遍历索引和值，值格式为：词，词向量 iterate indices and lines
      line = line.strip().split()#值 stored values in the original file
      if len(line) != 300 + 1:#保证每个词向量为300维 make sure the word vectors' dimensions are 300
        continue
      word = line[0]#词 word
      word_vector = line[1:]#词向量 vector
      if word in vocab_set:
        embedding[word] = word_vector
  
  # print('word vector obtained：'+str(found)+'all word total count：'+str(size_vocab)+'match rate：{:.2f}%'.format(found/size_vocab*100))
  # 保存提取到的词向量数组 # save the word vectors
  np.savez_compressed(datadir + r'\vec.npz', embedding=embedding)
  return embedding

In [3]:
# data preprocessing
# workflow that separates input data and their labels
def divide_data_and_label(input_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        file = f.read()
    # print(data)
    data = []
    temp = []
    labels = []
    temp_file = []
    for char in file:
        if char == '\n':
            temp_file.append(temp)
            temp = []    
        elif char == '\s' or char == '\t':
            temp_file.append(temp)
            temp = []
        else:
            temp.append(char)
    for i in range(len(temp_file)):
        if i % 2 == 0:
            data.append(temp_file[i])
        if i % 2 == 1:
            labels.append(temp_file[i])
    
    for i in range(len(data)):
        data[i] = "".join(data[i])
        labels[i] = "".join(labels[i])
    return (data, labels)

In [4]:

data_train, labels_train = divide_data_and_label(os.path.join(datadir, 'train.txt'))
data_dev, labels_dev = divide_data_and_label(os.path.join(datadir, 'dev.txt'))
print(data_train)
print(labels_train)

['新建联系人张三电话号码一二三四五', '黄狼。', '把张玉娟的手机号码发送给吴伟', '潘靖的电话是多少', '新建联系人买当唠4008 517517', '新建联系人天天', '新建联系人王满秀幺三七 六四零七零二九五', '新建联系人王义勇幺三八六零零幺九四九零', '新建联系人孙晓飞幺三八三八四四三八九五', '查找张伟军', '把平水相逢的手机号码发给8000', '把小新的号码发给徐云蔚', '新建联系人振华奎号码是幺三五零八九四九三幺二', '把徐程的电话发给吴芳', '把哥哥的电话发给殷龙', '把李文鼎的号码发给谢服全', '把侨天慧的号码发给副卡', '查看婷婷的号码', '新建联系人张雨号码是五六七', '周慧的电话。', '把唐玉明的手机号码发给李康', '查看王老师的号码', '新建联系人18622625490', '把李志强的号码发给贾洪鉴', '新建联系人姓名张三号码一二三四五六七八', '查寻标兵的号码', '把许琦彪的号码发给徐鹏', '添加一条通讯录姓名葛勇手机号码189571642496', '把王世怀的号码发给乔丽君', '把张新涛电话发给常胜宾', '到太湖怎么走', '去常州武进路线', '湖里公园在哪里', '带我去丹阳市眼镜市场', '从南京到亳州怎么走', '上海到苏州', '美国大使馆在哪', '华晶小学在哪里', '去科大讯飞怎么走', '导航到佛山大良汽车站', '松江钢材城在哪里', '我想到科大讯飞怎么走', '去湘江', '合肥百货大楼在哪里', '往火车北站怎么走', '辽宁大连在哪里', '到步行街不走高架的路线是哪条', '到鼓楼怎么走', '合肥的大润发在哪', '怎么去泉城广场', '到合肥市逍遥津公园怎么走', '沿途有没有加油站', '到环都大酒店怎么走', '到公园怎么走', '我要去姜堰', '六安市飞云卫生院在哪里', '科大讯飞在哪里呀', '从这里到市中心怎么走', '查询北京到上海的路线', '讯飞语点我现在在哪里', '春熙路怎么走', '怎么去龙阳路', '到郑州火车站', '宁波机场在哪里', '从二手车市场到八一广场怎么走', '帮我查一下合肥市三孝口在哪里', '我所在地理位置', '帮我搜一下科大讯飞在哪', '查询沿途的KTV', '郑州火车站怎么走', '鹰潭

In [5]:
# preprocess the data using simple vocabulary
# for each word, we use the ranking of frequencies that the word appears in the entire document as their word vector
def preprocess(data, vocab_to_idx=None):
    counter = collections.Counter()
    for sentence in data:
        counter.update(sentence)
    print(counter)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])
    chars, _ = zip(*count_pairs)
    vocab_size = len(chars)
    if vocab_to_idx is None:
        vocab_to_idx = dict(zip(chars, range(len(chars))))
        idx_to_vocab = dict(zip(vocab_to_idx.values(), vocab_to_idx.keys()))
    # appending new words in the vocabulary
    for i in range(len(chars)):
        n = len(vocab_to_idx)
        if vocab_to_idx.get(chars[i]) is None:
            vocab_to_idx[chars[i]] = n
            n +=1

    tensor = np.array([list(map(vocab_to_idx.get, data[i])) for i in range(len(data))])
    # print(tensor)
    return vocab_to_idx, tensor
# turn the labels from Chinese string labels into numeric values (0-30 in our case). 
def preprocess_labels(labels, label_dict=None):
    label_set = set(labels) 
    num = len(label_set)
    if label_dict is None:
        i = 0
        label_dict = {}
        for label in label_set:
            label_dict[label] = i
            i += 1
    ret = labels[:]
    for i in range(len(labels)):
        ret[i] = label_dict[labels[i]]
    print(label_dict)
    return ret, num, label_dict

In [6]:
# examples of the vocabulary 
# preprocess the data_train and data_dev into tensor_train, tensor_dev
# each sentence is segmented into individual words, and each word is representated by the frequency that it appears in the document
# so the frequency of each word serves as the word vector

vocab_to_idx, tensor_train = preprocess(data_train)
_, tensor_dev = preprocess(data_dev, vocab_to_idx)
# print(tensor_train)

Counter({'的': 611, '么': 595, '我': 411, '。': 341, '怎': 330, '你': 318, '一': 251, '？': 251, '什': 236, '天': 222, '是': 209, '做': 208, '电': 204, '到': 175, '给': 170, '吗': 143, '下': 139, '查': 136, '有': 132, '打': 125, '看': 122, '新': 118, '车': 115, '说': 114, '在': 107, '上': 103, '首': 102, '想': 96, '发': 94, '不': 94, '开': 94, '，': 94, '视': 92, '影': 91, '票': 90, '个': 88, '好': 85, '哪': 84, '大': 84, '州': 82, '了': 82, '明': 81, '要': 80, '帮': 80, '播': 78, '火': 76, '中': 76, '听': 75, '台': 75, '来': 75, '去': 71, '信': 71, '话': 70, '闻': 69, '股': 69, '南': 67, '气': 67, '道': 64, '啊': 64, '人': 62, '1': 62, '小': 62, '海': 59, '这': 59, '今': 59, '法': 59, '最': 57, '英': 56, '放': 55, '三': 54, '搜': 54, '诗': 54, ' ': 53, '京': 53, '肉': 51, '时': 50, '谁': 49, '短': 49, '零': 48, '广': 48, '网': 48, '飞': 47, '航': 47, '0': 46, '语': 46, '鱼': 46, '节': 46, '机': 45, '国': 45, '班': 45, '号': 44, '方': 44, '吃': 44, '北': 43, '频': 42, '索': 42, '呢': 42, '歌': 42, '二': 41, '乐': 41, '目': 41, '八': 40, '里': 40, '红': 39, '文': 38, '现': 38, '五': 37, '



In [7]:
class_train, num, label_dict = preprocess_labels(labels_train)
class_dev, _, _  = preprocess_labels(labels_dev, label_dict=label_dict)
# print(class_train)
# print(labels_train)


{'email': 0, 'novel': 1, 'video': 2, 'poetry': 3, 'health': 4, 'cookbook': 5, 'weather': 6, 'app': 7, 'epg': 8, 'riddle': 9, 'cinemas': 10, 'bus': 11, 'match': 12, 'map': 13, 'chat': 14, 'message': 15, 'datetime': 16, 'calc': 17, 'lottery': 18, 'tvchannel': 19, 'telephone': 20, 'flight': 21, 'translation': 22, 'schedule': 23, 'contacts': 24, 'stock': 25, 'news': 26, 'train': 27, 'website': 28, 'music': 29, 'radio': 30}
{'email': 0, 'novel': 1, 'video': 2, 'poetry': 3, 'health': 4, 'cookbook': 5, 'weather': 6, 'app': 7, 'epg': 8, 'riddle': 9, 'cinemas': 10, 'bus': 11, 'match': 12, 'map': 13, 'chat': 14, 'message': 15, 'datetime': 16, 'calc': 17, 'lottery': 18, 'tvchannel': 19, 'telephone': 20, 'flight': 21, 'translation': 22, 'schedule': 23, 'contacts': 24, 'stock': 25, 'news': 26, 'train': 27, 'website': 28, 'music': 29, 'radio': 30}


In [8]:
import keras
# using Keras's built-in utilities to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical
# turns labels using one-hot 
y_train = keras.utils.np_utils.to_categorical(class_train, num_classes=31)
y_dev = keras.utils.np_utils.to_categorical(class_dev, num_classes=31)

In [None]:
np.random.seed(7)
sequence_train = {len(data_train[i]):i for i in range(len(data_train))}
sorted_sequnece_train = sorted(sequence_train.items(), key=lambda x: -x[0])
sequence_dev = {len(data_dev[i]):i for i in range(len(data_dev))}
sorted_sequnece_dev = sorted(sequence_dev.items(), key=lambda x: -x[0])

max_length_train = max([len(data_train[i])] for i in range(len(data_train)))
max_length_dev = max([len(data_dev[i])] for i in range(len(data_dev)))
max_length = max(max_length_train, max_length_dev)[0]
# display the max_length and the sentence with max_length, which is: 
# 就不告诉你又能咋样我你再跟我那么叫我起床！跟我说句话！估计你给我发个图或者跟你给我发个表情我才能告诉你！要不然你就得跟我换个秘密!
# print(max_length)
# print(data_dev[sorted_sequnece_dev[0][1]])
# print(max_length)

[(48, 447), (37, 438), (32, 1086), (30, 610), (27, 881), (26, 118), (25, 237), (23, 1298), (22, 2229), (21, 2087), (20, 1690), (19, 1532), (18, 2264), (17, 2294), (16, 2284), (15, 2273), (14, 2210), (13, 2272), (12, 2276), (11, 2281), (10, 2296), (9, 2293), (8, 2289), (7, 2297), (6, 2298), (5, 2261), (4, 2259), (3, 2242), (2, 2137)]
[(65, 117), (24, 304), (23, 237), (22, 670), (21, 661), (20, 376), (19, 405), (18, 762), (17, 737), (16, 718), (15, 702), (14, 675), (13, 690), (12, 767), (11, 763), (10, 765), (9, 769), (8, 766), (7, 754), (6, 764), (5, 761), (4, 753), (3, 606), (2, 638)]
65
就不告诉你又能咋样我你再跟我那么叫我起床！跟我说句话！估计你给我发个图或者跟你给我发个表情我才能告诉你！要不然你就得跟我换个秘密！


In [9]:
# pad the embedding vectors by truncating excessive values / appending 0's into the same dimension defined as max_length
def pad_embedding(input, max_length):
    input = sequence.pad_sequences(input, maxlen=max_length)
    return input

In [None]:
print(tensor_train)
tensor_train = pad_embedding(tensor_train, max_length)
tensor_dev = pad_embedding(tensor_dev, max_length)
print(np.array(tensor_train).shape)
print(np.array(tensor_dev).shape)


[list([21, 220, 275, 314, 59, 131, 69, 12, 52, 90, 143, 6, 98, 69, 112, 106])
 list([361, 607, 3])
 list([209, 131, 477, 871, 0, 244, 87, 90, 143, 28, 527, 14, 528, 529])
 ... list([670, 299, 61, 23, 869, 187, 191, 1122, 1123, 870])
 list([2, 42, 20, 391, 754, 61, 23])
 list([869, 187, 191, 1122, 1123, 870])]
(2299, 65)
(770, 65)


In [None]:
class_train = np.array(class_train)
class_dev = np.array(class_dev)
print(class_train.shape)
print(y_train.shape)

(2299,)
(2299, 31)


# Method 1: Word Frequencies
The first method I tried is to initialize each word vector by the word's frequency in the entire document. The word vectors are not great because it does not capture the meaning of the word. For example, it does not demonstrate the numerical similarities between words. To solve this, I added an Embedding layer in my network to try to capture the similarities while training the neural network.

# Network: Simple LSTM
The structure of the network is the state-of-the-art LSTM model. Since the inputs are very sparse, I stick with a rather shallow LSTM model without too much layers. The LSTM layer captures the meaning of the words while taking into account of the position of the word with respect to the sentence without running into the vanishing gradients problem as an naive RNN would. To compensate for overfitting, I also added a few Dropout layers to randomly forget some of the input data. 

# Result:
We achieved 82.21% accuracy in clsssification in the Dev set. In addition, we have precision = 0.7533600233951127, recall score = 0.760744064682589, fbeta score = 0.7510238573390444. 

In [None]:
from sklearn.metrics import precision_recall_fscore_support



embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(2000, embedding_vecor_length, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(128))
# model.add(Bidirectional(LSTM(128, return_sequences=True))
  # model.add(Bidirectional(LSTM(128)))
# model.add(Dropout(0.2))

# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.2))
model.add(Dense(num, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
print(tensor_train.shape)
output = model.fit(tensor_train, y_train, epochs=50, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(tensor_dev, y_dev, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))
# macro F1:
train_predict = np.argmax(model.predict(tensor_train),axis=1)
dev_predict = np.argmax(model.predict(tensor_dev),axis=1)
print(precision_recall_fscore_support(class_train, train_predict, average='macro'))
# macro F1 score for dev
print(precision_recall_fscore_support(class_dev, dev_predict, average='macro'))
print("all done")



Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 65, 32)            64000     
                                                                 
 conv1d_2 (Conv1D)           (None, 65, 32)            3104      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 32, 32)           0         
 1D)                                                             
                                                                 
 lstm_17 (LSTM)              (None, 128)               82432     
                                                                 
 dense_7 (Dense)             (None, 31)                3999      
                                                                 
Total params: 153,535
Trainable params: 153,535
Non-trainable params: 0
________________________________________________

总结：于dev set准确率达到82.21%，同时取得 (准确率 = 0.7533600233951127, 召回率 = 0.760744064682589, F1值 = 0.7510238573390444)的评分 

Result: We achieved 82.21% accuracy in clsssification in the Dev set. In addition, we have precision = 0.7533600233951127, recall score = 0.760744064682589, fbeta score = 0.7510238573390444

# *Retrain the model using w2v*

The result is not ideal as there are still room for improvements. I believe the Embedding layer paired with frequencies initialization is not ideal for our purpose, so I decided to apply Word2Vec as our word vectors. 



In [10]:
import logging
from gensim import utils
import gensim.models

# model = gensim.models.Word2Vec(sentence=)
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
!pip install jieba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from keras import datasets
# testing:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models.word2vec import LineSentence
import gensim.models
import jieba

# split Chinese sentence based on Jieba algorithm: https://github.com/fxsjy/jieba

def split_chinese_sentence(data, cut_all=False):
    data_copy = data[:]
    for i in range(len(data)):
      
        data_copy[i] = list(jieba.cut(data[i], cut_all, HMM=False))
    return data_copy

# def pad_batch(tensor, num_vec):
    
#     return 


In [13]:
train_sentences = split_chinese_sentence(data_train, False)
dev_sentences = split_chinese_sentence(data_dev, False)
print("train sentences:", train_sentences)
max_sentences_num = 0
# TODO feature: given variable length dev/validiation, truncate the dev/validation input dimension to fit the network's dimension
# the current preprocessing is done manually, which is not optimal
for sentence in train_sentences:
    max_sentences_num = max(max_sentences_num, len(sentence))

for sentence in dev_sentences:
    max_sentences_num = max(max_sentences_num, len(sentence))

# stitch together the train and dev sentences
sentences = train_sentences + dev_sentences

# apply and train w2v model implemented by gensim
w2v_model = gensim.models.Word2Vec(sentences,  min_count=1)

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

# use pre-defined methods to get the word vectors
w2v_weights = w2v_model.wv.vectors

# for each sentence, get the word vectors of the sentence by looking up the map in gensim w2v model
w2v_tensor_train = np.array([list(map(w2v_model.wv.get_vector, train_sentences[i])) for i in range(len(train_sentences))])
w2v_tensor_dev = np.array([list(map(w2v_model.wv.get_vector, dev_sentences[i])) for i in range(len(dev_sentences))])
print(w2v_tensor_train.shape)




train sentences: [['新建', '联系人', '张', '三', '电话号码', '一二三四五'], ['黄', '狼', '。'], ['把', '张', '玉', '娟', '的', '手机号码', '发送给', '吴', '伟'], ['潘', '靖', '的', '电话', '是', '多少'], ['新建', '联系人', '买', '当', '唠', '4008', ' ', '517517'], ['新建', '联系人', '天天'], ['新建', '联系人', '王', '满', '秀', '幺', '三七', ' ', '六四', '零七', '零', '二', '九五'], ['新建', '联系人', '王', '义勇', '幺', '三八', '六', '零零', '幺', '九四', '九零'], ['新建', '联系人', '孙', '晓', '飞', '幺', '三八', '三八', '四四', '三八', '九五'], ['查找', '张', '伟', '军'], ['把', '平水', '相逢', '的', '手机号码', '发给', '8000'], ['把', '小', '新', '的', '号码', '发给', '徐', '云', '蔚'], ['新建', '联系人', '振华', '奎', '号码', '是', '幺', '三五', '零', '八九', '四九', '三', '幺', '二'], ['把', '徐', '程', '的', '电话', '发给', '吴', '芳'], ['把', '哥哥', '的', '电话', '发给', '殷', '龙'], ['把', '李', '文', '鼎', '的', '号码', '发给', '谢', '服', '全'], ['把', '侨', '天', '慧', '的', '号码', '发给', '副', '卡'], ['查看', '婷婷', '的', '号码'], ['新建', '联系人', '张', '雨', '号码', '是', '五', '六七'], ['周', '慧', '的', '电话', '。'], ['把', '唐', '玉', '明', '的', '手机号码', '发给', '李', '康'], ['查看', '王老师', '的', '号码'



(2299,)




In [14]:
# pad all inputs to the same length of sentence by truncating excessive sentence or adding extra 0 vector
# for each matrix in the batch, pad the matrix by adding 0 vectors or truncating vectors in order to achieve the same length 
# since 0 is discarded by the network, we will add 0 vectors
w2v_tensor_train = sequence.pad_sequences(w2v_tensor_train, maxlen=10,padding="post",truncating="post",dtype='float32')
w2v_tensor_dev = sequence.pad_sequences(w2v_tensor_dev, maxlen=10,padding="post",truncating="post",dtype='float32')
print(w2v_tensor_train.shape)

(2299, 10, 100)


# Method 2: trainining the model with Word2Vec and Bidirectional LSTM.

The second method I tried is to resort to the help of gensim's word2vec algorithm to train the word vector using the document in order to improve the effectiveness of the word vectors.

# Network: Bidirectional LSTM
The structure of the network is still the LSTM model, except now the LSTM layer goes in forward and backward directions. This is because we want to handle cases like:
He said, "Teddy Roosevelt was the POTUS in the early 20th century."
vs
He said, "Teddy bears are fantastic."
where the word Teddy's meaning cannot be inferred without the later half of the sentence.

# Result:
We achieved 78.05% accuracy in clsssification in the Dev set. In addition, we have precision =0.7362174401838819, recall score = 0.7500266633457165, fbeta score = 0.7366286202207406. 

In [19]:
from sklearn.metrics import precision_recall_fscore_support

# embedding_vecor_length = 100
w2v_network = Sequential()
layer_num = 3
w2v_network.add(Bidirectional(LSTM(200, return_sequences=True), input_shape=(10, 100)))
w2v_network.add(Dropout(0.7))
for i in range(0, layer_num - 2):
    w2v_network.add(Bidirectional(LSTM(200, return_sequences=True)))
    w2v_network.add(Dropout(0.7))
w2v_network.add(Bidirectional(LSTM(200)))
w2v_network.add(Dropout(0.7))
w2v_network.add(Dense(num, activation='softmax'))
w2v_network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(w2v_network.summary())
print(w2v_tensor_train.shape)
output = w2v_network.fit(w2v_tensor_train, y_train, epochs=300, batch_size=64) #TODO: edit epoch
# Final evaluation of the w2v_network
scores = w2v_network.evaluate(w2v_tensor_dev, y_dev, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))
# macro F1:
train_predict = np.argmax(w2v_network.predict(w2v_tensor_train),axis=1)
dev_predict = np.argmax(w2v_network.predict(w2v_tensor_dev),axis=1)
print(precision_recall_fscore_support(class_train, train_predict, average='macro'))
# macro F1 score for dev
print(precision_recall_fscore_support(class_dev, dev_predict, average='macro'))
print("all done")

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_12 (Bidirecti  (None, 10, 400)          481600    
 onal)                                                           
                                                                 
 dropout_12 (Dropout)        (None, 10, 400)           0         
                                                                 
 bidirectional_13 (Bidirecti  (None, 10, 400)          961600    
 onal)                                                           
                                                                 
 dropout_13 (Dropout)        (None, 10, 400)           0         
                                                                 
 bidirectional_14 (Bidirecti  (None, 400)              961600    
 onal)                                                           
                                                      

In [None]:
train_sentences = split_chinese_sentence(data_train, False)
dev_sentences = split_chinese_sentence(data_dev, False)
sentences = train_sentences + dev_sentences
print(train_sentences)
sentences_set = set()
for sentence in sentences:
  for word in sentence:
    sentences_set.add(word)
embedding = weight(sentences_set)
# print(embedding)
# count_key = 0
# for key,val in embedding.items():
#   print(key)
#   print(val)
#   count_key += 1
#   if count_key >= 100:
#     break

[['新建', '联系人', '张', '三', '电话号码', '一二三四五'], ['黄', '狼', '。'], ['把', '张', '玉', '娟', '的', '手机号码', '发送给', '吴', '伟'], ['潘', '靖', '的', '电话', '是', '多少'], ['新建', '联系人', '买', '当', '唠', '4008', ' ', '517517'], ['新建', '联系人', '天天'], ['新建', '联系人', '王', '满', '秀', '幺', '三七', ' ', '六四', '零七', '零', '二', '九五'], ['新建', '联系人', '王', '义勇', '幺', '三八', '六', '零零', '幺', '九四', '九零'], ['新建', '联系人', '孙', '晓', '飞', '幺', '三八', '三八', '四四', '三八', '九五'], ['查找', '张', '伟', '军'], ['把', '平水', '相逢', '的', '手机号码', '发给', '8000'], ['把', '小', '新', '的', '号码', '发给', '徐', '云', '蔚'], ['新建', '联系人', '振华', '奎', '号码', '是', '幺', '三五', '零', '八九', '四九', '三', '幺', '二'], ['把', '徐', '程', '的', '电话', '发给', '吴', '芳'], ['把', '哥哥', '的', '电话', '发给', '殷', '龙'], ['把', '李', '文', '鼎', '的', '号码', '发给', '谢', '服', '全'], ['把', '侨', '天', '慧', '的', '号码', '发给', '副', '卡'], ['查看', '婷婷', '的', '号码'], ['新建', '联系人', '张', '雨', '号码', '是', '五', '六七'], ['周', '慧', '的', '电话', '。'], ['把', '唐', '玉', '明', '的', '手机号码', '发给', '李', '康'], ['查看', '王老师', '的', '号码'], ['新建', '联系人', 

In [None]:
print(data_train)
print(list(data_train[0]))
# now we cut the Chinese words into Chinese characters instead of Chinese vocabularies. This is because the pretrained embedding is based on individual Chinese characters.

['新建联系人张三电话号码一二三四五', '黄狼。', '把张玉娟的手机号码发送给吴伟', '潘靖的电话是多少', '新建联系人买当唠4008 517517', '新建联系人天天', '新建联系人王满秀幺三七 六四零七零二九五', '新建联系人王义勇幺三八六零零幺九四九零', '新建联系人孙晓飞幺三八三八四四三八九五', '查找张伟军', '把平水相逢的手机号码发给8000', '把小新的号码发给徐云蔚', '新建联系人振华奎号码是幺三五零八九四九三幺二', '把徐程的电话发给吴芳', '把哥哥的电话发给殷龙', '把李文鼎的号码发给谢服全', '把侨天慧的号码发给副卡', '查看婷婷的号码', '新建联系人张雨号码是五六七', '周慧的电话。', '把唐玉明的手机号码发给李康', '查看王老师的号码', '新建联系人18622625490', '把李志强的号码发给贾洪鉴', '新建联系人姓名张三号码一二三四五六七八', '查寻标兵的号码', '把许琦彪的号码发给徐鹏', '添加一条通讯录姓名葛勇手机号码189571642496', '把王世怀的号码发给乔丽君', '把张新涛电话发给常胜宾', '到太湖怎么走', '去常州武进路线', '湖里公园在哪里', '带我去丹阳市眼镜市场', '从南京到亳州怎么走', '上海到苏州', '美国大使馆在哪', '华晶小学在哪里', '去科大讯飞怎么走', '导航到佛山大良汽车站', '松江钢材城在哪里', '我想到科大讯飞怎么走', '去湘江', '合肥百货大楼在哪里', '往火车北站怎么走', '辽宁大连在哪里', '到步行街不走高架的路线是哪条', '到鼓楼怎么走', '合肥的大润发在哪', '怎么去泉城广场', '到合肥市逍遥津公园怎么走', '沿途有没有加油站', '到环都大酒店怎么走', '到公园怎么走', '我要去姜堰', '六安市飞云卫生院在哪里', '科大讯飞在哪里呀', '从这里到市中心怎么走', '查询北京到上海的路线', '讯飞语点我现在在哪里', '春熙路怎么走', '怎么去龙阳路', '到郑州火车站', '宁波机场在哪里', '从二手车市场到八一广场怎么走', '帮我查一下合肥市三孝口在哪里', '我所在地理位置', '帮我搜一下科大讯飞在哪', '查询沿途的KTV', '郑州火车站怎么走', '鹰潭

In [None]:
print(embedding['新'])
print(embedding['建'])
print(embedding['新建'])
print(embedding['联系人'])
print(embedding['人'])
print(embedding['黄'])
print(embedding['张'])
print(embedding['三'])
# print(embedding['张三'])
print(train_sentences[0])
print(embedding['电话号码'])
print(embedding['一二三四五'])
print(embedding['我'])
print(embedding['查'])
print(list(jieba.cut('帮我查一下明天去郑州的火车票')))
print(split_chinese_sentence(['联系人'],False))
print(list(jieba.cut_for_search('帮我查一下明天去郑州的火车票')))
jieba.suggest_freq(('帮', '我','查','一下'), True)
jieba.suggest_freq(('帮','我查'), False)
# jieba.suggest_freq(('查','一下'), True)
# jieba.suggest_freq(('查一下'), True)
print(list(jieba.cut('帮我查一下明天去郑州的火车票', HMM=False)))
print(list(jieba.cut('帮我查一下明天去郑州的火车票', HMM=False)))
print(list(jieba.cut('黄狼。', HMM=False)))
# print(list(jieba.cut('黄狼。')))

['-0.298940', '-0.329376', '0.022706', '0.520460', '0.280783', '0.315121', '0.028636', '-0.101839', '-0.104746', '-0.113371', '0.017259', '0.382496', '-0.351138', '0.645342', '0.477147', '-0.023872', '0.140370', '-0.221648', '-0.348404', '0.495963', '-0.432443', '-0.585693', '0.324903', '-0.281049', '0.176148', '0.486921', '0.353200', '0.010373', '-0.153379', '-0.071657', '0.238785', '0.088374', '0.685110', '-0.269010', '-0.669316', '0.748042', '0.150539', '-0.258463', '0.501294', '0.277768', '-0.375714', '-1.065189', '-0.042222', '0.071375', '-0.118062', '-0.406417', '0.015082', '0.314712', '0.366007', '0.406628', '-0.197618', '-0.335666', '-0.154915', '-0.474525', '0.073796', '0.032941', '-0.427259', '-0.383036', '0.098565', '-0.017135', '0.786840', '0.249935', '0.001415', '0.419503', '-0.069695', '0.618005', '-0.143879', '-0.237218', '-0.473796', '0.119042', '0.051967', '-0.026337', '0.499829', '-0.239778', '0.521437', '-0.184927', '-0.155991', '0.146076', '0.081031', '-0.277186', '

In [None]:

max_sentences_num = 0
# TODO feature: given variable length dev/validiation, truncate the dev/validation input dimension to fit the network's dimension
# the current preprocessing is done manually, which is not optimal
for sentence in train_sentences:
    max_sentences_num = max(max_sentences_num, len(sentence))

for sentence in dev_sentences:
    max_sentences_num = max(max_sentences_num, len(sentence))


sentences = train_sentences + dev_sentences
# print(sentences)
# w2v_model = gensim.models.Word2Vec(sentences,  min_count=1)
# w2v_weights = w2v_model.wv.vectors

# w2v_tensor_train = np.array([list(map(w2v_model.wv.get_vector, train_sentences[i])) for i in range(len(train_sentences))])
# w2v_tensor_dev = np.array([list(map(w2v_model.wv.get_vector, dev_sentences[i])) for i in range(len(dev_sentences))])


arr_temp = []
for i in range(len(train_sentences)):
  temp_list = []
  for word in train_sentences[i]:
      if word in embedding.keys():
        temp_list.append(embedding[word])
      # else:
        # print(train_sentences[i], word)
        # temp_list.append(embedding[word])
  arr_temp.append(temp_list)

# arr_test = [embedding[word] for word in train_sentences[i] for i in range(len(train_sentences))]
pretrained_tensor_train = np.array(arr_temp)


arr_temp = []
for i in range(len(dev_sentences)):
  temp_list = []
  for word in dev_sentences[i]:
      if word in embedding.keys():
        temp_list.append(embedding[word])
      # else:
        # print(train_sentences[i], word)
        # temp_list.append(embedding[word])
  arr_temp.append(temp_list)

# arr_test = [embedding[word] for word in train_sentences[i] for i in range(len(train_sentences))]
pretrained_tensor_dev = np.array(arr_temp)

# pretrained_tensor_train = np.array([[embedding[word] for word in train_sentences[i]] for i in range(len(train_sentences))])
# pretrained_tensor_dev = np.array([[embedding[word] for word in dev_sentences[i]] for i in range(len(dev_sentences))])
print(pretrained_tensor_train.shape)
pretrained_tensor_train = sequence.pad_sequences(pretrained_tensor_train, maxlen=10,padding="post",truncating="post",dtype='float32')
pretrained_tensor_dev = sequence.pad_sequences(pretrained_tensor_dev, maxlen=10,padding="post",truncating="post",dtype='float32')

print(pretrained_tensor_train.shape)


# print(w2v_tensor_train)
# for each matrix in the batch, pad the matrix by adding 0 vectors or truncating vectors in order to achieve the same length 
# since 0 is discarded by the network, we will add 0 vectors




(2299,)
(2299, 10, 300)


# Method 3: trainining the model with pretrained Chines word vector

Using the word vector generated by using gensim word2vec model, it seems like the accuracy decreases by 2-3%. I believed that the way that this may be due to the fact that the current size of the data is too small, so I decided to use pretrained embeddings from https://github.com/Embedding/Chinese-Word-Vectors.

# Network: Bidirectional LSTM
The structure of the network is still the LSTM model, except now the LSTM layer goes in forward and backward directions. This is because we want to handle cases like:
He said, "Teddy Roosevelt was the POTUS in the early 20th century."
vs
He said, "Teddy bears are fantastic."
where the word Teddy's meaning cannot be inferred without the later half of the sentence.

# Result:
Result: We achieved 93.25% accuracy in clsssification in the Dev set. In addition, we have (precision = 0.9190465450079812, recall score = 0.9300608772558282, fbeta score = 0.9216210986471332)
 

In [None]:
# embedding_vecor_length = 100
pretrained_network = Sequential()
layer_num = 3
# w2v_network.add(Dense(max_sentences_num, input_shape=(46, 100)))

# w2v_network.add(keras.layers.Flatten(input_shape=(46, 100)))
# w2v_network.add(Dense(w2v_weights.shape[1],activation='relu') )
# w2v_network.add(Embedding(input_dim=w2v_weights.shape[0],
#                     output_dim=w2v_weights.shape[1],
#                     weights=[w2v_weights],
#                     input_length=max_sentences_num,
#                     mask_zero=True,
#                     trainable=True))
pretrained_network.add(Bidirectional(LSTM(200, return_sequences=True), input_shape=(10, 300)))
pretrained_network.add(Dropout(0.9))
for i in range(0, layer_num - 2):
    pretrained_network.add(Bidirectional(LSTM(200, return_sequences=True)))
    pretrained_network.add(Dropout(0.9))
pretrained_network.add(Bidirectional(LSTM(200)))

# w2v_network.add(Bidirectional(GRU(128),input_shape=(46,100)))
# w2v_network.add(Embedding(2000, 1, input_length=max_sentences_num))
# print(w2v_weights.shape)
# print(w2v_tensor_train.shape)

# w2v_network.add(Dense(0, activation='relu'))
# w2v_network.add(Bidirectional(LSTM(128)))

# w2v_network.add(Dense(128, activation='relu'))
pretrained_network.add(Dropout(0.9))
# w2v_network.add(keras.layers.Flatten())
# w2v_network.add(Dense(w2v_weights.shape[1],activation='relu') )
pretrained_network.add(Dense(num, activation='softmax'))
pretrained_network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# w2v_network.build(input_shape=(46,100))
print(pretrained_network.summary())
print(pretrained_tensor_train.shape)
output = pretrained_network.fit(pretrained_tensor_train, y_train, epochs=100, batch_size=64)
# Final evaluation of the w2v_network
scores = pretrained_network.evaluate(pretrained_tensor_dev, y_dev, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))
# macro F1:
train_predict = np.argmax(pretrained_network.predict(pretrained_tensor_train),axis=1)
dev_predict = np.argmax(pretrained_network.predict(pretrained_tensor_dev),axis=1)
print(precision_recall_fscore_support(class_train, train_predict, average='macro'))
# macro F1 score for dev
precision_recall_fscore_support(class_dev, dev_predict, average='macro')
print("all done")

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_24 (Bidirecti  (None, 10, 400)          801600    
 onal)                                                           
                                                                 
 dropout_24 (Dropout)        (None, 10, 400)           0         
                                                                 
 bidirectional_25 (Bidirecti  (None, 10, 400)          961600    
 onal)                                                           
                                                                 
 dropout_25 (Dropout)        (None, 10, 400)           0         
                                                                 
 bidirectional_26 (Bidirecti  (None, 400)              961600    
 onal)                                                           
                                                     

In [None]:
# train 
print(precision_recall_fscore_support(class_train, train_predict, average='macro'))
# dev macro F1
print(precision_recall_fscore_support(class_dev, dev_predict, average='macro'))

(0.9995185363505056, 0.9999291031549096, 0.9997219707418231, None)
(0.9190465450079812, 0.9300608772558282, 0.9216210986471332, None)


总结：于dev set准确率达到93.25%，同时取得 (精确率 = 0.9190465450079812, 召回率 = 0.9300608772558282, F1值 = 0.9216210986471332)的评分 

Result: We achieved 93.25% accuracy in clsssification in the Dev set. In addition, we have (precision = 0.9190465450079812, recall score = 0.9300608772558282, fbeta score = 0.9216210986471332)

# Summary
In our code we have adopted three different word vector generators and two different networks (the later networks are advanced version of the first version) to the problem. The first and the second models perform relatively bad due to the word vector's effectiveness as there are not too many input samples to learn while Chinese characters have vastly different meanings. The  model with pretrained word vectors performs a lot better because it is trained based on ngram2vec, which is the superset of word2vec and fasttext, and it has access to Weibo and Baidubaike, which are two popular Chinese version of Twitter and Wikipedia. 



Reference:
Shen Li, Zhe Zhao, Renfen Hu, Wensi Li, Tao Liu, Xiaoyong Du, Analogical Reasoning on Chinese Morphological and Semantic Relations, ACL 2018.

