# word2vec_RNN

# 1. 导入工具包

In [None]:
import os
import csv
import time
import datetime
import random
import json
from collections import Counter
from math import sqrt
import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed,Input,Conv2D,MaxPool2D,SpatialDropout1D,concatenate,Flatten,Dense,Dropout,Embedding,SimpleRNN,Reshape,GRU,LSTM
from tensorflow.keras import Sequential,optimizers,losses
from tensorflow.keras.models import Model,model_from_yaml
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import multiprocessing
import yaml
import jieba

# 2.数据存储地址，及参数设置

In [None]:
# --- Input files ------------------------------------------------------------
stop_word_path = '../../data/day10-nlp-data/chineseStopWords.txt'
pos_data_path = '../../data/day10-nlp-data/film_review/pos.xlsx'
neg_data_path = '../../data/day10-nlp-data/film_review/neg.xlsx'

# --- Sequence / embedding geometry -------------------------------------------
# vocab_dim is the width of one embedding vector; maxlen is the padded
# sequence length and input_length the length declared to the Embedding layer.
vocab_dim = 60
maxlen = input_length = 50

# --- Training loop -----------------------------------------------------------
batch_size, n_epoch = 100, 10

# --- Not referenced in the visible cells -------------------------------------
# NOTE(review): presumably word2vec training settings - confirm before removing.
n_iterations, n_exposures, window_size = 1, 10, 7
cpu_count = multiprocessing.cpu_count()

# 3.读取数据集

In [None]:
# Load both review workbooks. header=None tells pandas not to treat the first
# row as column names, so the columns are labelled 0, 1, ... (column 0 is the
# one read further down).
# `index_col=None` keeps the default positional index. It replaces the
# original `index=None`, which is not a read_excel parameter and is rejected
# with a TypeError by current pandas releases.
neg = pd.read_excel(neg_data_path, index_col=None, header=None)
pos = pd.read_excel(pos_data_path, index_col=None, header=None)

## 3.1 查看消极评论

In [None]:
# Preview the first rows of the negative-review frame.
neg.head()

## 3.2 查看积极评论

In [None]:
# Preview the first rows of the positive-review frame.
pos.head()

# 4.数据连接

In [None]:
# Labels: 1 for every positive review followed by 0 for every negative one,
# in the same order as the texts assembled below.
y = np.concatenate((np.ones(len(pos), dtype=int), np.zeros(len(neg), dtype=int)))

# Texts: column 0 of both frames, positives first, coerced to plain str and
# returned as a Python list.
comment = np.concatenate((pos[0], neg[0])).astype(str).tolist()

In [None]:
# Inspect the first ten review strings.
comment[:10]

# 5.分词

In [None]:
def chinese_word_cut(text):
    """Segment each document with jieba and re-join its tokens with spaces.

    ``text`` is an iterable of strings. The result is a list holding one
    space-delimited token string per input document, in the same order.
    """
    segmented = []
    for document in text:
        tokens = jieba.cut(document)
        segmented.append(" ".join(tokens))
    return segmented
# Segment the whole corpus; each entry becomes a space-delimited token string.
comment =chinese_word_cut(comment)
# Show the first ten segmented reviews.
comment[:10]

# 6.去停用词

In [None]:
def get_stopword_list(path=None):
    """Read the stop-word file and return its entries as a list.

    Args:
        path: Location of the stop-word file (one word per line, GB18030
            encoded). Defaults to the module-level ``stop_word_path``.

    Returns:
        list[str]: One entry per line of the file, with newline characters
        removed and the file order preserved.
    """
    if path is None:
        path = stop_word_path
    # The context manager closes the handle deterministically; the original
    # opened the file inline and never closed it.
    with open(path, encoding='gb18030') as handle:
        return [line.replace('\n', '') for line in handle]
# Load the stop words once; remove_stopwords() reads this module-level list.
stopword_list = get_stopword_list()

In [None]:
def remove_stopwords(text, stopwords=None):
    """Drop stop words from a whitespace-delimited token string.

    Args:
        text: Tokens separated by whitespace (the format produced by
            ``chinese_word_cut``).
        stopwords: Optional iterable of words to drop. Defaults to the
            module-level ``stopword_list``.

    Returns:
        str: The surviving tokens joined by single spaces, in their original
        order; an empty string when nothing survives.
    """
    if stopwords is None:
        stopwords = stopword_list
    # Hash the stop words once so each token test is O(1) instead of a list scan.
    blocked = frozenset(stopwords)
    # join() puts separators only *between* kept tokens. The original appended
    # a space after every kept token that was not last in the input, which
    # left a trailing space whenever the final token was a stop word.
    return ' '.join(token for token in text.split() if token not in blocked)

In [None]:
# Strip stop words from every segmented review.
comment = [remove_stopwords(text) for text in comment]
# Show the first ten cleaned reviews.
comment[:10]

# 7.构建词向量

In [None]:
# Build the word -> index and word -> vector lookup tables.
def create_dictionaries(model=None,comment=None):
    """Map a corpus onto the vocabulary of a trained word2vec model.

    Args:
        model: A gensim ``Word2Vec`` model (anything exposing ``.wv``).
        comment: Iterable of documents. Each document is either a
            whitespace-delimited token string or an already tokenised
            sequence of words.

    Returns:
        A ``(w2indx, w2vec, comment)`` tuple when both arguments are given:

        * ``w2indx`` - dict mapping every vocabulary word to an integer id
          starting at 1 (0 is left for padding and out-of-vocabulary words);
        * ``w2vec`` - dict mapping every vocabulary word to its vector;
        * ``comment`` - the output of ``pad_sequences``: one row of word ids
          per document, padded/truncated to the module-level ``maxlen``.

        ``None`` (after printing a notice) when either argument is missing.
    """
    if comment is None or model is None:
        print('No data provided...')
        return None

    keyed_vectors = model.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # only have `vocab`. Support both so the notebook runs on either.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(list(vocabulary), allow_update=True)
    # Shift ids by one so that 0 never refers to a real word.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # `model[word]` is gone in gensim 4; `model.wv[word]` works everywhere.
    w2vec = {word: keyed_vectors[word] for word in w2indx}

    def parse_dataset(documents):
        """Replace every token by its vocabulary id (0 when unknown)."""
        encoded = []
        for document in documents:
            # Documents arrive as space-joined strings; iterating a str
            # directly would yield single characters (and the spaces)
            # instead of the segmented words, so split it first.
            if isinstance(document, str):
                tokens = document.split()
            else:
                tokens = document
            encoded.append([w2indx.get(token, 0) for token in tokens])
        return encoded

    # Pad/truncate every id sequence to the same length `maxlen`.
    comment = sequence.pad_sequences(parse_dataset(comment), maxlen=maxlen)
    return w2indx, w2vec, comment

In [None]:
"""
1.加载word2vec模型
"""
# Restore the pre-trained word2vec model from disk.
model = Word2Vec.load('model/Word60.model')

# Derive the word->index table, the word->vector table and the index-encoded,
# padded corpus from that model.
index_dict, word_vectors, comment = create_dictionaries(comment=comment, model=model)

In [None]:
# Display the word -> index table.
index_dict

In [None]:
# Display the word -> vector table.
word_vectors

In [None]:
# Display the index-encoded, padded form of the fourth review.
comment[3]

In [None]:
"""
1.n_symbols:所有单词的索引数，频数小于10的词语索引为0 ,所以加1
"""
# Row count of the embedding table: one row per indexed word plus row 0,
# which is never assigned to a word (word ids start at 1).
n_symbols = 1 + len(index_dict)

# Start from an all-zero table so that row 0 stays a zero vector.
embedding_weights = np.zeros(shape=(n_symbols, vocab_dim))

# Copy each word's vector into the row selected by its id.
for word, index in index_dict.items():
    embedding_weights[index] = word_vectors[word]

# 8. 划分数据集

In [None]:
"""
train_test_split函数用于将矩阵随机划分为训练子集和测试子集，并返回划分好的训练集测试集样本和训练集测试集标签。
"""
# Hold out 20% of the samples for testing. No random_state is passed, so the
# partition differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(comment, y, test_size=0.2)
# Report the shapes of the training inputs and labels.
print(x_train.shape,y_train.shape)

In [None]:
# Display the training inputs.
x_train

In [None]:
# Display the training labels.
y_train

# 9.RNN创建模型

In [None]:
"""
1.定义RNN网络，
    Embedding层:
        input_dim：大于或等于0的整数，字典长度，即输入数据最大下标+1
        output_dim：大于0的整数，代表全连接嵌入的维度
        input_length：当输入序列的长度固定时，该值为其长度。如果要在该层后接Flatten层，然后接Dense层，则必须指定该参数，
                      否则Dense层的输出维度无法自动推断。
        weights：可以通过weights参数指定初始的weights参数，weights是一个列表
    因为Embedding层是不可导的，所以把embedding放在中间层是没有意义的,embedding只能作为第一层
    
    SimpleRNN层：RNN在Keras中对应SimpleRNN层，
        activation:激活函数
        units:神经元个数
    Dropout层：用于防止过拟合，参数可以调整
    Dense层：
        units: 该层的神经单元结点数。 
        activation: 激活函数.          
"""
# Assemble the SimpleRNN sentiment classifier layer by layer.
# NOTE: this rebinds `model`, which until now referred to the word2vec model.
model = Sequential()

# Embedding table initialised from the word2vec vectors in embedding_weights.
model.add(Embedding(input_dim=n_symbols,
                    output_dim=vocab_dim,
                    weights=[embedding_weights],
                    input_length=input_length))
# Recurrent encoder followed by dropout against over-fitting.
model.add(SimpleRNN(100, activation='relu'))
model.add(Dropout(0.5))
# Small dense head ending in a single sigmoid unit.
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Training configuration: binary cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# LSTM variant: same layout as the SimpleRNN classifier, with the recurrent
# layer swapped for an LSTM.
LSTM_model = Sequential([
    Embedding(
        input_dim=n_symbols,
        output_dim=vocab_dim,
        weights=[embedding_weights],
        input_length=input_length,
    ),
    LSTM(100, activation='relu'),
    Dropout(0.5),
    Dense(25, activation='relu'),
    Dense(1, activation='sigmoid'),
])
LSTM_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
LSTM_model.summary()

# 10.训练模型

In [None]:
"""
model.fit:
fit函数返回一个History的对象，其History.history属性记录了损失函数和其他指标的数值随epoch变化的情况，如果有验证集的话，也包含了验证集的这些指标变化情况
    x：输入数据。如果模型只有一个输入，那么x的类型是numpy array，如果模型有多个输入，那么x的类型应当为list，list的元素是对应于各个输入的numpy array
    y：标签，numpy array
    batch_size：整数，指定进行梯度下降时每个batch包含的样本数。训练时一个batch的样本会被计算一次梯度下降，使目标函数优化一步。
    epochs：整数，训练终止时的epoch值，训练将在达到该epoch值时停止，当没有设置initial_epoch时，它就是训练的总轮数，否则训练的总轮数为epochs - inital_epoch
    verbose：日志显示，0为不在标准输出流输出日志信息，1为输出进度条记录，2为每个epoch输出一行记录
    shuffle：布尔值或字符串，一般为布尔值，表示是否在训练过程中随机打乱输入样本的顺序。若为字符串“batch”，则是用来处理HDF5数据的特殊情况，它将在batch内部将数据打乱。

"""
# The three callbacks handed to fit() were never defined for this model (only
# commented-out LSTM counterparts exist below), so the call failed with a
# NameError. They are created here with the same settings as those LSTM
# counterparts, writing checkpoints under an RNN-specific directory.
rnn_checkpoint_dir = './model/RNN/bestmodel'
os.makedirs(rnn_checkpoint_dir, exist_ok=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, mode='auto')
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint(
    rnn_checkpoint_dir + '/model_{epoch:02d}-{val_accuracy:.2f}.hdf5',
    save_best_only=True,
    save_weights_only=True,
)

# Train on the training split, holding out 20% of it for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    validation_split=0.2,
    shuffle=True,
    callbacks=[reduce_lr, early_stopping, model_checkpoint],
)

# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability
# at 0.5, which works on older releases as well.
y_pre = (model.predict(x_test) > 0.5).astype('int32')

In [None]:
# reduce_lr_lstm = ReduceLROnPlateau(monitor='val_loss', patience=10, mode='auto')
# early_stopping_lstm = EarlyStopping(monitor='val_loss', patience=5)
# model_checkpoint_lstm = ModelCheckpoint('./model/LSTM/bestmodel/model_{epoch:02d}-{val_accuracy:.2f}.hdf5', save_best_only=True, save_weights_only=True)
# lstm_history = LSTM_model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch, validation_split=0.2,shuffle=True,callbacks=[reduce_lr_lstm,early_stopping_lstm,model_checkpoint_lstm])
# #验证
# lstm_y_pre = LSTM_model.predict_classes(x_test)

# 11. 评估与保存模型

In [None]:

"""

评估模型
 model.evaluate（）返回的是 损失值和你选定的指标值
 
"""

# Evaluate on the held-out test split; scores[0] is printed as the loss and
# scores[1] as the accuracy.
scores = model.evaluate(x_test, y_test,verbose=0)
print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

# lstm_scores = LSTM_model.evaluate(x_test, y_test,verbose=0)

In [None]:
"""

保存模型
  1. model.to_yaml() 以yaml格式保存模型结果（不含模型权重）
  
"""

# Architecture only (no weights): to_yaml() returns the model description as
# a string, which yaml.dump() then wraps once more before it is written out.
# NOTE(review): Model.to_yaml() is reported as removed in TensorFlow >= 2.6 -
# confirm against the installed version.
yaml_string = model.to_yaml()
with open('./model/RNN/comment_bestmodel/rnn.yml', 'w') as outfile:
    outfile.write(yaml.dump(yaml_string, default_flow_style=True))

# Weights only (no graph structure).
model.save_weights('./model/RNN/comment_bestmodel/rnn.h5')


In [None]:
# yaml_string1 = LSTM_model.to_yaml()
# with open('./model/LSTM/comment_bestmodel/lstm.yml', 'w') as outfile:
#     outfile.write( yaml.dump(yaml_string1, default_flow_style=True))
# LSTM_model.save_weights('./model/LSTM/comment_bestmodel/lstm.h5')
# print('test_loss: %f, accuracy: %f' % (lstm_scores[0], lstm_scores[1]))

# 12.检验模型

In [None]:
# Ad-hoc reviews used to sanity-check the saved model.
string=['非常感动，非常好看','不太好看','看完之后很感动','一般般吧']

# Rebuild the network from the saved architecture file, then restore weights.
# yaml.safe_load replaces yaml.load(f): calling yaml.load without a Loader is
# flagged as unsafe by PyYAML 5.x and is a TypeError on PyYAML >= 6. The file
# was written by yaml.dump() of a single string, which safe_load reads back
# unchanged.
with open('./model/RNN/comment_bestmodel/rnn.yml', 'r') as f:
    yaml_string = yaml.safe_load(f)
model = model_from_yaml(yaml_string)
model.load_weights('./model/RNN/comment_bestmodel/rnn.h5')

In [None]:
# Compile the reloaded model with the same loss, optimizer and metric that
# were used for the RNN before training.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
"""
  预处理
"""
# Same preprocessing as the training corpus: segment, drop stop words, then
# wrap the cleaned strings in a NumPy array.
test_string = np.array([remove_stopwords(doc) for doc in chinese_word_cut(string)])
# Shows a (1, n) view of the array; the result is not assigned, so
# test_string itself keeps its 1-D shape.
test_string.reshape(1, -1)

In [None]:
"""
   word2vec模型加载
"""
# Reload the word2vec model and keep only the third return value of
# create_dictionaries(): the index-encoded, padded sequences.
w2v_model = Word2Vec.load('model/Word60.model')
_, _, test_string = create_dictionaries(model=w2v_model, comment=test_string)

In [None]:
"""
  model.predict_classes预测的是类别，打印出来的值就是类别号
"""
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability
# at 0.5; the result keeps the (n_samples, 1) shape indexed below.
result = (model.predict(test_string) > 0.5).astype('int32')
# Print each review next to its predicted class id.
for sentence, label in zip(string, result):
    print(sentence, ":", label[0])