# 文本表示模型：TF-IDF
# 分类模型：DNN

# 1.导入工具包

In [12]:
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import tensorflow.keras as K

# 2.设置数据集路径

In [None]:
# Dataset locations, given as relative paths.
stop_word_path = '../../data/day10-nlp-data/chineseStopWords.txt'  # stop-word file read by get_stopword_list()
neg_data_path = '../../data/day10-nlp-data/film_review/neg.xlsx'  # reviews that receive label 0 below
pos_data_path = '../../data/day10-nlp-data/film_review/pos.xlsx'  # reviews that receive label 1 below

# 3.加载数据集

In [None]:
# Load the negative and positive review spreadsheets. header=None keeps the
# first row as data, so the columns are numbered and the code below can read
# the review text as column 0.
# The former `index=None` argument was dropped: read_excel has no `index`
# parameter, and current pandas rejects the unknown keyword with a TypeError.
neg = pd.read_excel(neg_data_path, header=None)
pos = pd.read_excel(pos_data_path, header=None)

# 4.数据连接

In [2]:
# Labels follow the same order as the texts: 1 for every positive row first,
# then 0 for every negative row.
y = np.concatenate((
    np.ones(len(pos), dtype=int),
    np.zeros(len(neg), dtype=int),
))
# Column 0 of both sheets, positives first, coerced to a plain list of str.
comment = np.concatenate((pos[0], neg[0])).astype(str).tolist()

# 5.分词

In [3]:
def chinese_word_cut(text):
    """Segment each document with jieba and join its tokens with spaces.

    Takes an iterable of strings and returns a list holding one
    space-delimited string per input document, in the same order.
    """
    segmented = []
    for document in text:
        tokens = jieba.cut(document)
        segmented.append(" ".join(tokens))
    return segmented

In [4]:
# Segment every review with jieba; each entry becomes a space-delimited string.
comment =chinese_word_cut(comment)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.213 seconds.
Prefix dict has been built succesfully.


# 6.去停用词

In [5]:
def get_stopword_list(path=None, encoding='gb18030'):
    """Read the stop-word file and return its lines as a list of strings.

    Args:
        path: File to read. Defaults to the module-level ``stop_word_path``.
        encoding: Text encoding used to decode the file. Defaults to
            ``'gb18030'``.

    Returns:
        A list with one entry per line of the file, newline removed.
    """
    if path is None:
        path = stop_word_path
    # The context manager closes the handle; the previous version opened the
    # file inline and never closed it.
    with open(path, encoding=encoding) as stopword_file:
        return [line.replace('\n', '') for line in stopword_file]
# Load the stop words once at module level; remove_stopwords() reads this global.
stopword_list = get_stopword_list()

In [6]:
def remove_stopwords(text, stopwords=None):
    """Drop stop words from a whitespace-tokenised string.

    Args:
        text: String whose tokens are separated by whitespace.
        stopwords: Optional container of tokens to drop. Defaults to the
            module-level ``stopword_list``.

    Returns:
        The remaining tokens joined by single spaces, with no leading or
        trailing space. (The previous version left a trailing space whenever
        the final token was a stop word.)
    """
    if stopwords is None:
        stopwords = stopword_list
    return ' '.join(token for token in text.split() if token not in stopwords)

In [7]:
# Apply stop-word removal to every segmented review, preserving order.
comment = list(map(remove_stopwords, comment))

# 7.划分数据集

In [8]:
# Hold out 30% of the reviews for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    comment,
    y,
    test_size=0.3,
    random_state=1,
)

# 8.TF-IDF模型构建

In [9]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split so both matrices share one feature space.
# NOTE(review): with default settings TfidfVectorizer's token_pattern keeps
# only tokens of two or more word characters, so single-character words
# produced by jieba would presumably be dropped — confirm this is intended,
# or pass an explicit token_pattern.
tf_idf = TfidfVectorizer()
X_new_train = tf_idf.fit_transform(X_train)
X_new_test = tf_idf.transform(X_test)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
# Bare notebook expression: displays the shape of the training TF-IDF matrix.
X_new_train.shape

(7707, 12696)

# 9.TF-IDF+DNN模型构建与检验

In [11]:
# 1. Initialisation: Glorot-uniform weight initialiser with a fixed seed, and
#    the Adam optimiser with its default settings.
init = K.initializers.glorot_uniform(seed=2)
simple_adam = K.optimizers.Adam()

# 2. Model definition: the input width is the number of TF-IDF features, then
#    one hidden Dense layer of 32 sigmoid units, then a single sigmoid output
#    unit for binary classification, trained with binary cross-entropy.
model = K.models.Sequential()
model.add(K.layers.Dense(units=32, input_dim=X_new_train.shape[1], kernel_initializer=init, activation='sigmoid'))
model.add(K.layers.Dense(units=1, kernel_initializer=init, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=simple_adam, metrics=['accuracy'])

# 3. Train for 5 epochs in shuffled mini-batches of 100; `h` keeps the object
#    returned by fit().
h = model.fit(X_new_train, y_train, batch_size=100, epochs=5, shuffle=True, verbose=1)

# 4. Evaluate on the held-out split. The result is indexed as
#    [loss, accuracy], matching the single metric requested in compile(); it
#    is unpacked into named variables instead of being bound to `eval`, which
#    shadowed the Python builtin of the same name.
test_loss, test_acc = model.evaluate(X_new_test, y_test, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n"
      % (test_loss, test_acc * 100))

Starting training 
Train on 7707 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training finished 

Evaluation on test data: loss = 0.578521 accuracy = 85.98% 

