# TensorFlow练习1： 对评论进行分类

## 1. 引入包文件


In [79]:
#coding: utf8

import numpy as np
import tensorflow as tf
import random
import pickle
from collections import Counter

import nltk

from nltk.tokenize import word_tokenize

# 词形还原(lemmatizer)，即把一个任何形式的英语单词还原到一般形式，与词根还原不同(stemmer)，后者是抽取一个单词的词根。 
from nltk.stem import WordNetLemmatizer

## 构建词汇词典
读取文件，对评论进行分词和词形还原等操作，然后统计每个词语出现的次数，挑选出出现次数少于2000但大于20的词语构建成词典。

In [89]:
# Input corpora, one review per line. The labelling code in this notebook tags
# every line of pos_file as [1, 0] (positive) and every line of neg_file as
# [0, 1] (negative).
pos_file = 'pos.txt'
neg_file = 'neg.txt'

def process_file(filename):
    """Tokenize a UTF-8 text file into one flat list of lower-cased tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list holding the tokens of every line, in file order.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    tokens = []
    for raw_line in raw_lines:
        # Lower-case before tokenizing so that counting is case-insensitive.
        tokens.extend(word_tokenize(raw_line.lower()))
    return tokens
#         print(lex)
# res = process_file(pos_file)
# print(res)

def create_lexicon(pos_file, neg_file, min_count=20, max_count=2000):
    """Build the vocabulary from the positive and negative review files.

    Both files are tokenized with ``process_file``, every token is lemmatized,
    and only the lemmas whose total frequency lies strictly between
    ``min_count`` and ``max_count`` are kept.

    Args:
        pos_file: Path of the positive-review file.
        neg_file: Path of the negative-review file.
        min_count: Exclusive lower bound on a lemma's frequency (default 20).
        max_count: Exclusive upper bound on a lemma's frequency (default 2000).

    Returns:
        A list of the retained lemmas, in order of first appearance.
    """
    tokens = process_file(pos_file) + process_file(neg_file)

    # Lemmatize while counting, so no intermediate list of lemmas is built.
    lemmatizer = WordNetLemmatizer()
    word_count = Counter(lemmatizer.lemmatize(token) for token in tokens)

    return [
        word
        for word, count in word_count.items()
        if min_count < count < max_count
    ]

# Build the vocabulary from both review files and print how many words were kept.
lex = create_lexicon(pos_file, neg_file)
print(len(lex))


1065


## 将评论转换成矩阵表示
对词典中的每个词，若它出现在评论中则标记为1，未出现则标记为0
```
把每条评论转换为向量, 转换原理：
假设lex为['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is'] 当然实际上要大的多
评论'i think this movie is great' 转换为 [0,1,0,0,0,0,0,1], 把评论中出现的字在lex中标记，出现过的标记为1，其余标记为0
 ```

In [81]:
# 把每条评论转换为向量, 转换原理：
# 假设lex为['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is'] 当然实际上要大的多
# 评论'i think this movie is great' 转换为 [0,1,0,0,0,0,0,1], 把评论中出现的字在lex中标记，出现过的标记为1，其余标记为0

def string_to_vector(lex, review, clf):
    """Encode one review as a binary bag-of-words vector over ``lex``.

    The review is lower-cased, tokenized and lemmatized. Position ``i`` of the
    resulting vector is 1 when ``lex[i]`` occurs in the review and 0 otherwise.

    Args:
        lex: The vocabulary, a sequence of (hashable) words.
        review: The raw review text.
        clf: The label attached to the review; elsewhere in this notebook
            [1, 0] marks a positive review and [0, 1] a negative one.

    Returns:
        A two-element list ``[features, clf]`` where ``features`` is a NumPy
        float array of length ``len(lex)``.
    """
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(review.lower())]

    # Index the vocabulary once instead of scanning the list twice per token
    # (``in`` followed by ``.index``). ``setdefault`` keeps the first position
    # of a duplicated entry, which is what ``list.index`` would return.
    position = {}
    for idx, entry in enumerate(lex):
        position.setdefault(entry, idx)

    features = np.zeros(len(lex))
    for word in words:
        idx = position.get(word)
        if idx is not None:
            features[idx] = 1
    return [features, clf]

# clf = [0, 1]
# lex = ["he", "have", "a", "dog"]
# review = "She have a dog"
# res = string_to_vector(lex, review, clf)
# print(res)
def normalize_dataset(lex):
    """Vectorize every review of both corpora into labelled samples.

    Reads the module-level ``pos_file`` and ``neg_file`` line by line and
    converts each line with ``string_to_vector``. Positive reviews are
    labelled [1, 0] and negative reviews [0, 1]; all positive samples come
    before the negative ones.

    Args:
        lex: The vocabulary used to encode each review.

    Returns:
        A list of ``[features, label]`` samples.
    """
    dataset = []
    for path, label in ((pos_file, (1, 0)), (neg_file, (0, 1))):
        # Decode as UTF-8 explicitly, matching process_file, so the result
        # does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Give every sample its own label list, as the original did.
                dataset.append(string_to_vector(lex, line, list(label)))
    return dataset
    
# Vectorize every review, then print the sample count and the first sample.
dataset = normalize_dataset(lex)
print(len(dataset))
print(dataset[0])
# Shuffle in place: normalize_dataset appends all positive samples before the
# negative ones, so this mixes the two classes.
random.shuffle(dataset)


# with open('save.pickle', 'wb') as f:
#     pickle.dump(dataset, f)
        

10662
[array([1., 1., 1., ..., 0., 0., 0.]), [1, 0]]


In [82]:
# Hold out 10% of the samples as test data.
test_size = int(len(dataset) * 0.1)

# Each sample is [feature_vector, label], i.e. a ragged pair, so the array must
# be created with dtype=object; recent NumPy versions refuse to build it
# otherwise.
dataset = np.array(dataset, dtype=object)

# Split on an explicit index: slicing with -test_size would return an empty
# training set whenever test_size is 0.
split_at = len(dataset) - test_size
train_dataset = dataset[:split_at]
test_dataset = dataset[split_at:]

## 构建神经网络，本文中使用了两层隐藏层，参数如下：

In [88]:
# Input layer: one unit per word in the lexicon
n_input_layer = len(lex)

# Hidden layers
n_layer_1 = 1000
n_layer_2 = 1000

# Output layer: one unit per class (labels are [1, 0] and [0, 1])
n_output_layer = 2

print([n_input_layer, n_layer_1, n_layer_2, n_output_layer])


[1065, 1000, 1000, 2]


In [84]:
def neural_network(data):
    """Build the feed-forward graph and return the output-layer logits.

    The network has two ReLU hidden layers followed by a linear output layer.
    Layer widths come from the module-level ``n_input_layer``, ``n_layer_1``,
    ``n_layer_2`` and ``n_output_layer``.

    Args:
        data: Input tensor that is matrix-multiplied with the first weight
            matrix of shape [n_input_layer, n_layer_1].

    Returns:
        The tensor produced by the output layer (no activation applied).
    """
    def make_params(n_in, n_out):
        # Weights are created before the bias; both start from random normals.
        weights = tf.Variable(tf.random_normal([n_in, n_out]))
        bias = tf.Variable(tf.random_normal([n_out]))
        return weights, bias

    # Create every variable up front, in layer order.
    hidden_1_w, hidden_1_b = make_params(n_input_layer, n_layer_1)
    hidden_2_w, hidden_2_b = make_params(n_layer_1, n_layer_2)
    output_w, output_b = make_params(n_layer_2, n_output_layer)

    hidden_1 = tf.nn.relu(tf.add(tf.matmul(data, hidden_1_w), hidden_1_b))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, hidden_2_w), hidden_2_b))
    return tf.add(tf.matmul(hidden_2, output_w), output_b)

In [85]:
# Number of samples fed to the network per training step
batch_size = 50

X = tf.placeholder('float', [None, len(train_dataset[0][0])])
# The shape [None, <feature length>] gives the height and width of the input matrix; the benefit is that TensorFlow raises an error when fed data does not match it. Specifying the shape is optional.

Y = tf.placeholder('float')

In [86]:
def train_neural_network(X, Y):
    """Train the network on ``train_dataset`` and print the test accuracy.

    Builds the graph with ``neural_network``, minimizes the mean softmax
    cross-entropy with Adam for a fixed number of epochs in mini-batches of
    ``batch_size``, prints the summed loss of every epoch, and finally prints
    the accuracy measured on ``test_dataset``.

    Args:
        X: Placeholder fed with the batch of feature vectors.
        Y: Placeholder fed with the batch of labels.
    """
    predict = neural_network(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # default learning rate is 0.001

    epochs = 13
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # train_dataset is a 2-D NumPy array. random.shuffle swaps rows through
        # views, which duplicates some samples and drops others, so use NumPy's
        # own in-place shuffle along the first axis instead.
        np.random.shuffle(train_dataset)
        train_x = train_dataset[:, 0]
        train_y = train_dataset[:, 1]

        for epoch in range(epochs):
            epoch_loss = 0
            # Step through the data one batch at a time; the last batch may be
            # shorter than batch_size.
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]

                _, c = session.run(
                    [optimizer, cost_func],
                    feed_dict={X: list(batch_x), Y: list(batch_y)},
                )
                epoch_loss += c
            print(epoch, ": ", epoch_loss)

        test_x = test_dataset[:, 0]
        test_y = test_dataset[:, 1]
        # A prediction is correct when the arg-max of the logits matches the
        # arg-max of the one-hot label.
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print("accuracy: ", accuracy.eval({X: list(test_x), Y: list(test_y)}))
# Run training; prints the loss of each epoch followed by the test accuracy.
train_neural_network(X, Y)

0 :  60904.511962890625
1 :  13923.341103553772
2 :  5385.677576478862
3 :  2836.649026087969
4 :  2033.6653112982322
5 :  1596.7483860661418
6 :  1084.8012358154842
7 :  583.6081699967997
8 :  280.7399051677852
9 :  146.3240720726634
10 :  198.2349517698661
11 :  92.6367470573328
12 :  152.57086673134296
accuracy:  0.6097561


In [91]:
# Scratch cell: a quick look at how collections.Counter behaves.
from collections import Counter
a = Counter('abbcccddd')  # counts each character of the string
print(a)
print(type(a))
# elements() yields every key repeated as many times as its count
for i in a.elements():
    print(i)
print(a['c'])  # the count recorded for 'c'

Counter({'c': 3, 'd': 3, 'b': 2, 'a': 1})
<class 'collections.Counter'>
a
b
b
c
c
c
d
d
d
3
