In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jieba
import gensim
import re
import tensorflow as tf

### 预处理数据

In [94]:
def read_data(file_name,is_train=True):
    data_df = pd.read_csv(file_name,encoding='gbk')
    if is_train:
        data_df.columns = ['names','price','catgory']
        data_df['catgory'] = data_df.catgory - 100
    else:
        data_df.columns = ['names','price']
    data_df['length'] = data_df.names.apply(lambda x:len(x))
    return data_df

# def cut_seq(x):

train_df = read_data('./train.csv')
test_df = read_data('./test.csv',is_train=False)

stop_word_df = pd.read_csv('./stopwords.txt',sep='\t',quoting=3,index_col=False,names=['word'])
stop_word_df = stop_word_df['word'].values.tolist()

train_df['cutted'] = train_df.names.apply( lambda x: jieba.lcut(x) )
train_df['cutted'] = train_df.cutted.apply(lambda x: " ".join([w for w in x if x not in stop_word_df]))

test_df['cutted'] = test_df.names.apply( lambda x: jieba.lcut(x) )
test_df['cutted'] = test_df.cutted.apply(lambda x: " ".join([w for w in x if x not in stop_word_df]))

In [96]:
train_df.sort_values(by='length',ascending=False).head()
# test_df.sort_values(by='price',ascending=False).head()

Unnamed: 0,names,price,catgory,length,cutted
5224,柠檬宝宝LEMONKID冬季室内加厚防滑卡通包跟软底宝宝居家鞋男女童棉拖鞋LE061016粉...,77,4,50,柠檬 宝宝 LEMONKID 冬季 室内 加厚 防滑 卡通 包 跟 软底 宝宝 居家 鞋 男...
8198,小蚁智能摄像机夜视版升级1080PWIFI网络摄像头监控摄像头智能家居支持小米路由WIFI本地存储,76,9,49,小蚁 智能 摄像机 夜视 版 升级 1080PWIFI 网络 摄像头 监控 摄像头 智能家居...
2,阿波罗之梦AD162玫瑰花爱心杯子套装送女友老婆女生生日礼物实用闺蜜妈妈情人节创意礼物表白结婚纪,76,6,48,阿波罗 之梦 AD162 玫瑰花 爱心 杯子 套装 送 女友 老婆 女生 生日礼物 实用 闺...
7094,速比涛SPEEDO新品全新设计TPR柔软游泳耳塞防水舒适防水导音游泳耳塞游泳装备配件夜深黑,76,9,45,速比 涛 SPEEDO 新品 全新 设计 TPR 柔软 游泳 耳塞 防水 舒适 防水 导音 ...
5408,欧时纳JUSTSTAR女包新款女士包包手提包韩版单肩斜挎包休闲百搭潮流小方包JS149,76,4,43,欧时纳 JUSTSTAR 女包 新款 女士 包包 手提包 韩版 单肩 斜挎包 休闲 百搭 潮...


## 把数据提取为list

In [86]:
from sklearn.model_selection import train_test_split
# 测试集中的文字及y标签
X = [i for i in train_df.cutted]
Y = [i for i in train_df.catgory]
# 测试集中的文字
test_text = [i for i in train_df.cutted]

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=10)

## 词袋模型

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
    analyzer='word',
    max_features=4000
)
vec.fit(X+test_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## 测试

In [88]:
from sklearn.naive_bayes import MultinomialNB
import sklearn

classifier = MultinomialNB()
classifier.fit(vec.transform(X_train),y_train)
print(classifier.score(vec.transform(X_test),y_test))
print(classifier.predict_proba(vec.transform([X_test[0]])))

0.696
[[3.14865395e-03 2.56072336e-04 3.19937770e-02 2.91143675e-02
  9.45331389e-03 6.19073706e-01 3.71596771e-02 9.11693847e-02
  1.78117879e-01 5.13167939e-04]]


In [89]:
help(classifier.predict)

Help on method predict in module sklearn.naive_bayes:

predict(X) method of sklearn.naive_bayes.MultinomialNB instance
    Perform classification on an array of test vectors X.
    
    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    
    Returns
    -------
    C : array, shape = [n_samples]
        Predicted target values for X



In [90]:
from sklearn.svm import SVC

classifier = SVC(kernel='linear')
classifier.fit(vec.transform(X_train),y_train)
print(classifier.score(vec.transform(X_test),y_test))

0.655


## 该函数可以输出属于每个类的概率

In [91]:
print(vec.transform([X_test[0]]))

  (0, 3380)	1
  (0, 3475)	1


## 卷积神经网络

In [97]:
"""
基于卷积神经网络的中文文本分类
"""

import argparse
import sys

import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf

learn = tf.contrib.learn

FLAGS = None

#文档最长长度
MAX_DOCUMENT_LENGTH = 100
#最小词频数
MIN_WORD_FREQUENCE = 2
#词嵌入的维度
EMBEDDING_SIZE = 20
#filter个数
N_FILTERS = 10
#感知野大小
WINDOW_SIZE = 20
#filter的形状
FILTER_SHAPE1 = [WINDOW_SIZE, EMBEDDING_SIZE]
FILTER_SHAPE2 = [WINDOW_SIZE, N_FILTERS]
#池化
POOLING_WINDOW = 4
POOLING_STRIDE = 2
n_words = 0


def cnn_model(features, target):
	"""
    2层的卷积神经网络，用于短文本分类
    """
	# 先把词转成词嵌入
	# 我们得到一个形状为[n_words, EMBEDDING_SIZE]的词表映射矩阵
	# 接着我们可以把一批文本映射成[batch_size, sequence_length, EMBEDDING_SIZE]的矩阵形式
	target = tf.one_hot(target, 15, 1, 0)
	word_vectors = tf.contrib.layers.embed_sequence(
			features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')
	word_vectors = tf.expand_dims(word_vectors, 1)
	with tf.variable_scope('CNN_Layer1'):
		# 添加卷积层做滤波
		conv1 = tf.contrib.layers.convolution2d(
				word_vectors, N_FILTERS, FILTER_SHAPE1, padding='VALID')
		# 添加RELU非线性
		conv1 = tf.nn.relu(conv1)
		# 最大池化
		pool1 = tf.nn.max_pool(
				conv1,
				ksize=[1, POOLING_WINDOW, 1, 1],
				strides=[1, POOLING_STRIDE, 1, 1],
				padding='SAME')
		# 对矩阵进行转置，以满足形状
		pool1 = tf.transpose(pool1, [0, 1, 3, 2])
	with tf.variable_scope('CNN_Layer2'):
		# 第2个卷积层
		conv2 = tf.contrib.layers.convolution2d(
                pool1, N_FILTERS, FILTER_SHAPE2, padding='VALID')
		# 抽取特征
		pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])

	# 全连接层
	logits = tf.contrib.layers.fully_connected(pool2, 15, activation_fn=None)
	loss = tf.losses.softmax_cross_entropy(target, logits)

	train_op = tf.contrib.layers.optimize_loss(
			loss,
			tf.train.get_global_step,
			optimizer='Adam',
			learning_rate=0.01)

	return ({
			'class': tf.argmax(logits, 1),
			'prob': tf.nn.softmax(logits)
	}, loss, train_op)


#构建数据集
#x_train = pandas.DataFrame(train_data)[1]
#y_train = pandas.Series(train_target)
#x_test = pandas.DataFrame(test_data)[1]
#y_test = pandas.Series(test_target)

In [98]:
tmp = ['I am good', 'you are here', 'I am glad', 'it is great']
vocab_processor = learn.preprocessing.VocabularyProcessor(10, min_frequency=1)
list(vocab_processor.fit_transform(tmp))

[array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]

In [101]:
global n_words
# 处理词汇
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH, min_frequency=MIN_WORD_FREQUENCE)
x_train = np.array(list(vocab_processor.fit_transform(X_train)))
x_test = np.array(list(vocab_processor.transform(X_test)))
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

Total words: 2762


In [103]:
cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}
# train_target = map(lambda x:cate_dic[x], train_target)
# test_target = map(lambda x:cate_dic[x], test_target)
train_target = y_train
test_target = y_test

y_train = pandas.Series(train_target)
y_test = pandas.Series(test_target)

In [104]:
# 构建模型
classifier = learn.SKCompat(learn.Estimator(model_fn=cnn_model))

# 训练和预测
classifier.fit(x_train, y_train, steps=1000)
y_predicted = classifier.predict(x_test)['class']
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c2dd1a3c8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/var/folders/f2/djk1m_h90_b43vq4lr785cb80000gn/T/tmpvporszbm'}
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Instructions for updating:
Please switch to tf.train.get_global_step
INFO:tensorflow:Creat

In [106]:
x_train

array([[2063,    0,   74, ...,    0,    0,    0],
       [1764, 2671, 2482, ...,    0,    0,    0],
       [   0,    0, 1266, ...,    0,    0,    0],
       ...,
       [   0,    0,  177, ...,    0,    0,    0],
       [2203, 1133,  418, ...,    0,    0,    0],
       [2669,  519,    0, ...,    0,    0,    0]])