# 对中文新闻标题进行分类

本示例使用从今日头条客户端抓取的新闻标题数据集，演示如何用 Amazon SageMaker 内置算法 BlazingText 对中文新闻标题进行分类。

原数据集下载地址：https://github.com/skdjfla/toutiao-text-classfication-dataset

In [19]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
from random import shuffle

# SageMaker session used below for the S3 uploads and passed to the estimator.
sess = sagemaker.Session()

role = get_execution_role()
print(role) # IAM role that SageMaker uses to access AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
# NOTE: a later cell reassigns prefix to 'blazingtext/toutiao' before any visible cell reads it,
# so the value set here is never used by the cells shown in this notebook.
prefix = 'blazingtext/supervised/toutiao' #Replace with the prefix under which you want to store the data if needed

In [2]:
# Install jieba (used below to segment the Chinese titles into words) from the PyPI mirror given by -i.
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple jieba

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[33mYou are using pip version 10.0.1, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import jieba
import re

In [4]:
# Build the mapping from category code (e.g. '100') to category name (e.g. '民生').
# Every non-empty line of classes.txt is read as "<code>,<name>".
index_to_label = {}
# Explicit UTF-8: the category names are Chinese, and open() otherwise falls back
# to the platform's locale encoding, which is not guaranteed to be UTF-8.
with open("classes.txt", encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.strip().split(',')
        if fields == ['']:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        index_to_label[fields[0]] = fields[1]
print(index_to_label)

{'100': '民生', '101': '文化', '102': '娱乐', '103': '体育', '104': '财经', '106': '房产', '107': '汽车', '108': '教育', '109': '科技', '110': '军事', '112': '旅游', '113': '国际', '114': '证券', '115': '农业', '116': '电竞'}


### 使用分词

In [None]:
!unzip toutiao_cat_data.txt.zip

In [53]:
# Turn the raw Toutiao dump into BlazingText training rows.
# Each input line is split on '_!_'; field 1 is the category code and field 3 is
# the news title (the only two fields this cell reads).
# Each output row is ['__label__<category name>', token, token, ...].
file = 'toutiao_cat_data.txt'

# Remove every character that is not a letter, digit or CJK character, i.e.
# whitespace, punctuation, symbols and '_'.
# The earlier hand-written character class contained the accidental range ':-【'
# (U+003A..U+3010), which also deleted every Latin letter, and it listed the
# Chinese character '一', so titles silently lost real words.
_NON_WORD_RE = re.compile(r"[\W_]+")


def _title_to_row(raw_line):
    """Convert one raw dataset line into a BlazingText row.

    Returns ['__label__<name>', *tokens], where tokens are the jieba
    (cut_all=False) segmentation of the cleaned title, or None when the line
    has fewer than four '_!_'-separated fields (blank or truncated line).

    Raises KeyError if the category code is missing from index_to_label, so a
    mismatch between the dataset and classes.txt is not hidden.
    """
    fields = raw_line.split('_!_')
    if len(fields) < 4:
        return None
    category = index_to_label[fields[1]]
    title = _NON_WORD_RE.sub("", fields[3])
    return ['__label__' + category] + list(jieba.cut(title, cut_all=False))


# Explicit UTF-8 so reading the Chinese text does not depend on the host locale.
with open(file, encoding='utf-8') as f:
    lines = f.readlines()

labels = []
skipped = 0
for line in lines:
    row = _title_to_row(line)
    if row is None:
        skipped += 1
        continue
    labels.append(row)

if skipped:
    print('Skipped {} malformed line(s)'.format(skipped))

# NOTE: shuffle() is not seeded, so the row order (and therefore the
# train/validation split made from it) differs between runs.
shuffle(labels)
print(labels[0:5])

[['__label__财经', '偿付能力', '逼近', '监管', '红线', '弘康', '人寿', '补血', '涉险', '过关'], ['__label__国际', '越南', '拥有', '110', '余万', '军队', '为何', '越南', '军队', '最', '擅长', '布置', '竹签', '阵'], ['__label__娱乐', '网友', '偶遇', '钟丽缇', '在', '妇产科', '检查'], ['__label__汽车', '是', '做梦', '吗', '网传', '只花', '几十元', '爱车', '隔音', '效果', '堪比', '豪车'], ['__label__电竞', '衣之国', '今日', '不', '删档', '测试', '满足', '你', '的', '美丽', '梦想']]


In [54]:
# S3 key prefix under which the train/validation files and the model output are stored.
# This replaces the value assigned to prefix in the first cell.
prefix = 'blazingtext/toutiao'

In [55]:
# 80/20 split of the shuffled rows: first 80% for training, the rest for validation.
split_at = int(len(labels) * 0.8)
t_train_data, t_validation_data = labels[:split_at], labels[split_at:]

In [56]:
# Show the first 13 training rows to check the '__label__<name>' + tokens layout.
t_train_data[0:13]

[['__label__财经', '偿付能力', '逼近', '监管', '红线', '弘康', '人寿', '补血', '涉险', '过关'],
 ['__label__国际',
  '越南',
  '拥有',
  '110',
  '余万',
  '军队',
  '为何',
  '越南',
  '军队',
  '最',
  '擅长',
  '布置',
  '竹签',
  '阵'],
 ['__label__娱乐', '网友', '偶遇', '钟丽缇', '在', '妇产科', '检查'],
 ['__label__汽车',
  '是',
  '做梦',
  '吗',
  '网传',
  '只花',
  '几十元',
  '爱车',
  '隔音',
  '效果',
  '堪比',
  '豪车'],
 ['__label__电竞', '衣之国', '今日', '不', '删档', '测试', '满足', '你', '的', '美丽', '梦想'],
 ['__label__房产', '房屋买卖', '合同', '签署', '后', '卖方', '要求', '加价', '怎么办'],
 ['__label__科技',
  '支付宝',
  '出新招',
  '余额',
  '宝',
  '迎来',
  '新',
  '变化',
  '马云',
  '这下',
  '又',
  '要',
  '成功',
  '了'],
 ['__label__电竞',
  '西游',
  '释厄传',
  '看着',
  '蜘蛛',
  '王',
  '缓缓',
  '升起',
  '我',
  '就',
  '知道',
  '少不了',
  '顿',
  '揍',
  '了'],
 ['__label__教育', '为什么', '要', '考研', '考研', '有', '什么', '意义'],
 ['__label__文化',
  '中国',
  '颜体',
  '书法',
  '特展',
  '第三届',
  '宋璟',
  '碑',
  '颜体',
  '书法展',
  '特邀',
  '作品',
  '及',
  '入围',
  '作品'],
 ['__label__农业',
  '勉县',
  '老道',
  '寺镇',
  '张家湾',
  '村三变',
  '改革',


In [58]:
import csv

t_train_file = 'tt.train'
t_validation_file = 'tt.validation'


def _write_rows(path, rows):
    """Write each row as one line of space-separated fields (label first, then tokens).

    The file is opened with encoding='utf-8' so the Chinese tokens are written
    the same way regardless of the notebook host's locale, and with newline=''
    (as the csv module requires) so the '\n' line terminator is not translated.
    """
    with open(path, 'w', encoding='utf-8', newline='') as out:
        csv.writer(out, delimiter=' ', lineterminator='\n').writerows(rows)


_write_rows(t_train_file, t_train_data)
_write_rows(t_validation_file, t_validation_data)

In [59]:
%%time

# S3 key prefixes (under `prefix`) for the two input channels.
t_train_channel = prefix + '/train'
t_validation_channel = prefix + '/validation'

# Upload the two local files. The filename variables are used instead of
# repeating the literals, so this cell stays in sync with the cell that wrote them.
sess.upload_data(path=t_train_file, bucket=bucket, key_prefix=t_train_channel)
sess.upload_data(path=t_validation_file, bucket=bucket, key_prefix=t_validation_channel)

# S3 URIs of the channel prefixes (not of the individual files).
s3_train_data = 's3://{}/{}'.format(bucket, t_train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, t_validation_channel)

# Passed to the estimator as output_path.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

CPU times: user 285 ms, sys: 91.5 ms, total: 377 ms
Wall time: 731 ms


In [None]:
# S3 location passed to the estimator as output_path.
# Same expression as the assignment at the end of the upload cell; presumably repeated
# so this cell can be run without re-uploading the data.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [None]:
# Look up the "blazingtext" algorithm image ("latest" tag) for the region of the current boto3 session.
# NOTE(review): get_image_uri looks like the SageMaker Python SDK v1 helper — confirm it exists in the
# installed SDK version before upgrading the SDK.
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

In [60]:
# Estimator for the BlazingText training job: a single ml.c4.4xlarge instance,
# 'File' input mode, and model output written under s3_output_location.
t_bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# Hyperparameters for supervised (text classification) mode, collected in one
# place so they are easy to read and tweak before being applied to the estimator.
bt_hyperparameters = dict(
    mode="supervised",
    epochs=10,
    min_count=2,
    learning_rate=0.05,
    vector_dim=10,
    early_stopping=True,
    patience=4,
    min_epochs=5,
    word_ngrams=2,
)
t_bt_model.set_hyperparameters(**bt_hyperparameters)

In [63]:
# Describe the two S3 prefixes as plain-text input channels for the training job.
# The channel objects get their own names instead of rebinding t_train_data /
# t_validation_data: those names hold the tokenized rows, and overwriting them
# made the cell that writes tt.train / tt.validation fail when re-run.
t_train_input = sagemaker.inputs.s3_input(s3_train_data, distribution='FullyReplicated',
                        content_type='text/plain', s3_data_type='S3Prefix')
t_validation_input = sagemaker.inputs.s3_input(s3_validation_data, distribution='FullyReplicated',
                             content_type='text/plain', s3_data_type='S3Prefix')
t_data_channels = {'train': t_train_input, 'validation': t_validation_input}

In [64]:
# Start the training job on the 'train' and 'validation' channels; logs=True prints the job's log output here.
t_bt_model.fit(inputs=t_data_channels, logs=True)

2020-05-23 07:15:43 Starting - Starting the training job...
2020-05-23 07:15:55 Starting - Launching requested ML instances......
2020-05-23 07:16:56 Starting - Preparing the instances for training......
2020-05-23 07:18:03 Downloading - Downloading input data...
2020-05-23 07:18:44 Training - Training image download completed. Training in progress.
2020-05-23 07:18:44 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[05/23/2020 07:18:29 INFO 140690757523264] nvidia-smi took: 0.0252668857574 secs to identify 0 gpus[0m
[34m[05/23/2020 07:18:29 INFO 140690757523264] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[05/23/2020 07:18:29 INFO 140690757523264] Processing /opt/ml/input/data/train/tt.train . File size: 24 MB[0m
[34m[05/23/2020 07:18:29 INFO 140690757523264] Processing /opt/ml/input/data/validation/tt.validation . File size: 6 MB[0m
[34mRead 3M words[0m
[34mNumber of words:  81004[0m
[34mLoading validation dat

In [67]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance.
# NOTE(review): no visible cell deletes this endpoint afterwards — confirm it is cleaned up
# (e.g. via t_text_classifier.delete_endpoint()) once you are done experimenting.
t_text_classifier = t_bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.t2.medium')

Using already existing model: blazingtext-2020-05-23-07-15-43-787


-------------!

In [93]:
# Headline to classify.
sentences = "邓超加入春晚"
# Other headlines to try, one at a time:
#   "宝马推出新车型", "亚马逊云计算Q1营收过百亿", "美国航空母舰开往伊朗波斯湾",
#   "北京迎来黄金周小高峰", "C罗纳尔多还是梅西"

# Mirror the training-data preparation: strip whitespace/punctuation, then segment
# with jieba (the tokenizer used when building the training rows) and join the
# tokens with single spaces.
cleaned_sentence = re.sub(r"[\W_]+", "", sentences)
tokenized_sentences = [' '.join(jieba.cut(cleaned_sentence, cut_all=False))]

payload = {"instances" : tokenized_sentences}
# Optional, per the BlazingText inference request format: add
# "configuration": {"k": 2} to the payload to ask for more than one label per instance.

t_response = t_text_classifier.predict(json.dumps(payload))

t_predictions = json.loads(t_response)
# ensure_ascii=False prints the Chinese label text itself instead of \uXXXX escapes.
print(json.dumps(t_predictions, indent=2, ensure_ascii=False))
t_predictions[0]['label']

[
  {
    "prob": [
      1.0
    ],
    "label": [
      "__label__\u5a31\u4e50"
    ]
  }
]


['__label__娱乐']