<img style="float: right;" src="https://doc.shiyanlou.com/document-uid214893labid7506timestamp1555400601684.png">

# 深度学习完成假新闻分类

---

<i class="fa fa-exclamation-circle" aria-hidden="true"></i> 以下内容仅保留挑战参考答案代码部分，完整挑战请到原课程页面查看。

---

In [None]:
# Fetch the two input files this notebook reads; -nc skips a file that already exists locally.
!wget -nc "http://labfile.oss.aliyuncs.com/courses/1233/wsdm_mini.csv"  # fake-news dataset
!wget -nc "http://labfile.oss.aliyuncs.com/courses/1176/stopwords.txt"  # stop-word dictionary

In [None]:
import pandas as pd

# Load the fake-news dataset into a DataFrame.
df = pd.read_csv("wsdm_mini.csv")
# Merge the two title columns into a single text column. For string columns,
# column-wise addition yields the same text as a row-wise apply(''.join)
# without making one Python-level call per row.
df['title_zh'] = df['title1_zh'] + df['title2_zh']
df.head()

In [None]:
import jieba
from tqdm import tqdm_notebook

def load_stopwords(file_path, encoding='utf-8'):
    """Read a stop-word file and return its entries as a list.

    Args:
        file_path: Path to a text file holding one stop word per line.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list: One string per line, in file order, with newline characters
        stripped from both ends. Other whitespace is preserved, so a line
        holding a single space yields ' '.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the handle directly rather than materialising readlines() first.
        stopwords = [line.strip('\n') for line in f]
    return stopwords

stopwords = load_stopwords('stopwords.txt')
# A membership test against a list scans it for every token; a set built once
# keeps each lookup in the filter below at constant time.
stopword_set = set(stopwords)

corpus = []
for line in tqdm_notebook(df['title_zh']):
    # Segment the title with jieba and drop every token that is a stop word.
    words = [word for word in jieba.cut(line) if word not in stopword_set]
    # Store the kept tokens as one space-separated string per title.
    corpus.append(" ".join(words))

In [None]:
import tensorflow as tf

# Vocabulary cap handed to the tokenizer through `num_words`.
VOCAB_SIZE = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer

In [None]:
# Build the tokenizer's word index from the segmented, space-joined titles.
tokenizer.fit_on_texts(corpus)
# Encode every title as a list of integer word indices.
X_ = tokenizer.texts_to_sequences(corpus)

In [None]:
# Sanity check: map the first encoded title back to its words and print them.
for token_ids in X_[:1]:
    decoded = [tokenizer.index_word[token_id] for token_id in token_ids]
    print(decoded)

In [None]:
# Pad or truncate every index sequence to a length of 20 so the samples form one 2-D array.
X = tf.keras.preprocessing.sequence.pad_sequences(X_, maxlen=20)
X.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# The encoder expects a 2-D input, so the label column is reshaped to (n, 1).
# fit_transform returns a SciPy sparse matrix by default; convert it to a dense
# array so the targets reach Keras as ordinary one-hot rows.
y_onehot = encoder.fit_transform(df['label'].values.reshape(-1, 1)).toarray()
y_onehot

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split, and therefore the reported metrics, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42)

In [None]:
# Embedding -> Flatten -> softmax classifier, declared as a single layer list.
model = tf.keras.Sequential([
    # Look up a 16-dimensional vector for each of the 20 word indices per sample.
    tf.keras.layers.Embedding(10000, 16, input_length=20),
    # Collapse the per-word vectors into one flat feature vector per sample.
    tf.keras.layers.Flatten(),
    # Three-unit softmax output layer.
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.summary()

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# targets, accuracy reported as the metric.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train for 10 epochs in batches of 64, validating on the held-out split.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

---

<img src="https://doc.shiyanlou.com/document-uid214893labid7506timestamp1545810029884.png">