In [5]:
import pandas as pd

df = pd.read_csv("../../../data/wsdm_mini.csv")
df.head()

Unnamed: 0,title1_zh,title2_zh,label
0,千叶湖八岁孩子不想去学英语，跳楼了,「辟谣」千叶湖八岁孩子跳楼了为谣言信息,disagreed
1,喝酸奶真的能补充益生菌吗？,喝酸奶来补充益生菌，靠谱么？,agreed
2,刚刚马云终于出手了！房价要跌，扬言房地产中介都要失业了,最新消息马云终于出手了！扬言房地产中介都要失业！6,agreed
3,直击“冯乡长”李正春追悼会：赵本山全程操办，赵四刘能现场祭奠,昆明会议直击“活摘”谣言,unrelated
4,李雨桐爆薛之谦离婚内幕，说到底就是网红之间的恩怨情仇嘛,薛之谦前女友李雨桐再次发微博爆料，薛之谦工作室发声明辟谣,disagreed


In [6]:
# 合并文本，然后对文本进行分词处理 

df['title_zh'] = df[['title1_zh', 'title2_zh']].apply(lambda x: ''.join(x), axis=1)  # 合并文本数据列
df_merge = df.drop(df.columns[[0, 1]], axis=1)  # 删除原文本列
df_merge.head()

Unnamed: 0,label,title_zh
0,disagreed,千叶湖八岁孩子不想去学英语，跳楼了「辟谣」千叶湖八岁孩子跳楼了为谣言信息
1,agreed,喝酸奶真的能补充益生菌吗？喝酸奶来补充益生菌，靠谱么？
2,agreed,刚刚马云终于出手了！房价要跌，扬言房地产中介都要失业了最新消息马云终于出手了！扬言房地产中介...
3,unrelated,直击“冯乡长”李正春追悼会：赵本山全程操办，赵四刘能现场祭奠昆明会议直击“活摘”谣言
4,disagreed,李雨桐爆薛之谦离婚内幕，说到底就是网红之间的恩怨情仇嘛薛之谦前女友李雨桐再次发微博爆料，薛之...


In [7]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='UTF-8') as f:
        stopwords = [line.strip('\n') for line in f.readlines()]
    return stopwords
stopwords = load_stopwords('../../../data/stopwords.txt')
stopwords[0:5]

['!', '"', '#', '$', '%']

In [8]:
from tqdm.notebook import tqdm
import jieba

# 利用 jieba 进行分词，并且取出停用词

corpus = []
for line in tqdm(df['title_zh']):
    words = []
    seg_list = list(jieba.cut(line))  # 分词
    for word in seg_list:
        if word in stopwords:  # 删除停用词
            continue
        words.append(word)
    corpus.append(words)

  0%|          | 0/15000 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\91658\AppData\Local\Temp\jieba.cache
Loading model cost 0.483 seconds.
Prefix dict has been built successfully.


In [9]:
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec(corpus)  # 词嵌入，默认 size=100

# 整个句子所有单词向量的和，得到空间向量上这个句子的向量
def sum_vec(text):
    vec = np.zeros(100).reshape((1, 100))  # 初始化一个和 Word2Vec 嵌入等长度的 0 向量
    for word in text:
        # 得到句子中每个词的词向量并累加在一起
        if word in list(model.wv.index_to_key):
            vec += model.wv.get_vector(word).reshape((1, 100))
        else:
            pass
    return vec

# 将词向量保存为 Ndarray
X = np.concatenate([sum_vec(z) for z in tqdm(corpus)])
X.shape

  0%|          | 0/15000 [00:00<?, ?it/s]

(15000, 100)

In [10]:
import tensorflow as tf

# 限定词典大小：Tokenizer 会基于你的文本数据构建一个词典（即词汇表），这个词典中会包含你文本数据中最常见的单词。num_words=10000 表示词典中只保留文本数据中最常见的前10,000个单词
# 不保留低频词
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x1fe1bf24a90>

In [17]:
# 基于提供的 corpus（语料库）构建词典
tokenizer.fit_on_texts(corpus)

# 将文本数据转换为数字序列
X_ = tokenizer.texts_to_sequences(corpus)
X_[0]

[8050,
 3262,
 6997,
 16,
 690,
 8051,
 199,
 19,
 1,
 8050,
 3262,
 6997,
 16,
 199,
 2,
 239]

In [19]:
for seq in X_[:1]:
    print([tokenizer.index_word[idx] for idx in seq])

# 补充输入序列长度，这里是限定输入序列，并不是实际向量，向量要经过 embedding，X_ 是词典索引数组
# 索引序列是将文本中的每个词（或字）转换为一个唯一的整数表示，这种表示形式便于后续操作。每个索引都代表了词典中的一个词，使得整个输入序列能够统一处理
X = tf.keras.preprocessing.sequence.pad_sequences(X_, maxlen=20)
X.shape,X[0]

['千叶', '湖', '八岁', '孩子', '不想', '英语', '跳楼', '「', '辟谣', '千叶', '湖', '八岁', '孩子', '跳楼', '谣言', '信息']


((15000, 20),
 array([   0,    0,    0,    0, 8050, 3262, 6997,   16,  690, 8051,  199,
          19,    1, 8050, 3262, 6997,   16,  199,    2,  239]))

In [20]:
# 将 label 转化为 0、1 等状态，方便计算
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
y_onehot = encoder.fit_transform(df.label.values.reshape(len(df), -1))
y_onehot

<15000x3 sparse matrix of type '<class 'numpy.float64'>'
	with 15000 stored elements in Compressed Sparse Row format>

In [25]:
model = tf.keras.Sequential()
# input_length 每个输入序列的固定长度是 20 个词
# 10000，嵌入层的输入维度 === 词典长度，模型只考虑词典中前 10,000 个最常见的词
# 16 输出向量维度，每个词将被转换为一个 16 维的向量
model.add(tf.keras.layers.Embedding(10000, 16, input_length=20))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.summary()

In [45]:
y_onehot

<15000x3 sparse matrix of type '<class 'numpy.float64'>'
	with 15000 stored elements in Compressed Sparse Row format>

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot.toarray(), test_size=0.2)

In [47]:
y_train

array([[0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [48]:
model.compile(optimizer='Adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, 64, 10, validation_data=(X_test, y_test))

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node sequential_3_1/embedding_3_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\asyncio\base_events.py", line 608, in run_forever

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\asyncio\base_events.py", line 1936, in _run_once

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\asyncio\events.py", line 84, in _run

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\91658\AppData\Local\Temp\ipykernel_118080\1008936545.py", line 3, in <module>

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 329, in fit

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 122, in one_step_on_iterator

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 110, in one_step_on_data

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 57, in train_step

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\layers\layer.py", line 826, in __call__

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\models\sequential.py", line 206, in call

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\models\functional.py", line 199, in call

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\ops\function.py", line 151, in _run_through_graph

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\models\functional.py", line 583, in call

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\layers\layer.py", line 826, in __call__

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\ops\operation.py", line 48, in __call__

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\layers\core\embedding.py", line 130, in call

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\ops\numpy.py", line 4962, in take

  File "C:\Users\91658\Miniconda3\envs\ml\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 1740, in take

indices[0,0] = 0 is not in [0, 0)
	 [[{{node sequential_3_1/embedding_3_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_2230]