In [1]:
import regex as re
import pandas as pd
import os
import random
import numpy as np
from imblearn.over_sampling import SMOTE
import warnings
# Install an "ignore" filter for UserWarning.
# NOTE(review): `module` is a regular expression matched against the name of the
# module that raises the warning. The literal 'your module name' looks like an
# unfilled template placeholder, so this filter presumably suppresses nothing —
# confirm which module was intended and replace the placeholder.
warnings.filterwarnings("ignore", category=UserWarning, module='your module name')

In [2]:
# --- Load inputs --------------------------------------------------------------
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects. Only load
# these .npy/.npz files when they come from a trusted source.
sentiment_dict = np.load('sentiment_dict.npy',allow_pickle=True).item()
word_dict = np.load('word_dict.npz',allow_pickle=True)['word_dict'].item()
df = pd.read_csv('data.csv',index_col=0)
sentiments = df.sentiments
index = df.index

# --- Tokenise -----------------------------------------------------------------
# Work on an explicit copy: writing token lists into `df['seg_text'].values`
# would either overwrite the DataFrame column in place or fail on a read-only
# array (pandas Copy-on-Write). `docs` stays an object array of token lists.
docs = df['seg_text'].to_numpy(dtype=object, copy=True)
for row, text in enumerate(docs):
    docs[row] = text.split(' ')

# --- Document vectors: sum of the word vectors found in `word_dict` -----------
# Tokens missing from `word_dict` contribute nothing; a document with no known
# token keeps the all-zero vector.
fastText_vecs = []
for tokens in docs:
    vec = np.zeros(300)
    for token in tokens:
        if token in word_dict:
            vec += word_dict[token]
    fastText_vecs.append(vec)

# reshape(-1, 300) keeps the array 2-D even when there are no documents.
fastText_vecs_300 = np.array(fastText_vecs).reshape(-1, 300)
# The 100-dimensional variant is the first 100 components of each document
# vector; copy so it does not share memory with the 300-dimensional array.
fastText_vecs_100 = fastText_vecs_300[:, :100].copy()

columns100 = ['topic{}'.format(i) for i in range(fastText_vecs_100.shape[1])]
columns300 = ['topic{}'.format(i) for i in range(fastText_vecs_300.shape[1])]

# --- Sentiment-weighted variants ----------------------------------------------
# Use the sentiment values by POSITION: `sentiments[i]` is a label lookup on an
# index read from the CSV and only works when that index is exactly 0..n-1.
# The column vector broadcasts one sentiment value across each document row.
sentiment_weights = sentiments.to_numpy()[:, np.newaxis]

fastText_doc_100 = pd.DataFrame(fastText_vecs_100,columns=columns100,index=index)
fastText_vecs_100_sentiment = fastText_vecs_100 * sentiment_weights
fastText_doc_100_sentiment = pd.DataFrame(fastText_vecs_100_sentiment,columns=columns100,index=index)
fastText_doc_300 = pd.DataFrame(fastText_vecs_300,columns=columns300,index=index)
fastText_vecs_300_sentiment = fastText_vecs_300 * sentiment_weights
fastText_doc_300_sentiment = pd.DataFrame(fastText_vecs_300_sentiment,columns=columns300,index=index)

In [3]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

def lda_predic (name,vector,df,over_sampling=False):
    """Fit an LDA classifier on `vector` and report its hold-out performance.

    The rows of `vector` and `df.rumor` are split 80/20 (random_state=42). The
    classifier is fitted on the training part, optionally after SMOTE
    over-sampling. Predictions for every row of `vector` are stored in
    `df[name]`, and the classification report and confusion matrix for the
    test rows are printed and written to disk.

    Parameters
    ----------
    name : str
        Column added to `df` for the predictions; also the stem of the output
        files `report/<name>.txt`, `report_matrix/<name>_matrix.txt` and
        `report_matrix/<name>_matrix.npz`.
    vector : array-like or DataFrame
        Feature matrix, one row per row of `df`.
    df : pandas.DataFrame
        Must contain a `rumor` label column. It is modified in place: the
        column `name` is added or overwritten.
    over_sampling : bool, default False
        When True, balance the training split with SMOTE (random_state=42)
        before fitting and print the class counts before and after.

    Returns
    -------
    None
    """
    from sklearn import metrics

    X_train,X_test,y_train,y_test = train_test_split(vector,df.rumor,test_size=0.2 ,random_state=42)

    # Fit on the plain training split by default, so the function also works
    # with over_sampling=False (the resampled names used to be undefined then).
    X_fit, y_fit = X_train, y_train
    if over_sampling:
        smote = SMOTE(random_state=42)
        # Over-sample the training split only; the test split is left untouched.
        X_fit, y_fit = smote.fit_resample(X_train, y_train)
        print("过采样前训练集中类别0的数量：", sum(y_train==0))
        print("过采样前训练集中类别1的数量：", sum(y_train==1))

        print("过采样后训练集中类别0的数量：", sum(y_fit==0))
        print("过采样后训练集中类别1的数量：", sum(y_fit==1))

    lda = LDA(n_components=1)
    lda = lda.fit(X_fit, y_fit)
    # Predictions are stored for every row (train and test alike).
    df[name] = lda.predict(vector)

    # Evaluate on the held-out rows only, looked up once by their index labels.
    test_rows = df.loc[y_test.index]
    y_true = test_rows.rumor
    y_pred = test_rows[name]

    report = metrics.classification_report(y_true, y_pred, target_names=["non-rumor","rumor"], digits=4)
    confusion = metrics.confusion_matrix(y_true, y_pred)

    # Create the output folders on demand so a fresh checkout does not fail.
    os.makedirs("report", exist_ok=True)
    os.makedirs("report_matrix", exist_ok=True)

    print(report)
    with open("report/"+name+'.txt', 'w', encoding='utf-8') as f:
        print(report,file=f)
    print(confusion)
    with open("report_matrix/"+name+'_matrix'+'.txt', 'w', encoding='utf-8') as f:
        print(confusion,file=f)
    np.savez("report_matrix/"+name+'_matrix'+'.npz', matrix= confusion)

In [4]:
# Run the LDA experiment once per feature set, always with SMOTE over-sampling.
# Each entry pairs the prediction column / report name with its feature frame;
# the order matches the order in which the reports are printed.
for column_name, feature_frame in (
    ('fastText_lda_100', fastText_doc_100),
    ('fastText_lda_100_sentiment', fastText_doc_100_sentiment),
    ('fastText_lda_300', fastText_doc_300),
    ('fastText_lda_300_sentiment', fastText_doc_300_sentiment),
):
    lda_predic(column_name, feature_frame, df, over_sampling=True)

过采样前训练集中类别0的数量： 1502
过采样前训练集中类别1的数量： 725
过采样后训练集中类别0的数量： 1502
过采样后训练集中类别1的数量： 1502
              precision    recall  f1-score   support

   non-rumor     1.0000    0.6883    0.8154       369
       rumor     0.6205    1.0000    0.7658       188

    accuracy                         0.7935       557
   macro avg     0.8102    0.8442    0.7906       557
weighted avg     0.8719    0.7935    0.7987       557

[[254 115]
 [  0 188]]
过采样前训练集中类别0的数量： 1502
过采样前训练集中类别1的数量： 725
过采样后训练集中类别0的数量： 1502
过采样后训练集中类别1的数量： 1502
              precision    recall  f1-score   support

   non-rumor     0.9905    0.5637    0.7185       369
       rumor     0.5360    0.9894    0.6953       188

    accuracy                         0.7074       557
   macro avg     0.7632    0.7765    0.7069       557
weighted avg     0.8371    0.7074    0.7107       557

[[208 161]
 [  2 186]]
过采样前训练集中类别0的数量： 1502
过采样前训练集中类别1的数量： 725
过采样后训练集中类别0的数量： 1502
过采样后训练集中类别1的数量： 1502
              precision    recall  f1-score   suppo