In [1]:
import pandas as pd

# 读取本地 CSV 文件
file_path = "/Users/jerry/Desktop/gossipcop_real.csv"
real_news = pd.read_csv(file_path)

# 查看前几行
print(real_news.head())

# 查看列名
print("\nColumns:", real_news.columns)

# 检查是否有缺失值
print("\nMissing Values:")
print(real_news.isnull().sum())

# 查看数据总数
print("\nTotal Entries:", len(real_news))

                 id                                           news_url  \
0  gossipcop-882573  https://www.brides.com/story/teen-mom-jenelle-...   
1  gossipcop-875924  https://www.dailymail.co.uk/tvshowbiz/article-...   
2  gossipcop-894416        https://en.wikipedia.org/wiki/Quinn_Perkins   
3  gossipcop-857248  https://www.refinery29.com/en-us/2018/03/19192...   
4  gossipcop-884684  https://www.cnn.com/2017/10/04/entertainment/c...   

                                               title  \
0  Teen Mom Star Jenelle Evans' Wedding Dress Is ...   
1  Kylie Jenner refusing to discuss Tyga on Life ...   
2                                      Quinn Perkins   
3  I Tried Kim Kardashian's Butt Workout & Am For...   
4  Celine Dion donates concert proceeds to Vegas ...   

                                           tweet_ids  
0  912371411146149888\t912371528343408641\t912372...  
1  901989917546426369\t901989992074969089\t901990...  
2  931263637246881792\t931265332022579201\t931265... 

In [2]:
import pandas as pd

# 读取本地 CSV 文件
file_path = "/Users/jerry/Desktop/gossipcop_fake.csv"
fake_news = pd.read_csv(file_path)

# 查看前几行
print("Fake News Sample:")
print(fake_news.head())

# 查看列名
print("\nColumns:", fake_news.columns)

# 检查是否有缺失值
print("\nMissing Values:")
print(fake_news.isnull().sum())

# 查看数据总数
print("\nTotal Entries:", len(fake_news))

Fake News Sample:
                     id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   
2   gossipcop-941805037  variety.com/2017/biz/news/tax-march-donald-tru...   
3  gossipcop-2547891536  www.dailymail.co.uk/femail/article-3499192/Do-...   
4  gossipcop-5476631226  variety.com/2018/film/news/list-2018-oscar-nom...   

                                               title  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   
2  Celebrities Join Tax March in Protest of Donal...   
3  Cindy Crawford's daughter Kaia Gerber wears a ...   
4      Full List of 2018 Oscar Nominations – Variety   

                                           tweet_ids  
0  284329075902926848\t284332744559968256\t284335...  
1  992895508267130880\t992897935418503169\t992899...  
2  85335935

In [3]:
import pandas as pd

# 读取数据（请确保路径正确）
real_news = pd.read_csv("/Users/jerry/Desktop/gossipcop_real.csv")
fake_news = pd.read_csv("/Users/jerry/Desktop/gossipcop_fake.csv")

# 添加标签：真实新闻为1，假新闻为0
real_news["label"] = 1
fake_news["label"] = 0

# 合并数据集
df = pd.concat([real_news, fake_news], ignore_index=True)

# 检查类别分布
print("类别分布：")
print(df["label"].value_counts())

# 只保留新闻标题和标签（后续只用标题进行文本分类）
df = df[["title", "label"]]

# 保存合并后的数据（可选）
df.to_csv("/Users/jerry/Desktop/gossipcop_cleaned.csv", index=False)

类别分布：
label
1    16817
0     5323
Name: count, dtype: int64


In [4]:
import re
from nltk.corpus import stopwords

# 下载停用词（如果未下载）
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # 转为小写
    text = re.sub(r'[^\w\s]', '', text)  # 去除标点
    # 可选择性去除停用词
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# 添加清洗后的文本（仅用于 TF-IDF 和 Word2Vec）
df["clean_title"] = df["title"].apply(clean_text)

[nltk_data] Downloading package stopwords to /Users/jerry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# 使用清洗后的文本
X = df["clean_title"]
y = df["label"]

# 划分训练/验证/测试集（80/10/10）
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# TF-IDF 特征提取（可调 max_features 参数）
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 训练逻辑回归模型
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train_tfidf, y_train)

# 预测及评估
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
print("TF-IDF + 逻辑回归 分类报告：")
print(classification_report(y_test, y_pred_tfidf))
print("AUROC:", roc_auc_score(y_test, clf_tfidf.predict_proba(X_test_tfidf)[:,1]))

TF-IDF + 逻辑回归 分类报告：
              precision    recall  f1-score   support

           0       0.83      0.47      0.60       533
           1       0.85      0.97      0.91      1681

    accuracy                           0.85      2214
   macro avg       0.84      0.72      0.75      2214
weighted avg       0.85      0.85      0.83      2214

AUROC: 0.8711813860462314


In [6]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.svm import SVC

# 对训练集文本进行分词（基于清洗后的文本）
sentences = [text.split() for text in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, min_count=1, workers=4)

def avg_vector(text, model):
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

X_train_w2v = np.array([avg_vector(text, w2v_model) for text in X_train])
X_test_w2v = np.array([avg_vector(text, w2v_model) for text in X_test])

# 使用 SVM 分类器
svm_clf = SVC(probability=True)
svm_clf.fit(X_train_w2v, y_train)
y_pred_w2v = svm_clf.predict(X_test_w2v)

print("Word2Vec + SVM 分类报告：")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_w2v))

Word2Vec + SVM 分类报告：
              precision    recall  f1-score   support

           0       0.77      0.23      0.35       533
           1       0.80      0.98      0.88      1681

    accuracy                           0.80      2214
   macro avg       0.78      0.60      0.62      2214
weighted avg       0.79      0.80      0.75      2214



In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# 加载数据
df = pd.read_csv("/Users/jerry/Desktop/gossipcop_cleaned.csv")

X = df["title"]
y = df["label"]

# 划分数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 特征提取：简单的长度统计特征
def simple_length_features(texts):
    features = []
    for text in texts:
        features.append([
            len(text),                      # 标题字符总数
            len(text.split()),              # 标题单词数
            np.mean([len(w) for w in text.split()]),  # 单词平均长度
        ])
    return np.array(features)

train_features = simple_length_features(X_train)
test_features = simple_length_features(X_test)

# 训练逻辑回归分类器
clf = LogisticRegression(max_iter=1000)
clf.fit(train_features, y_train)

# 预测和评估
y_pred = clf.predict(test_features)
y_prob = clf.predict_proba(test_features)[:, 1]

print("分类报告（标题长度特征）:")
print(classification_report(y_test, y_pred))

auc = roc_auc_score(y_test, y_prob)
print("AUROC:", auc)


分类报告（标题长度特征）:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1065
           1       0.76      1.00      0.86      3363

    accuracy                           0.76      4428
   macro avg       0.38      0.50      0.43      4428
weighted avg       0.58      0.76      0.66      4428

AUROC: 0.540773873092854


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
