In [38]:
import pickle
import copy
import random
import pandas as pd
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler


In [37]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same seed is handed to Python's ``random`` module, NumPy's global
    generator and PyTorch (CPU and CUDA). cuDNN is then switched to
    deterministic algorithms with benchmarking turned off.

    Args:
        seed: Value passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,                 # Python's random module
        np.random.seed,              # NumPy's random module
        torch.manual_seed,           # PyTorch CPU generator
        torch.cuda.manual_seed,      # PyTorch GPU generator
        torch.cuda.manual_seed_all,  # every GPU, for multi-GPU setups
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Make cuDNN use deterministic algorithms ...
    torch.backends.cudnn.deterministic = True
    # ... and stop it from auto-selecting the fastest convolution algorithm.
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [9]:
class TweetDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    The features and labels are stored unchanged on the public attributes
    ``X`` and ``y``; other code in this notebook reads ``X`` directly
    (e.g. ``dataset.X.shape[1]`` to size the model input).

    Args:
        features: Indexable container of samples (a 2-D tensor in this notebook).
        labels: Indexable container of targets, one per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore the surplus labels.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.X = features
        self.y = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
def _expand_vector_features(df, feature_names):
    """Expand list/array-valued columns into one numeric column per component."""
    expanded = []
    for col in feature_names:
        # The width is read from the first row only; all rows of a column are
        # expected to hold vectors of that same length.
        width = len(df[col].iloc[0])
        expanded.append(
            pd.DataFrame(
                df[col].tolist(),
                index=df.index,
                columns=[f"{col}_{i}" for i in range(width)],
            )
        )
    return pd.concat(expanded, axis=1)


def prepare_dataset_from_file(
    file_path,
    continue_feature_list=None,
    cat_feature_list=None,
    hidden_feature_list=None,
    target_col='shared_class'
):
    """Load a pickled DataFrame and convert it into a ``TweetDataset`` of tensors.

    Up to three feature groups are built and concatenated column-wise in this
    order: standardized continuous columns, one-hot encoded categorical
    columns, and vector-valued columns expanded to one column per component.
    A group whose list is empty or ``None`` is skipped.

    Args:
        file_path (str): Path of the pickle file; it is expected to hold a
            pandas DataFrame.
        continue_feature_list (list of str or None): Continuous columns,
            standardized with a freshly fitted ``StandardScaler``.
        cat_feature_list (list of str or None): Categorical columns, cast to
            ``str`` and one-hot encoded with ``pd.get_dummies``.
        hidden_feature_list (list of str or None): Columns whose cells hold
            fixed-length vectors (lists/arrays).
        target_col (str): Name of the label column.

    Returns:
        TweetDataset: Features as a ``float32`` tensor and labels as a
        ``long`` tensor.

    Raises:
        ValueError: If all three feature lists are empty or ``None``.

    Notes:
        * The scaler is fitted on every row of the file and is not returned.
          If the dataset is split afterwards, the scaling statistics therefore
          include the validation/test rows.
        * ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    if not (continue_feature_list or cat_feature_list or hidden_feature_list):
        raise ValueError(
            "At least one of continue_feature_list, cat_feature_list or "
            "hidden_feature_list must be non-empty."
        )

    # SECURITY: unpickling runs code embedded in the file - trusted input only.
    with open(file_path, 'rb') as f:
        df = pickle.load(f)

    feature_parts = []

    # Continuous features: zero mean / unit variance per column.
    if continue_feature_list:
        scaler = StandardScaler()
        X_cont_scaled = scaler.fit_transform(df[continue_feature_list])
        feature_parts.append(
            pd.DataFrame(X_cont_scaled, columns=continue_feature_list, index=df.index)
        )

    # Categorical features: cast to str so numeric codes are one-hot encoded
    # too. Work on a copy instead of overwriting columns of the loaded frame.
    if cat_feature_list:
        cat_as_str = df[cat_feature_list].astype(str)
        feature_parts.append(pd.get_dummies(cat_as_str, drop_first=False))

    # Text / vector features: one column per vector component.
    if hidden_feature_list:
        feature_parts.append(_expand_vector_features(df, hidden_feature_list))

    X = pd.concat(feature_parts, axis=1).astype(np.float32).values
    y = df[target_col].values
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)

    return TweetDataset(X_tensor, y_tensor)

In [11]:
class MLP(nn.Module):
    """Feed-forward classifier with three hidden blocks and a linear head.

    Each hidden block is ``Linear -> BatchNorm1d -> ReLU -> Dropout``. The
    layers are stored in ``self.net`` in the same order as before, so
    ``state_dict`` keys (``net.0`` ... ``net.12``) are unchanged.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output logits.
        dropout: Dropout probability used after every hidden block
            (defaults to the previously hard-coded 0.2).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.2):
        super().__init__()
        layers = []
        in_features = input_dim
        for _ in range(3):
            layers += [
                nn.Linear(in_features, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = hidden_dim
        # Output head: raw logits, no activation.
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for batch ``x``."""
        return self.net(x)

In [12]:
class EarlyStopping:
    """Flag when validation accuracy has stopped improving.

    Call the instance once per epoch with the validation accuracy. A value
    counts as an improvement when it beats the best one seen so far by more
    than ``delta``. After ``patience`` consecutive calls without improvement,
    ``early_stop`` is set to ``True`` (it is never reset afterwards).

    Args:
        patience: Number of non-improving calls tolerated before stopping.
        delta: Minimum gain over the best accuracy that counts as improvement.
    """

    def __init__(self, patience=5, delta=0.001):
        self.patience, self.delta = patience, delta
        self.best_acc = None  # tracked on accuracy; None until the first call
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_acc):
        has_baseline = self.best_acc is not None
        if has_baseline and not (val_acc > self.best_acc + self.delta):
            # No sufficient gain: one more stale epoch.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # First observation, or a gain larger than delta: record it and
        # restart the stale-epoch count.
        self.best_acc = val_acc
        self.counter = 0

In [27]:
class Trainer:
    """Train and evaluate an ``MLP`` classifier on a dataset.

    The dataset is split into roughly 80% train, 10% validation and the
    remainder test. Training uses AdamW with a ``ReduceLROnPlateau`` schedule
    and early stopping, both driven by validation accuracy. The weights of the
    epoch with the highest validation accuracy are restored before the final
    test-set report.

    Args:
        dataset: Dataset yielding ``(features, label)`` pairs. It must expose
            the feature matrix as ``dataset.X`` (used to size the model input).
        batch_size: Batch size for all three data loaders.
    """

    def __init__(self, dataset: Dataset, batch_size=16):
        self.dataset = dataset
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def prepare_data(self, seed=None):
        """Split the dataset and build the train/val/test data loaders.

        Args:
            seed: Optional integer. When given, the split is drawn from a
                dedicated generator seeded with it, so repeated calls produce
                the same split. When ``None`` the global torch RNG is used.
        """
        total = len(self.dataset)
        train_size = int(0.8 * total)
        val_size = int(0.1 * total)
        test_size = total - train_size - val_size  # absorbs the rounding remainder

        split_kwargs = {}
        if seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset, test_dataset = random_split(
            self.dataset, [train_size, val_size, test_size], **split_kwargs
        )

        # The model uses BatchNorm1d, which raises in training mode when a
        # batch holds a single sample. Drop the trailing batch only when it
        # would contain exactly one sample.
        drop_last = train_size > self.batch_size and train_size % self.batch_size == 1

        self.train_loader = DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=drop_last
        )
        self.val_loader = DataLoader(val_dataset, batch_size=self.batch_size)
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size)

    def train_and_evaluate(self, num_epochs=50, hidden_dim=1024, output_dim=4,
                           lr=2e-5, weight_decay=1e-2, patience=5):
        """Train a fresh model, restore the best weights and report test metrics.

        Args:
            num_epochs: Maximum number of epochs.
            hidden_dim: Hidden width of the MLP.
            output_dim: Number of classes.
            lr: Initial AdamW learning rate.
            weight_decay: AdamW weight decay.
            patience: Early-stopping patience, in epochs.
        """
        if not hasattr(self, "train_loader"):
            self.prepare_data()

        self.model = MLP(
            input_dim=self.dataset.X.shape[1], hidden_dim=hidden_dim, output_dim=output_dim
        ).to(self.device)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=weight_decay)
        # `verbose` is not passed: it is deprecated in recent PyTorch releases
        # and the learning rate is printed after every epoch below anyway.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=2
        )
        criterion = nn.CrossEntropyLoss()
        early_stopper = EarlyStopping(patience=patience)
        best_val_acc = float("-inf")
        best_model_wts = None

        for epoch in range(num_epochs):
            self.model.train()
            y_true_train, y_pred_train = [], []

            for X_batch, y_batch in self.train_loader:
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)

                optimizer.zero_grad()
                outputs = self.model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

                predicted = outputs.argmax(dim=1)
                y_true_train.extend(y_batch.cpu().numpy())
                y_pred_train.extend(predicted.cpu().numpy())

            # Accuracy over the batches seen during this epoch.
            train_acc = accuracy_score(y_true_train, y_pred_train)
            val_acc = self.evaluate(self.val_loader)
            test_acc = self.evaluate(self.test_loader)

            # Snapshot the weights only on a new best validation accuracy. The
            # running best must be updated, otherwise every epoch overwrites
            # the snapshot and the "best" model is just the last one.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model_wts = copy.deepcopy(self.model.state_dict())

            scheduler.step(val_acc)

            print(f"Epoch {epoch} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Test Acc: {test_acc:.4f}")
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Current learning rate: {current_lr:.6f}")
            early_stopper(val_acc)
            if early_stopper.early_stop:
                print("Early stopping triggered.")
                break

        # No snapshot exists when no epoch ran (num_epochs == 0).
        if best_model_wts is not None:
            self.model.load_state_dict(best_model_wts)
        self.final_evaluation()

    def _collect_predictions(self, loader):
        """Run the model in eval mode over ``loader``; return ``(y_true, y_pred)`` lists."""
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_batch, y_batch in loader:
                X_batch = X_batch.to(self.device)
                predicted = self.model(X_batch).argmax(dim=1)
                y_true.extend(y_batch.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())
        return y_true, y_pred

    def evaluate(self, loader):
        """Return the accuracy of the current model on ``loader``."""
        y_true, y_pred = self._collect_predictions(loader)
        return accuracy_score(y_true, y_pred)

    def final_evaluation(self):
        """Print weighted F1/precision/recall and accuracy on the test set.

        Returns:
            dict: The four metrics keyed by ``f1``, ``precision``, ``recall``
            and ``accuracy``.
        """
        y_true_final, y_pred_final = self._collect_predictions(self.test_loader)

        print("\nFinal model evaluation on test set:")
        mlp_f1 = f1_score(y_true_final, y_pred_final, average='weighted')
        mlp_precision = precision_score(y_true_final, y_pred_final, average='weighted')
        mlp_recall = recall_score(y_true_final, y_pred_final, average='weighted')
        mlp_accuracy = accuracy_score(y_true_final, y_pred_final)

        print("\nMLP:")
        print(f"F1 Score: {mlp_f1}")
        print(f"Precision: {mlp_precision}")
        print(f"Recall: {mlp_recall}")
        print(f"Accuracy: {mlp_accuracy}")

        return {
            "f1": mlp_f1,
            "precision": mlp_precision,
            "recall": mlp_recall,
            "accuracy": mlp_accuracy,
        }

In [28]:
# Baseline run: all three feature groups enabled.
continue_features = [
    "create_time",
    "follows",
    "fans",
    "content_len",
]

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

dataset = prepare_dataset_from_file(
    "../data/bert_data.pkl",
    continue_feature_list=continue_features,
    cat_feature_list=cat_features,
    hidden_feature_list=hidden_features,
)

In [29]:
# Wrap the dataset in a Trainer and build the train/val/test data loaders.
train = Trainer(dataset)
train.prepare_data()

In [30]:
# Train the model; per-epoch accuracies and the final test metrics are printed.
train.train_and_evaluate()

Epoch 0 | Train Acc: 0.5162 | Val Acc: 0.6144 | Test Acc: 0.6364
Current learning rate: 0.000020
Epoch 1 | Train Acc: 0.6193 | Val Acc: 0.6523 | Test Acc: 0.6463
Current learning rate: 0.000020
Epoch 2 | Train Acc: 0.6543 | Val Acc: 0.6649 | Test Acc: 0.6607
Current learning rate: 0.000020
Epoch 3 | Train Acc: 0.6796 | Val Acc: 0.6712 | Test Acc: 0.6670
Current learning rate: 0.000020
Epoch 4 | Train Acc: 0.7040 | Val Acc: 0.6928 | Test Acc: 0.6760
Current learning rate: 0.000020
Epoch 5 | Train Acc: 0.7302 | Val Acc: 0.7099 | Test Acc: 0.6787
Current learning rate: 0.000020
Epoch 6 | Train Acc: 0.7485 | Val Acc: 0.7027 | Test Acc: 0.6949
Current learning rate: 0.000020
Epoch 7 | Train Acc: 0.7624 | Val Acc: 0.7279 | Test Acc: 0.7111
Current learning rate: 0.000020
Epoch 8 | Train Acc: 0.7745 | Val Acc: 0.7270 | Test Acc: 0.7138
Current learning rate: 0.000020
Epoch 9 | Train Acc: 0.7917 | Val Acc: 0.7270 | Test Acc: 0.7066
Current learning rate: 0.000020
Epoch 00011: reducing learning

In [31]:
# Ablation: no continuous features.
continue_features = []

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

dataset = prepare_dataset_from_file(
    "../data/bert_data.pkl",
    continue_feature_list=continue_features,
    cat_feature_list=cat_features,
    hidden_feature_list=hidden_features,
)
train = Trainer(dataset)
train.prepare_data()
train.train_and_evaluate()

Epoch 0 | Train Acc: 0.5078 | Val Acc: 0.6180 | Test Acc: 0.5977
Current learning rate: 0.000020
Epoch 1 | Train Acc: 0.6132 | Val Acc: 0.6631 | Test Acc: 0.6427
Current learning rate: 0.000020
Epoch 2 | Train Acc: 0.6433 | Val Acc: 0.6793 | Test Acc: 0.6589
Current learning rate: 0.000020
Epoch 3 | Train Acc: 0.6706 | Val Acc: 0.6712 | Test Acc: 0.6652
Current learning rate: 0.000020
Epoch 4 | Train Acc: 0.6908 | Val Acc: 0.6847 | Test Acc: 0.6832
Current learning rate: 0.000020
Epoch 5 | Train Acc: 0.7144 | Val Acc: 0.6892 | Test Acc: 0.6868
Current learning rate: 0.000020
Epoch 6 | Train Acc: 0.7322 | Val Acc: 0.6973 | Test Acc: 0.6985
Current learning rate: 0.000020
Epoch 7 | Train Acc: 0.7553 | Val Acc: 0.6955 | Test Acc: 0.7093
Current learning rate: 0.000020
Epoch 8 | Train Acc: 0.7620 | Val Acc: 0.7117 | Test Acc: 0.7030
Current learning rate: 0.000020
Epoch 9 | Train Acc: 0.7752 | Val Acc: 0.7234 | Test Acc: 0.7219
Current learning rate: 0.000020
Epoch 10 | Train Acc: 0.7956 |

In [34]:
# Ablation: no categorical features (continuous and vector features are kept).
continue_features = [
    "create_time",
    "follows",
    "fans",
    "content_len",
]

cat_features = []

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

dataset = prepare_dataset_from_file(
    "../data/bert_data.pkl",
    continue_feature_list=continue_features,
    cat_feature_list=cat_features,
    hidden_feature_list=hidden_features,
)
train = Trainer(dataset)
train.prepare_data()
train.train_and_evaluate()

Epoch 0 | Train Acc: 0.5267 | Val Acc: 0.6252 | Test Acc: 0.6274
Current learning rate: 0.000020
Epoch 1 | Train Acc: 0.6080 | Val Acc: 0.6586 | Test Acc: 0.6535
Current learning rate: 0.000020
Epoch 2 | Train Acc: 0.6500 | Val Acc: 0.6721 | Test Acc: 0.6823
Current learning rate: 0.000020
Epoch 3 | Train Acc: 0.6778 | Val Acc: 0.6865 | Test Acc: 0.6895
Current learning rate: 0.000020
Epoch 4 | Train Acc: 0.6940 | Val Acc: 0.6973 | Test Acc: 0.7111
Current learning rate: 0.000020
Epoch 5 | Train Acc: 0.7106 | Val Acc: 0.7036 | Test Acc: 0.7102
Current learning rate: 0.000020
Epoch 6 | Train Acc: 0.7290 | Val Acc: 0.7090 | Test Acc: 0.7129
Current learning rate: 0.000020
Epoch 7 | Train Acc: 0.7499 | Val Acc: 0.7207 | Test Acc: 0.7273
Current learning rate: 0.000020
Epoch 8 | Train Acc: 0.7568 | Val Acc: 0.7324 | Test Acc: 0.7336
Current learning rate: 0.000020
Epoch 9 | Train Acc: 0.7718 | Val Acc: 0.7351 | Test Acc: 0.7336
Current learning rate: 0.000020
Epoch 10 | Train Acc: 0.7862 |

In [35]:
# Ablation: no text/vector features.
continue_features = [
    "create_time",
    "follows",
    "fans",
    "content_len",
]

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = []

dataset = prepare_dataset_from_file(
    "../data/bert_data.pkl",
    continue_feature_list=continue_features,
    cat_feature_list=cat_features,
    hidden_feature_list=hidden_features,
)
train = Trainer(dataset)
train.prepare_data()
train.train_and_evaluate()

Epoch 0 | Train Acc: 0.4656 | Val Acc: 0.5378 | Test Acc: 0.5554
Current learning rate: 0.000020
Epoch 1 | Train Acc: 0.5155 | Val Acc: 0.5667 | Test Acc: 0.5635
Current learning rate: 0.000020
Epoch 2 | Train Acc: 0.5318 | Val Acc: 0.5432 | Test Acc: 0.5473
Current learning rate: 0.000020
Epoch 3 | Train Acc: 0.5508 | Val Acc: 0.5739 | Test Acc: 0.5770
Current learning rate: 0.000020
Epoch 4 | Train Acc: 0.5639 | Val Acc: 0.5676 | Test Acc: 0.5761
Current learning rate: 0.000020
Epoch 5 | Train Acc: 0.5844 | Val Acc: 0.5739 | Test Acc: 0.5770
Current learning rate: 0.000020
Epoch 6 | Train Acc: 0.5854 | Val Acc: 0.5892 | Test Acc: 0.5923
Current learning rate: 0.000020
Epoch 7 | Train Acc: 0.5901 | Val Acc: 0.5829 | Test Acc: 0.5905
Current learning rate: 0.000020
Epoch 8 | Train Acc: 0.6041 | Val Acc: 0.5865 | Test Acc: 0.5851
Current learning rate: 0.000020
Epoch 9 | Train Acc: 0.6214 | Val Acc: 0.5964 | Test Acc: 0.6139
Current learning rate: 0.000020
Epoch 10 | Train Acc: 0.6270 |