In [1]:
import pandas as pd
import pickle
import random
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [2]:
def set_seed(seed):
    """Seed Python's `random` module and NumPy's global RNG with the same value."""
    # Both seeders take the seed as their only argument, so apply them in turn.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

set_seed(42)

In [3]:
def prepare_features_from_file(
    file_path,
    continue_feature_list,
    cat_feature_list,
    hidden_feature_list,
    target_col='shared_class',
    scaler=None
):
    """
    Load a pickled DataFrame and turn it into a numeric feature matrix and labels.

    Built for feature-ablation experiments: any of the three feature lists may
    be empty, but at least one of them must select something.

    Parameters:
        file_path : str
            Path to the pickle file holding the DataFrame.
        continue_feature_list : list of str
            Names of continuous columns; they are standardised with `scaler`.
        cat_feature_list : list of str
            Names of categorical columns; they are cast to str and one-hot encoded.
        hidden_feature_list : list of str
            Names of columns whose cells hold a vector (e.g. an embedding); each
            is expanded into one column per component, named "<col>_<i>".
            Columns that are entirely null are skipped.
        target_col : str
            Name of the label column, 'shared_class' by default.
        scaler : StandardScaler or None
            Optional already-fitted scaler to reuse (only `.transform` is called).
            When None, a new StandardScaler is fit on every row of the file; that
            fitted scaler is not returned, so a caller that splits the result
            afterwards has scaled the test rows with statistics that include them.

    Returns:
        X : pd.DataFrame
            Float feature matrix with a fresh 0..n-1 index.
        y : pd.Series
            Labels, re-indexed 0..n-1 so they align with X row for row.

    Raises:
        ValueError
            If all three feature lists are empty, or if a vector column is
            only partially populated.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only use this on trusted, locally produced data.
    with open(file_path, 'rb') as f:
        df = pickle.load(f)

    feature_parts = []

    # Continuous features: standardise, fitting a scaler only if none was given.
    if continue_feature_list:
        if scaler is None:
            scaler = StandardScaler()
            X_cont_scaled = scaler.fit_transform(df[continue_feature_list])
        else:
            X_cont_scaled = scaler.transform(df[continue_feature_list])
        cont_scaled_df = pd.DataFrame(X_cont_scaled, columns=continue_feature_list, index=df.index)
        feature_parts.append(cont_scaled_df)

    # Categorical features: cast to str so numeric codes are one-hot encoded too.
    if cat_feature_list:
        cat_as_str = df[cat_feature_list].astype(str)
        cat_encoded_df = pd.get_dummies(cat_as_str, drop_first=False)
        feature_parts.append(cat_encoded_df)

    # Vector / embedding features: one output column per vector component.
    def expand_vector_features(df, feature_names):
        expanded = []
        for col in feature_names:
            missing = df[col].isnull()
            # A column that is empty (all null) contributes nothing.
            if missing.all():
                continue
            # A partially filled column cannot be expanded into a rectangular
            # block; fail with a clear message instead of an obscure TypeError.
            if missing.any():
                raise ValueError(
                    f"Column '{col}' has {int(missing.sum())} missing vector(s); "
                    "fill or drop those rows before expanding it."
                )
            width = len(df[col].iloc[0])
            expanded_cols = pd.DataFrame(df[col].tolist(),
                                         index=df.index,
                                         columns=[f"{col}_{i}" for i in range(width)])
            expanded.append(expanded_cols)
        return pd.concat(expanded, axis=1) if expanded else pd.DataFrame(index=df.index)

    if hidden_feature_list:
        X_hidden = expand_vector_features(df, hidden_feature_list)
        feature_parts.append(X_hidden)

    if not feature_parts:
        raise ValueError(
            "No feature columns selected: at least one of continue_feature_list, "
            "cat_feature_list or hidden_feature_list must be non-empty."
        )

    X = pd.concat([part.reset_index(drop=True) for part in feature_parts], axis=1).astype(float)
    # X was re-indexed above, so give y the same positional index to keep them aligned.
    y = df[target_col].reset_index(drop=True)

    return X, y

In [4]:
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    """
    Fit a fixed set of classifiers and report their test-set metrics.

    Each model is fit on (X_train, y_train) and scored on (X_test, y_test)
    with weighted F1, weighted precision, weighted recall and accuracy. The
    metrics are printed to four decimals and also returned so that different
    runs can be compared programmatically.

    Parameters:
        X_train, X_test : feature matrices for the training and test splits.
        y_train, y_test : matching label vectors.

    Returns:
        dict
            Maps model name to a dict with keys "F1 Score", "Precision",
            "Recall" and "Accuracy" (unrounded floats).
    """
    # Models to compare, keyed by display name (extend here to add more).
    models = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Support Vector Machine (SVM)": SVC(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42)
    }

    results = {}

    # Fit and score each model in turn.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores = {
            "F1 Score": f1_score(y_test, y_pred, average='weighted'),
            "Precision": precision_score(y_test, y_pred, average='weighted'),
            "Recall": recall_score(y_test, y_pred, average='weighted'),
            "Accuracy": accuracy_score(y_test, y_pred),
        }

        print(f"\n{name}:")
        for metric_name, value in scores.items():
            print(f"{metric_name}: {value:.4f}")

        results[name] = scores

    return results

In [5]:
def train(continue_features, cat_features, hidden_features,
          file_path='../data/bert_data.pkl', test_size=0.2, random_state=42):
    """
    Build features for the chosen columns, split them, and evaluate all classifiers.

    Parameters:
        continue_features : list of str
            Continuous column names (may be empty).
        cat_features : list of str
            Categorical column names (may be empty).
        hidden_features : list of str
            Vector/embedding column names (may be empty).
        file_path : str
            Pickle file to load; defaults to '../data/bert_data.pkl'.
        test_size : float
            Fraction of rows held out for testing; defaults to 0.2 (80/20 split).
        random_state : int
            Seed for the train/test split; defaults to 42.

    Returns:
        None. Metrics are printed by evaluate_classifiers.
    """
    # Build the feature matrix and labels.
    # NOTE: no scaler is passed, so prepare_features_from_file fits its scaler on
    # every row before the split below; the test rows therefore influence the
    # scaling statistics.
    X, y = prepare_features_from_file(
        file_path=file_path,
        continue_feature_list=continue_features,
        cat_feature_list=cat_features,
        hidden_feature_list=hidden_features
    )
    # Hold out `test_size` of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    evaluate_classifiers(X_train, X_test, y_train, y_test)


In [6]:
# All feature groups enabled.
continue_features = ["create_time", "follows", "fans", "content_len"]

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

train(continue_features, cat_features, hidden_features)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression:
F1 Score: 0.6516
Precision: 0.6519
Recall: 0.6515
Accuracy: 0.6515

Support Vector Machine (SVM):
F1 Score: 0.6703
Precision: 0.6768
Recall: 0.6695
Accuracy: 0.6695

Decision Tree:
F1 Score: 0.6924
Precision: 0.6929
Recall: 0.6920
Accuracy: 0.6920

Random Forest:
F1 Score: 0.7493
Precision: 0.7532
Recall: 0.7501
Accuracy: 0.7501


In [7]:
# Ablation: no text features (the vector/embedding column list is empty).
continue_features = ["create_time", "follows", "fans", "content_len"]

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = []

train(continue_features, cat_features, hidden_features)


Logistic Regression:
F1 Score: 0.5098
Precision: 0.5177
Recall: 0.5110
Accuracy: 0.5110

Support Vector Machine (SVM):
F1 Score: 0.6259
Precision: 0.6364
Recall: 0.6249
Accuracy: 0.6249

Decision Tree:
F1 Score: 0.7048
Precision: 0.7052
Recall: 0.7046
Accuracy: 0.7046

Random Forest:
F1 Score: 0.7498
Precision: 0.7525
Recall: 0.7501
Accuracy: 0.7501


In [11]:
# Ablation: no categorical features.
continue_features = ["create_time", "follows", "fans", "content_len"]

cat_features = []

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

train(continue_features, cat_features, hidden_features)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression:
F1 Score: 0.6516
Precision: 0.6521
Recall: 0.6520
Accuracy: 0.6520

Support Vector Machine (SVM):
F1 Score: 0.6625
Precision: 0.6678
Recall: 0.6623
Accuracy: 0.6623

Decision Tree:
F1 Score: 0.6963
Precision: 0.6972
Recall: 0.6956
Accuracy: 0.6956

Random Forest:
F1 Score: 0.7552
Precision: 0.7579
Recall: 0.7560
Accuracy: 0.7560


In [12]:
# Ablation: no continuous features.
continue_features = []

cat_features = [
    "gender",
    "sentiment_class",
    "post_day",
    "post_weekday",
    "post_month",
    "post_hour",
    "post_minute",
]

hidden_features = [
    "content_wv_embed",
    "desc_wv_embed",
    "content_tfidf",
    "desc_tfidf",
    "embedding",
]

train(continue_features, cat_features, hidden_features)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression:
F1 Score: 0.6317
Precision: 0.6325
Recall: 0.6312
Accuracy: 0.6312

Support Vector Machine (SVM):
F1 Score: 0.6612
Precision: 0.6682
Recall: 0.6614
Accuracy: 0.6614

Decision Tree:
F1 Score: 0.6412
Precision: 0.6406
Recall: 0.6421
Accuracy: 0.6421

Random Forest:
F1 Score: 0.7470
Precision: 0.7540
Recall: 0.7479
Accuracy: 0.7479
