In [1]:
## 读取csv，并处理用户和微博数据
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
# 读取数据
users = pd.read_csv('../data/users.csv')
posts = pd.read_csv('../data/posts.csv')

In [3]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         20 non-null     int64  
 1   nickname        20 non-null     object 
 2   gender          20 non-null     object 
 3   avatar          20 non-null     object 
 4   desc            16 non-null     object 
 5   ip_location     0 non-null      float64
 6   follows         20 non-null     int64  
 7   fans            20 non-null     object 
 8   tag_list        0 non-null      float64
 9   last_modify_ts  20 non-null     int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 1.7+ KB


In [4]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9765 entries, 0 to 9764
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   note_id           9765 non-null   int64  
 1   content           9759 non-null   object 
 2   create_time       9765 non-null   int64  
 3   create_date_time  9765 non-null   object 
 4   liked_count       9765 non-null   int64  
 5   comments_count    9765 non-null   int64  
 6   shared_count      9765 non-null   int64  
 7   last_modify_ts    9765 non-null   int64  
 8   note_url          9765 non-null   object 
 9   ip_location       5497 non-null   object 
 10  user_id           9765 non-null   int64  
 11  nickname          9765 non-null   object 
 12  gender            9765 non-null   object 
 13  profile_url       9765 non-null   object 
 14  avatar            9765 non-null   object 
 15  source_keyword    0 non-null      float64
dtypes: float64(1), int64(7), object(8)
memory 

In [5]:
# 将粉丝数转化为number
def parse_fans(value):
    if isinstance(value, str):
        if '万' in value:
            return float(value.replace('万', '')) * 10000
        try:
            return float(value)
        except:
            return None
    return value
users['fans'] = users['fans'].apply(parse_fans)

In [6]:
# 性别转化
users['gender'] = users['gender'].map({'男': 'male', '女': 'female', 'm': 'male', 'f': 'female'})

In [7]:
# 删除多余列
drop_cols = [
    'nickname', 'avatar', 'ip_location',
    'tag_list', 'last_modify_ts'
]
users.drop(columns=drop_cols, inplace=True, errors='ignore')
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  20 non-null     int64  
 1   gender   20 non-null     object 
 2   desc     16 non-null     object 
 3   follows  20 non-null     int64  
 4   fans     20 non-null     float64
dtypes: float64(1), int64(2), object(2)
memory usage: 928.0+ bytes


In [9]:
# 删除多余列
drop_cols = [
    'note_id', 'last_modify_ts', 'note_url', 'ip_location',
    'nickname', 'gender','profile_url','avatar','source_keyword'
]
posts.drop(columns=drop_cols, inplace=True, errors='ignore')
posts.head(2)

Unnamed: 0,content,create_time,create_date_time,liked_count,comments_count,shared_count,user_id
0,卡粉脱妆星人，看这篇！今天是小课堂最后一讲【仪器洁面防晒篇】，记得来听哦！#李佳琦# #李佳...,1746966601,2025-05-11 12:30:01+08:00,464,102,48,1968758563
1,不同需求不同预算，进来对号入座‼预告一下，接下来的小课堂节奏👇【彩妆篇】【仪器洁面防晒篇】，...,1746878400,2025-05-10 12:00:00+08:00,387,85,20,1968758563


In [10]:
posts['shared_count'].describe()

count      9765.000000
mean        349.594060
std        2385.274291
min           0.000000
25%           3.000000
50%          18.000000
75%         218.000000
max      179101.000000
Name: shared_count, dtype: float64

In [11]:
# 生成分类标签（转发量多分类）
def classify_shared_count(x):
    try:
        x = int(x)
        if x <= 3:
            return 0
        elif x <= 18:
            return 1
        elif x <= 218:
            return 2
        else:
            return 3
    except:
        return None
posts['shared_class'] = posts['shared_count'].apply(classify_shared_count)

In [12]:
# 合并：根据 user_id 关联
df = posts.merge(users, on='user_id')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11104 entries, 0 to 11103
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   content           11097 non-null  object 
 1   create_time       11104 non-null  int64  
 2   create_date_time  11104 non-null  object 
 3   liked_count       11104 non-null  int64  
 4   comments_count    11104 non-null  int64  
 5   shared_count      11104 non-null  int64  
 6   user_id           11104 non-null  int64  
 7   shared_class      11104 non-null  int64  
 8   gender            11104 non-null  object 
 9   desc              9346 non-null   object 
 10  follows           11104 non-null  int64  
 11  fans              11104 non-null  float64
dtypes: float64(1), int64(7), object(4)
memory usage: 1.0+ MB


## 文本特征

In [14]:
#文本特征
df['content'] = df['content'].fillna(' ')
df['desc'] = df['desc'].fillna(' ')

In [None]:
import jieba #分词
from tqdm import tqdm
import pandas as pd
from gensim.models.word2vec import Word2Vec 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [16]:
# Word2vec:使用word2vec得到每个词的词向量，将句子所有词取均值得到句子的向量，用来表征样本中的文本
def generate_wv(texts):
    text_list = []
    for text in tqdm(texts):
        text_list.append(list(jieba.cut(text)))
    wv_model= Word2Vec(text_list, min_count=1, vector_size = 10, sg = 1)
    text_embed_list = []
    
    for sentence in text_list:
        word_embed_list = []
        for word in sentence:
            word_embed_list.append(wv_model.wv[word])
        text_embed_list.append(np.array(word_embed_list).mean(0))
    
    return np.array(text_embed_list)

# tfidf-svd：利用稀疏矩阵的方法抽取每个样本的tfidf向量，再利用svd降维得到文本特征
def build_tfidf_svd_matrix(texts, n_output):
    """
    """
    corpus = []
    for text in texts:
        words = list(jieba.cut(str(text)))
        use_words = []
        for word in words:
            use_words.append(word)
        corpus.append(' '.join(use_words))
    tfidf_vec = TfidfVectorizer()
    tfidf_matrix = tfidf_vec.fit_transform(corpus)
    svd = TruncatedSVD(n_components=n_output, n_iter=7, random_state=42)
    tf_idf_svd = svd.fit_transform(tfidf_matrix)
    
    return tf_idf_svd

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
import torch


def classify_reviews(df, text_col='content', output_col='sentiment_class', batch_size=10):
    # 类别映射
    # class_names = {0: "负面", 1: "正面"}
    class_names = {0: 0, 1: 1}
    df[output_col] = None

    # 设备设置（使用GPU如果可用）
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # 加载预训练模型和分词器
    model_name = '../BERT-weibo' 
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name).to(device)
    model.eval()
    
    for i in range(0, len(df), batch_size):
        batch_texts = df[text_col][i:i+batch_size].astype(str).tolist()

        # 编码
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        # 推理
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        # 添加结果
        df.loc[i:i+batch_size-1, output_col] = [class_names[pred.item()] for pred in predictions]


    return df

In [18]:
weibotext_wv_embed = generate_wv(df['content']) 
df['content_wv_embed'] = list(weibotext_wv_embed) #微博内容词向量
user_intro_wv_embed = generate_wv(df['desc'])
df['desc_wv_embed'] = list(user_intro_wv_embed) #用户简介词向量

weibotext_tfidf = build_tfidf_svd_matrix(df['content'], 10)
df['content_tfidf'] = list(weibotext_tfidf) #微博内容tfidf+svd
user_intro_tfidf = build_tfidf_svd_matrix(df['desc'], 10)
df['desc_tfidf'] = list(user_intro_tfidf)#用户简介tfidf+svd

  0%|          | 0/11104 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lyh\AppData\Local\Temp\jieba.cache
Loading model cost 0.488 seconds.
Prefix dict has been built successfully.
100%|██████████| 11104/11104 [00:03<00:00, 3291.34it/s]
100%|██████████| 11104/11104 [00:00<00:00, 11411.58it/s]


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11104 entries, 0 to 11103
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   content           11104 non-null  object 
 1   create_time       11104 non-null  int64  
 2   create_date_time  11104 non-null  object 
 3   liked_count       11104 non-null  int64  
 4   comments_count    11104 non-null  int64  
 5   shared_count      11104 non-null  int64  
 6   user_id           11104 non-null  int64  
 7   shared_class      11104 non-null  int64  
 8   gender            11104 non-null  object 
 9   desc              11104 non-null  object 
 10  follows           11104 non-null  int64  
 11  fans              11104 non-null  float64
 12  content_wv_embed  11104 non-null  object 
 13  desc_wv_embed     11104 non-null  object 
 14  content_tfidf     11104 non-null  object 
 15  desc_tfidf        11104 non-null  object 
dtypes: float64(1), int64(7), object(8)
memor

In [20]:
# 内容长度
df['content_len'] = df['content'].apply(len)

In [21]:
df = classify_reviews(df)
df.info()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11104 entries, 0 to 11103
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   content           11104 non-null  object 
 1   create_time       11104 non-null  int64  
 2   create_date_time  11104 non-null  object 
 3   liked_count       11104 non-null  int64  
 4   comments_count    11104 non-null  int64  
 5   shared_count      11104 non-null  int64  
 6   user_id           11104 non-null  int64  
 7   shared_class      11104 non-null  int64  
 8   gender            11104 non-null  object 
 9   desc              11104 non-null  object 
 10  follows           11104 non-null  int64  
 11  fans              11104 non-null  float64
 12  content_wv_embed  11104 non-null  object 
 13  desc_wv_embed     11104 non-null  object 
 14  content_tfidf     11104 non-null  object 
 15  desc_tfidf        11104 non-null  object 
 16  content_len       11104 non-null  int64 

## 时间特征

In [22]:
from datetime import datetime
from dateutil import parser

def apply_weibo_creatime(x):
    date_obj = pd.to_datetime(x)
    # date_obj = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return pd.Series({
        'post_day':date_obj.day,
        'post_weekday':date_obj.weekday(),
        'post_month':date_obj.month, 
        'post_hour':date_obj.hour, 
        'post_minute':date_obj.minute,
        'post_year':date_obj.year,
    })

In [23]:
# 时间特征
df_weibo_create_time_feature = df['create_date_time'].apply(apply_weibo_creatime)
df_weibo_create_time_feature.head()

Unnamed: 0,post_day,post_weekday,post_month,post_hour,post_minute,post_year
0,11,6,5,12,30,2025
1,10,5,5,12,0,2025
2,9,4,5,14,53,2025
3,8,3,5,12,30,2025
4,7,2,5,12,31,2025


In [24]:
df = pd.concat([df, df_weibo_create_time_feature], axis=1)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11104 entries, 0 to 11103
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   content           11104 non-null  object 
 1   create_time       11104 non-null  int64  
 2   create_date_time  11104 non-null  object 
 3   liked_count       11104 non-null  int64  
 4   comments_count    11104 non-null  int64  
 5   shared_count      11104 non-null  int64  
 6   user_id           11104 non-null  int64  
 7   shared_class      11104 non-null  int64  
 8   gender            11104 non-null  object 
 9   desc              11104 non-null  object 
 10  follows           11104 non-null  int64  
 11  fans              11104 non-null  float64
 12  content_wv_embed  11104 non-null  object 
 13  desc_wv_embed     11104 non-null  object 
 14  content_tfidf     11104 non-null  object 
 15  desc_tfidf        11104 non-null  object 
 16  content_len       11104 non-null  int64 

In [35]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

In [37]:
# 1. 加载BERT tokenizer和模型
tokenizer = BertTokenizer.from_pretrained('../bert-base-chinese')
model = BertModel.from_pretrained('../bert-base-chinese')
model.eval()  # 关闭dropout等训练特性

# 如果有GPU可用则使用GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 3. 批量提取embedding函数
def get_batch_embeddings(texts, batch_size=8):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        # 编码
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # 推理
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, 768)
            cls_embeddings = cls_embeddings.cpu().numpy()
            embeddings.extend(cls_embeddings)
    return embeddings

# 4. 获取embedding并存入DataFrame
df['embedding'] = get_batch_embeddings(df['content'].tolist(), batch_size=4)

100%|██████████| 2776/2776 [01:44<00:00, 26.57it/s]


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11104 entries, 0 to 11103
Data columns (total 33 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   content                        11104 non-null  object  
 1   create_time                    11104 non-null  int64   
 2   create_date_time               11104 non-null  object  
 3   liked_count                    11104 non-null  int64   
 4   comments_count                 11104 non-null  int64   
 5   shared_count                   11104 non-null  int64   
 6   user_id                        11104 non-null  category
 7   shared_class                   11104 non-null  int64   
 8   gender                         11104 non-null  object  
 9   desc                           11104 non-null  object  
 10  follows                        11104 non-null  int64   
 11  fans                           11104 non-null  float64 
 12  content_wv_embed               1

In [39]:
# 5. 可选保存
import pickle
with open('bert_data.pkl', 'wb') as f:
    pickle.dump(df, f)