In [1]:
import pandas as pd
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics  import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 绘制模型表现图
def performance(y_true , predict , color = "g" , ann = True):
    acc = accuracy_score(y_true , predict[:] > 0.5)
    auc = roc_auc_score(y_true , predict[:])
    fpr , tpr , thr = roc_curve(y_true , predict[:])
    plt.figure()
    plt.plot(fpr , tpr )

# 读取数据

In [26]:
df_train = pd.read_csv("simplifyweibo_4_moods_training.csv",sep=',')
df_validation = pd.read_csv("simplifyweibo_4_moods_validation.csv",sep=',')
train_X = df_train['content']
train_Y = df_train['label']
print(train_Y)
validate_X = df_validation['content']
validate_Y = df_validation['label']
print(validate_X)

0       0
1       3
2       0
3       0
4       0
5       2
6       1
7       1
8       1
9       0
10      3
11      0
12      0
13      2
14      0
15      1
16      3
17      0
18      1
19      3
20      3
21      0
22      0
23      2
24      0
25      3
26      0
27      0
28      0
29      1
       ..
6370    0
6371    0
6372    0
6373    0
6374    1
6375    1
6376    1
6377    1
6378    3
6379    0
6380    3
6381    0
6382    3
6383    0
6384    0
6385    1
6386    0
6387    1
6388    0
6389    0
6390    0
6391    0
6392    2
6393    3
6394    0
6395    1
6396    1
6397    0
6398    0
6399    0
Name: label, Length: 6400, dtype: int64
0      椰丝！别和小人过不去，因为他本来就过不去；别和社会过不去，因为你会过不去；别和自己过不去，因...
1                               【爆料】这人父母怎么给他起的名字？囧 。。。。。
2                         你是风儿.....我是沙.....紫薇，我是尔康，我爸是李刚
3                        享受在宿舍听着歌看着书本的那一刻宁静。大家这时候都在干什么呢？
4      确实强悍!!【大开眼界】这个视频真给力~ 新加坡“南洋理工华乐团”演奏的。音乐响起的那一刻，...
5      一口烟进入肺的过程...仅仅30秒，刺激你的视觉。。。当你吸烟的时候，你和你身边的人的肺就变...
6      早

# 预处理

In [30]:
import jieba
import string
def clean_CN(corpus):
    stop = []
    with open(r"D:\大创项目\LDA\stopwords\CNstopwords.txt", 'r', encoding='utf-8') as f:
        for lines in f:
            stop.append(lines.strip())
    stop = set(stop)
   
    exclude = set(string.punctuation)  # 标点符号
    clean_corpus = []
    for doc in corpus:
        words = jieba.lcut(doc)
        stop_free = [i for i in words if (i not in stop) & (i.isalpha())]
        clean_corpus.append(stop_free)
    return clean_corpus

In [31]:
train_X = clean_CN(train_X)
validate_X = clean_CN(validate_X)

In [32]:
print(train_X)

[['找', '朱桢', '丹丹', '担任', '司仪', '美容', '划不上', '号', '怪怪的', '搞', '娘舅', '节目', '牛尔', '各大', '颁奖典礼', '现场', '必到', '人物', '貌似', '不入流', '典礼', '走过场', '端', '奖杯', '来张', '照片'], ['超级', '小伪娘', '伪娘', '娃娃', '抓起'], ['缺钙', '长大', '缺爱', '姥姥', '不疼', '舅舅', '不爱', '左脸', '欠', '抽', '右脸', '欠', '踹', '驴', '驴', '踢', '猪见', '猪', '踩', '天生', '属', '黄瓜', '欠', '拍', '后天', '属', '核桃', '欠', '捶', '终生', '属破', '摩托', '欠', '踹'], ['小米', '晒', '奖品', '速度', '很快', '支持', 'IPS', '硬屏', '呦', 'IPS', '硬屏', '寄', '背包', '收到', '不错', '背包', '很大', '结实', '谢谢', '咯'], ['走', '走', '忘记', '走', '想着', '想着', '忘记', '想', '看着', '看着', '忘记', '感觉', '一种', '迷失', '当初', '执拗', '固执', 'So', 'what'], ['感冒', '心疼', '感冒', '麻', '陪', 'Live', '感冒', '骚虾', '依然', '好听'], ['难', '追', '处女座', '每日', '星座', '时间', '第一名', '处女座', '第二名', '天蝎座', '第三名', '摩', '羯', '座', '第四名', '双子座', '第五名', '水瓶座', '第六名', '狮子座', '第七名', '金牛座', '第八名', '射手座', '第九名', '天秤座', '第十名', '巨蟹座', '第十一名', '白羊座', '第十二名', '双鱼座', '垫底', 'Fay'], ['激死', '一早', '翻', '屋企', '开', '电脑', '唔', '网', '重启', '次', '唔', '希望', '听日', '翻', '喇', '听', '日', 

In [33]:
from gensim.models import word2vec

model = word2vec.Word2Vec(train_X,min_count=3,iter=20)
model.save("word2vec.model")

In [35]:
# 平均词向量模型
import numpy as np


def averaged_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype='float64')
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [averaged_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in
                corpus]
    return np.array(features)

In [37]:
# 平均词向量模型
avg_wv_train_features = averaged_word_vectorizer(corpus=train_X, model=model, num_features=100)
avg_wv_test_features = averaged_word_vectorizer(corpus=validate_X, model=model, num_features=100)
print('词向量模型训练完毕')

词向量模型训练完毕


# 模型训练和评估

In [41]:
# 作出混淆矩阵
from sklearn import metrics
def get_metrics(true_labels, predicted_labels):
    print('Accuracy' + str(np.round(metrics.accuracy_score(true_labels, predicted_labels), 2)))
    print('Percision' + str(np.round(metrics.precision_score(true_labels, predicted_labels, average='weighted'), 2)))
    print('Recall' + str(np.round(metrics.recall_score(true_labels, predicted_labels, average='weighted'), 2)))
    print('F1 Score' + str(np.round(metrics.f1_score(true_labels, predicted_labels, average='weighted'), 2)))


# # 定义函数使用机器学习算法训练模型


def train_predict_evaluate_model(classifier, train_features, train_labels, test_features, test_labels):
    # 使用分类器训练数据
    model = classifier.fit(train_features, train_labels)
    # 使用训练好的模型对测试集进行预测
    predictions = classifier.predict(test_features)
    # 对模型表现进行评估
    get_metrics(true_labels=test_labels, predicted_labels=predictions)
    return predictions,model


from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier(loss='hinge', n_iter_no_change=30)
svm_avgwv_predictions,svm_argwv_model=train_predict_evaluate_model(svm,avg_wv_train_features,train_Y,avg_wv_test_features,validate_Y)
report = classification_report(svm_avgwv_predictions,validate_Y)
print(report)

Accuracy0.55
Percision0.38
Recall0.55
F1 Score0.4
              precision    recall  f1-score   support

           0       0.98      0.55      0.71       780
           1       0.01      0.33      0.02         3
           2       0.00      0.00      0.00         2
           3       0.03      0.20      0.05        15

    accuracy                           0.55       800
   macro avg       0.25      0.27      0.19       800
weighted avg       0.96      0.55      0.69       800



In [49]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0,n_estimators=100,max_depth=10)
rfc_avgwv_predictions,rfc_argwv_model=train_predict_evaluate_model(rfc,avg_wv_train_features,train_Y,avg_wv_test_features,validate_Y)
report = classification_report(rfc_avgwv_predictions,validate_Y)
print(report)


Accuracy0.56
Percision0.52
Recall0.56
F1 Score0.41
              precision    recall  f1-score   support

           0       0.99      0.56      0.72       781
           1       0.01      0.50      0.02         2
           2       0.02      0.50      0.03         4
           3       0.04      0.38      0.08        13

    accuracy                           0.56       800
   macro avg       0.27      0.49      0.21       800
weighted avg       0.97      0.56      0.70       800



In [53]:
from xgboost import XGBClassifier
xgb = XGBClassifier(max_depth=15,
                      learning_rate=0.1,
                      n_estimators=2000,
                      min_child_weight=5,
                      max_delta_step=0,
                      subsample=0.8,
                      colsample_bytree=0.7,
                      reg_alpha=0,
                      reg_lambda=0.4,
                      scale_pos_weight=0.8,
                      silent=True,
                      objective='binary:logistic',
                      missing=None,
                      eval_metric='auc',
                      seed=1440,
                      gamma=0)
xgb_avgwv_predictions,xgb_argwv_model=train_predict_evaluate_model(xgb,avg_wv_train_features,train_Y,avg_wv_test_features,validate_Y)
report = classification_report(xgb_avgwv_predictions,validate_Y)
print(report)


Accuracy0.52
Percision0.43
Recall0.52
F1 Score0.44
              precision    recall  f1-score   support

           0       0.88      0.58      0.70       667
           1       0.07      0.24      0.10        34
           2       0.07      0.26      0.11        34
           3       0.14      0.25      0.18        65

    accuracy                           0.52       800
   macro avg       0.29      0.33      0.27       800
weighted avg       0.75      0.52      0.60       800



In [None]:
df_test = pd.read_csv("simplifyweibo_4_moods_test.csv",sep=',')
test_X = df_train['content']
test_X = clean_CN(test_X)
avg_wv_test_features = averaged_word_vectorizer(corpus=test_X, model=model, num_features=100)
predictions = svm_tfidf_model.predict(avg_wv_test_features).tolist()
print(predictions)
