## BERT + XGBoost

In [None]:
def train_test_split(pos_matrix, neg_matrix, train_prop=0.7, repeats=6):
    '''
    Shuffle and split positive/negative sample matrices into train/test sets.

    The rows of each matrix are shuffled independently. The first
    ``int(n_rows * train_prop)`` shuffled rows of each class go to the
    training set and the remaining rows go to the test set. The positive
    training rows are then replicated ``repeats`` times; the test rows are
    never replicated.

    The input arrays are left untouched: shuffling is done through a permuted
    row index instead of an in-place shuffle, so callers can keep using
    ``pos_matrix`` and ``neg_matrix`` afterwards.

    pos_matrix: array with one positive sample per row.
    neg_matrix: array with one negative sample per row.
    train_prop: fraction of each class assigned to the training set.
    repeats: replication factor for the positive rows of the training set.

    Returns ``(x_train, y_train, x_test, y_test)``. Positives come first in
    both ``x`` arrays; the label arrays are column vectors of shape ``(n, 1)``
    holding 1.0 for positives and 0.0 for negatives.
    '''
    # Local import: this function is defined before the file's top-level
    # numpy import is executed.
    import numpy as np

    pos_matrix = np.asarray(pos_matrix)
    neg_matrix = np.asarray(neg_matrix)

    # Shuffle via fancy indexing, which returns a copy and keeps the
    # caller's arrays in their original order.
    pos_shuffled = pos_matrix[np.random.permutation(pos_matrix.shape[0])]
    neg_shuffled = neg_matrix[np.random.permutation(neg_matrix.shape[0])]

    # Split each class at the same proportion.
    pos_train_size = int(pos_shuffled.shape[0] * train_prop)
    neg_train_size = int(neg_shuffled.shape[0] * train_prop)

    # Only the training positives are replicated; the split happens first,
    # so no replicated row can end up in the test set.
    train_pos = np.tile(pos_shuffled[:pos_train_size], (repeats, 1))
    train_neg = neg_shuffled[:neg_train_size]
    test_pos = pos_shuffled[pos_train_size:]
    test_neg = neg_shuffled[neg_train_size:]

    x_train = np.vstack((train_pos, train_neg))
    y_train = np.vstack((np.ones((len(train_pos), 1)),
                         np.zeros((len(train_neg), 1))))
    x_test = np.vstack((test_pos, test_neg))
    y_test = np.vstack((np.ones((len(test_pos), 1)),
                        np.zeros((len(test_neg), 1))))
    return x_train, y_train, x_test, y_test

def get_metrics(pre_y, y, prefix="xgb"):
    '''
    Print and return AUC, precision, recall and F1 for a set of predictions.

    pre_y: predicted values, passed as the prediction argument of each
        sklearn metric.
    y: ground-truth labels, passed as the true-label argument.
    prefix: label prepended to every printed metric name. Defaults to
        "xgb" so existing callers print exactly what they printed before;
        pass another value when scoring a different model.

    Returns ``(auc_score, pre_score, rec_score, f_score)``.

    NOTE(review): the same ``pre_y`` is used for ``roc_auc_score`` and for
    the label-based metrics; if callers pass hard class labels, the AUC is
    computed from labels rather than scores -- confirm this is intended.
    '''
    auc_score = roc_auc_score(y, pre_y)
    pre_score = precision_score(y, pre_y)
    rec_score = recall_score(y, pre_y)
    f_score = f1_score(y, pre_y)

    print(prefix + "_auc_score:", auc_score)
    print(prefix + "_pre_score:", pre_score)
    print(prefix + "_rec_score:", rec_score)
    print(prefix + "_f1_score:", f_score)
    return auc_score, pre_score, rec_score, f_score

In [None]:
import numpy as np
import sklearn
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_score,roc_auc_score,recall_score,f1_score

# Load the precomputed positive and negative sample matrices from disk.
pos_matrix, neg_matrix = (
    np.load("data/bert/matrix/pos_matrix.npy"),
    np.load("data/bert/matrix/neg_matrix.npy"),
)
# Shuffle and split 70/30; `repeats` is left at train_test_split's default.
x_train, y_train, x_test, y_test = train_test_split(
    pos_matrix=pos_matrix,
    neg_matrix=neg_matrix,
    train_prop=0.7,
)
print("Data processing completed")

# Fit an XGBoost classifier using the library's default hyperparameters.
xgbc = XGBClassifier()
xgbc.fit(X=x_train, y=y_train)
print("xgboost training completed")

In [None]:
# Score the fitted classifier on the training split first, then on the
# held-out test split. get_metrics prints AUC, precision, recall and F1 and
# returns them as a tuple.
# NOTE(review): the training split contains replicated positive rows (see
# train_test_split), so the training metrics are not comparable to the test
# metrics one-to-one.
# NOTE(review): the output of predict() is also what get_metrics feeds to
# roc_auc_score; presumably these are class labels, not probabilities --
# confirm that is the intended AUC.
pre_train = xgbc.predict(x_train)
get_metrics(pre_train,y_train)
print("----------")
pre_test = xgbc.predict(x_test)
get_metrics(pre_test,y_test)

## word2vec + XGBoost

In [None]:
import numpy as np
import sklearn
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_score,roc_auc_score,recall_score,f1_score
from gensim.models import Word2Vec  

#load data
def _mean_vector_matrix(path, w2v_model):
    '''
    Build a matrix with one averaged word vector per non-empty line of a file.

    Each line of ``path`` is split on whitespace; tokens missing from the
    model's vocabulary are dropped, and lines with no known token are
    skipped. The remaining tokens' vectors are averaged into one row.

    Returns a float64 array with ``w2v_model.vector_size`` columns; it has
    zero rows when no line contributes a vector.
    '''
    keyed_vectors = w2v_model.wv
    # Seed with an empty float64 block so the result keeps a well-defined
    # 2-D shape and dtype even when every line is skipped.
    rows = [np.zeros((0, w2v_model.vector_size))]
    with open(path, encoding='utf8') as f:
        for line in f:
            # Membership is tested on the KeyedVectors object, the same
            # object the vectors are read from.
            words = [w for w in line.split() if w in keyed_vectors]
            if not words:
                continue
            rows.append(keyed_vectors[words].mean(0))
    # Stack once at the end instead of re-copying the matrix for every line.
    return np.vstack(rows)


model = Word2Vec.load("model/word2vec/word2vec")
print("word2vec model loaded.")
pos_matrix = _mean_vector_matrix("data/samples/positive.txt", model)
print("positive matrix completed.")
neg_matrix = _mean_vector_matrix("data/samples/negative.txt", model)
print("negative matrix completed.")
x_train, y_train, x_test, y_test=train_test_split(pos_matrix, neg_matrix, train_prop=0.7,repeats=4)
print("Data processing completed")

# Fit an XGBoost classifier with default hyperparameters and persist it.
xgbc = XGBClassifier()
xgbc.fit(x_train,y_train)
print("xgboost training completed")
xgbc.save_model("model/xgb/xgb.word2vec.model")

In [None]:
# Score the fitted classifier on the training split first, then on the
# held-out test split. get_metrics prints AUC, precision, recall and F1 and
# returns them as a tuple.
# NOTE(review): the training split contains replicated positive rows (see
# train_test_split), so the training metrics are not comparable to the test
# metrics one-to-one.
# NOTE(review): the output of predict() is also what get_metrics feeds to
# roc_auc_score; presumably these are class labels, not probabilities --
# confirm that is the intended AUC.
pre_train = xgbc.predict(x_train)
get_metrics(pre_train,y_train)
print("----------")
pre_test = xgbc.predict(x_test)
get_metrics(pre_test,y_test)

In [None]:
# SVM baseline: performs worse than XGBoost on this data.
from sklearn.svm import LinearSVC

clf=LinearSVC()
# train_test_split returns the labels as an (n, 1) column vector; flatten
# them for scikit-learn, which expects a 1-D target and otherwise emits a
# DataConversionWarning.
clf.fit(X=x_train,y=y_train.ravel())

# NOTE: get_metrics prints its results under "xgb_" labels by default, even
# though the scores below belong to the LinearSVC model.
pre_train=clf.predict(x_train)
get_metrics(pre_train,y_train)
print("----------")
pre_test = clf.predict(x_test)
get_metrics(pre_test,y_test)