In [1]:
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pickle
import numpy as np

## 导入词嵌入模型 

In [6]:
with open('./embedding_models/bitcoin_jieba.pkl', 'rb') as fr:
    data = pickle.load(fr)
    bitcoin_jieba_embeddings = data['embeddings']
with open('./embedding_models/sohu_jieba.pkl', 'rb') as fr:
    data = pickle.load(fr)
    sohu_jieba_embeddings = data['embeddings']


## 指定主题的向量打上1标签，随机主题的0标签 

In [7]:
X = []
Y = []
for b in bitcoin_jieba_embeddings:
    X.append(bitcoin_jieba_embeddings[b])
    Y.append('1')
for s in sohu_jieba_embeddings:
    X.append(sohu_jieba_embeddings[s])
    Y.append('0')
X, Y = np.array(X), np.array(Y)

## 不调参数，直接训练 

In [9]:
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, train_size=0.7, random_state=0)
model = XGBClassifier()
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [10]:
Y_train_pred = model.predict(X_train)
train_is_match = [ int(i==j) for i in Y_train for j in Y_train_pred ]
print(sum(train_is_match)/len(train_is_match))

Y_test_pred = model.predict(X_test)
test_is_match = [ int(i==j) for i in Y_test for j in Y_test_pred ]
print(sum(test_is_match)/len(test_is_match))

0.7260046602189665
0.7521545090797168


## 调参数后训练 (过程已省略) 

In [13]:
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, train_size=0.7, random_state=0)
model = XGBClassifier(
    learning_rate =0.05,
    n_estimators=840,
    max_depth=4,
    min_child_weight=1,
    subsample=0.6,
    colsample_bytree=0.6,
    nthread=4,
    scale_pos_weight=0.001,
    seed=27,
    max_delta_step=2
)
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.05, max_delta_step=2, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=840, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=27, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=0.001, seed=27, subsample=0.6,
              tree_method=None, validate_parameters=False, verbosity=None)

In [14]:
Y_train_pred = model.predict(X_train)
train_is_match = [ int(i==j) for i in Y_train for j in Y_train_pred ]
print(sum(train_is_match)/len(train_is_match))

Y_test_pred = model.predict(X_test)
test_is_match = [ int(i==j) for i in Y_test for j in Y_test_pred ]
print(sum(test_is_match)/len(test_is_match))

0.8361581920903954
0.8304093567251462
