In [1]:
import numpy as np
import pandas as pd
import joblib

# Model running

## 1. Heimin extract models
- features
    - article_vectors (appearance)
    - au_score (similarity)
    - tfidf_cosine_once_keyword
    - sim_hash

In [2]:
# Load the pre-computed arrays from disk, in this order:
#   X1 - training features, Y1 - training labels, X2 - stage-2 (test) features.
X1, Y1, X2 = (
    np.load(npy_path)
    for npy_path in (
        "heimin_extract_train_x.npy",
        "heimin_extract_train_y.npy",
        "heimin_extract_test_x.npy",
    )
)

In [3]:
# Map the raw training labels in Y1 to consecutive integer class ids.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then encode those same labels.
# `le` is kept so that the integer ids can be mapped back via le.classes_.
le = LabelEncoder().fit(Y1)
y = le.transform(Y1)
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [4]:
# Hold out 30% of the labelled data for evaluation; the fixed random_state
# makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

# NOTE(review): the reports further down show a heavily imbalanced target
# (420 positives vs 93157 negatives in the hold-out). Passing stratify=y would
# keep that ratio identical in both splits - confirm whether that is wanted.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y,
    test_size=0.3,
    random_state=1,
)

In [5]:
# Train an XGBoost classifier on the training split, compensating for class
# imbalance with per-sample weights.
from sklearn.utils import class_weight
from xgboost.sklearn import XGBClassifier

# 'balanced' sample weights are inversely proportional to class frequency, so
# both classes contribute the same total weight.
# train_sample is used for fitting; test_sample is used later for the weighted
# evaluation metrics.
train_sample = class_weight.compute_sample_weight('balanced', y1_train)
test_sample = class_weight.compute_sample_weight('balanced', y1_test)

# random_state is the documented seed parameter of the scikit-learn wrapper
# (the bare `seed` keyword is a legacy alias).
clf3 = XGBClassifier(random_state=1)
clf3.fit(X1_train, y1_train, sample_weight=train_sample)

# predict hard class labels for the held-out split
y1_pred = clf3.predict(X1_test)



              precision    recall  f1-score   support

           0     0.9998    0.9987    0.9992     93157
           1     0.7663    0.9524    0.8493       420

    accuracy                         0.9985     93577
   macro avg     0.8830    0.9755    0.9242     93577
weighted avg     0.9987    0.9985    0.9986     93577

              precision    recall  f1-score   support

           0     0.9545    0.9987    0.9761 46788.499999922635
           1     0.9986    0.9524    0.9750 46788.50000000031

    accuracy                         0.9755 93576.99999992295
   macro avg     0.9766    0.9755    0.9755 93576.99999992295
weighted avg     0.9766    0.9755    0.9755 93576.99999992295



In [6]:
# predict
# Predict class labels for the held-out split with the fitted classifier.
# This is the same call that ends the training cell, so re-running it simply
# refreshes y1_pred.
y1_pred = clf3.predict(X1_test)

In [7]:
# Class-balanced accuracy on the held-out split: every sample is weighted by
# test_sample, so the rare positive class counts as much as the negative one.
from sklearn.metrics import accuracy_score

accuracy_score(y1_test, y1_pred, sample_weight=test_sample)

0.9755356676414673

In [8]:
from sklearn.metrics import classification_report

# Print two reports for the held-out split: first unweighted (sample_weight=None,
# the library default), then with the class-balanced sample weights.
for report_weights in (None, test_sample):
    print(
        classification_report(
            y1_test,
            y1_pred,
            digits=4,
            sample_weight=report_weights,
        )
    )

              precision    recall  f1-score   support

           0     0.9998    0.9987    0.9992     93157
           1     0.7663    0.9524    0.8493       420

    accuracy                         0.9985     93577
   macro avg     0.8830    0.9755    0.9242     93577
weighted avg     0.9987    0.9985    0.9986     93577

              precision    recall  f1-score   support

           0     0.9545    0.9987    0.9761 46788.499999922635
           1     0.9986    0.9524    0.9750 46788.50000000031

    accuracy                         0.9755 93576.99999992295
   macro avg     0.9766    0.9755    0.9755 93576.99999992295
weighted avg     0.9766    0.9755    0.9755 93576.99999992295



In [10]:
# predict stage 2
# Apply the fitted classifier to the stage-2 feature matrix X2 (loaded from
# "heimin_extract_test_x.npy"); Y2 holds one predicted class label per row.
Y2 = clf3.predict(X2)

In [18]:
# Read the stage-2 pair table and attach the model predictions to it, then
# keep only the pairs predicted as positive (Result == 1).
stage2_res_df = pd.read_csv("stage2_result_labels.csv")

# Assigning a Series aligns on the index, so a row-count mismatch between the
# predictions and the table would silently truncate or NaN-fill 'Result'.
# Fail loudly instead.
# NOTE(review): this assumes the CSV rows are in the same order as the rows of
# X2 - confirm against the code that produced both files.
if len(Y2) != len(stage2_res_df):
    raise ValueError(
        "Prediction count ({}) does not match number of rows in "
        "stage2_result_labels.csv ({})".format(len(Y2), len(stage2_res_df))
    )

stage2_results = pd.Series(Y2)
stage2_res_df['Result'] = stage2_results
stage2_res_df = stage2_res_df.loc[stage2_res_df['Result'] == 1]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
26,1001,1064,0.75,1
155,1001,19,1.00,1
382,1001,889,1.00,1
1545,1008,573,1.00,1
1546,1008,574,1.00,1
...,...,...,...,...
171176,960,426,1.00,1
171195,960,464,0.75,1
171323,960,889,1.00,1
174945,991,407,1.00,1


In [31]:
# Drop low-confidence matches: keep only predicted pairs whose Similarity is
# at least 0.78 (the "78" in the exported file name).
# The mask is built from stage2_res_df itself; the previous code referenced
# `stage2_df`, a name that is never defined in this notebook and therefore
# only worked when such a variable was left over in the kernel.
stage2_res_df = stage2_res_df.loc[stage2_res_df['Similarity'] >= 0.78]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.0,1
382,1001,889,1.0,1
1545,1008,573,1.0,1
1546,1008,574,1.0,1
1703,1009,1061,1.0,1
...,...,...,...,...
171094,960,177,1.0,1
171176,960,426,1.0,1
171323,960,889,1.0,1
174945,991,407,1.0,1


In [30]:
# Export the surviving (Test, Reference) pairs as the final answer file.
# The row index is not written.
stage2_answer_pairs = stage2_res_df[["Test", "Reference"]]
stage2_answer_pairs.to_csv(
    "csv_results/XBOOST_heimin_extract_78.csv",
    index=False,
)

In [34]:
# save the model
# Persist the fitted classifier to disk with joblib so it can be reloaded
# without retraining.
# NOTE(review): the output recorded under this cell lists a different file
# name, presumably from an earlier run - re-run the cell to refresh it.
filename = '1_XGBOST_feature_extract_model.sav'
joblib.dump(clf3, filename)

['XBOOST_hemin_extract_model.sav']

In [3]:
# Reload the classifier saved by the joblib.dump cell and display its
# parameters as a sanity check.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code, so
# only load model files that come from a trusted source.
filename = '1_XGBOST_feature_extract_model.sav'
loaded_model = joblib.load(filename)
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)