In [1]:
import numpy as np
import pandas as pd
import joblib

# Model running

## 1. Heimin extract models
- features
    - article_vectors (appearance)
    - au_score (similarity)
    - tfidf_cosine_once_keyword
    - sim_hash

In [2]:
# Load the training features (X1), training labels (Y1) and test features (X2)
# of the "heimin extract" feature set from .npy files in the working directory.
X1, Y1, X2 = (
    np.load(npy_path)
    for npy_path in (
        "heimin_extract_train_x.npy",
        "heimin_extract_train_y.npy",
        "heimin_extract_test_x.npy",
    )
)

In [3]:
# Encode the raw training labels Y1 as integer class ids with a LabelEncoder.
# The fitted encoder stays available as `le`; the encoded labels are stored in `y`.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(Y1)
y  # display the encoded label array as the cell output

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [4]:
# Split the labelled data into a training part and a held-out part:
# test_size=0.3 reserves 30% of the rows for evaluation and random_state=1
# fixes the shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=1)

In [5]:
# Train an XGBoost classifier on the training split.
from sklearn.utils import class_weight
from xgboost.sklearn import XGBClassifier

# 'balanced' per-sample weights compensate for the class imbalance between
# class 0 and class 1. train_sample is used for fitting; test_sample is kept
# for the class-balanced metrics computed in the evaluation cells.
train_sample = class_weight.compute_sample_weight('balanced', y1_train)
test_sample = class_weight.compute_sample_weight('balanced', y1_test)

# Fixed seed so the fitted model is reproducible.
# Predictions for X1_test are produced by the dedicated predict cell that follows,
# so this cell only fits; the trailing ';' keeps the estimator repr out of the output.
clf3 = XGBClassifier(seed=1)
clf3.fit(X1_train, y1_train, sample_weight=train_sample);



              precision    recall  f1-score   support

           0     0.9998    0.9987    0.9992     93157
           1     0.7663    0.9524    0.8493       420

    accuracy                         0.9985     93577
   macro avg     0.8830    0.9755    0.9242     93577
weighted avg     0.9987    0.9985    0.9986     93577

              precision    recall  f1-score   support

           0     0.9545    0.9987    0.9761 46788.499999922635
           1     0.9986    0.9524    0.9750 46788.50000000031

    accuracy                         0.9755 93576.99999992295
   macro avg     0.9766    0.9755    0.9755 93576.99999992295
weighted avg     0.9766    0.9755    0.9755 93576.99999992295



In [6]:
# Predict class labels for the held-out split X1_test with the fitted classifier clf3.
y1_pred = clf3.predict(X1_test)

In [7]:
# Class-balanced accuracy on the held-out split: every sample is weighted
# by its entry in test_sample before the accuracy is computed.
from sklearn.metrics import accuracy_score

accuracy_score(y1_test, y1_pred, sample_weight=test_sample)

0.9755356676414673

In [8]:
# Print two reports for the held-out split: first unweighted, then with the
# class-balanced sample weights in test_sample.
from sklearn.metrics import classification_report

print(
    classification_report(y1_test, y1_pred, digits=4),
    classification_report(y1_test, y1_pred, digits=4, sample_weight=test_sample),
    sep="\n",
)

              precision    recall  f1-score   support

           0     0.9998    0.9987    0.9992     93157
           1     0.7663    0.9524    0.8493       420

    accuracy                         0.9985     93577
   macro avg     0.8830    0.9755    0.9242     93577
weighted avg     0.9987    0.9985    0.9986     93577

              precision    recall  f1-score   support

           0     0.9545    0.9987    0.9761 46788.499999922635
           1     0.9986    0.9524    0.9750 46788.50000000031

    accuracy                         0.9755 93576.99999992295
   macro avg     0.9766    0.9755    0.9755 93576.99999992295
weighted avg     0.9766    0.9755    0.9755 93576.99999992295



In [10]:
# Predict class labels for the stage-2 feature matrix X2 with the fitted classifier clf3.
Y2 = clf3.predict(X2)

In [18]:
# Attach the stage-2 predictions to the stage-2 pair table and keep only the
# rows predicted as class 1.
stage2_res_df = pd.read_csv("stage2_result_labels.csv")

# The predictions are matched to the table row by row, so there must be exactly
# one prediction per row. Without this check a length mismatch would be hidden
# by index alignment: unmatched rows become NaN and the `== 1` filter silently
# drops them.
assert len(stage2_res_df) == len(Y2), (
    f"stage2_result_labels.csv has {len(stage2_res_df)} rows "
    f"but Y2 holds {len(Y2)} predictions"
)

stage2_results = pd.Series(Y2)
stage2_res_df['Result'] = stage2_results
stage2_res_df = stage2_res_df.loc[stage2_res_df['Result'] == 1]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
26,1001,1064,0.75,1
155,1001,19,1.00,1
382,1001,889,1.00,1
1545,1008,573,1.00,1
1546,1008,574,1.00,1
...,...,...,...,...
171176,960,426,1.00,1
171195,960,464,0.75,1
171323,960,889,1.00,1
174945,991,407,1.00,1


In [31]:
# Keep only the predicted-positive pairs whose Similarity is at least 0.78.
# The mask is built from stage2_res_df itself: the name `stage2_df` is not
# defined by any cell above, so referencing it raises NameError on a fresh
# kernel run.
stage2_res_df = stage2_res_df.loc[stage2_res_df.Similarity >= 0.78]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.0,1
382,1001,889,1.0,1
1545,1008,573,1.0,1
1546,1008,574,1.0,1
1703,1009,1061,1.0,1
...,...,...,...,...
171094,960,177,1.0,1
171176,960,426,1.0,1
171323,960,889,1.0,1
174945,991,407,1.0,1


In [30]:
# Export the remaining (Test, Reference) pairs to CSV, without the index column.
stage2_res_df.to_csv(
    "csv_results/XBOOST_heimin_extract_78.csv",
    columns=["Test", "Reference"],
    index=False,
)

In [34]:
# save the model
# Persist the fitted classifier clf3 to disk with joblib; the cell output is
# joblib.dump's return value (the list of written filenames).
# NOTE(review): the recorded output of this cell lists a different filename than
# the one used here, so it appears to come from an earlier run — re-run to refresh.
filename = '1_XGBOST_feature_extract_model.sav'
joblib.dump(clf3, filename)

['XBOOST_hemin_extract_model.sav']

In [3]:
# Reload the persisted classifier from disk and display it to inspect its parameters.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# were produced by a trusted source (e.g. the save cell of this notebook).
filename = '1_XGBOST_feature_extract_model.sav'
loaded_model = joblib.load(filename)
loaded_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

## ~~2~~. XGBoost model using only similarity features (not good, don't use)
- features order
    - similarity(au_score), jaccard_once, jaccard_multi, tf_cosine_once_keyword, tfidf_cosine_once_keyword, tf_cosine_multi_keyword, tfidf_cosine_multi_keyword, sim_hash

In [10]:
# Load the similarity-only training features (X1), the training labels (Y1)
# and the similarity-only test features (X2) from the FeatureExtracting folder.
X1, Y1, X2 = (
    np.load(npy_path)
    for npy_path in (
        "../FeatureExtracting/X1_only_similarities.npy",
        "../FeatureExtracting/Y1.npy",
        "../FeatureExtracting/X2_only_similarities.npy",
    )
)

In [12]:
# Sanity check: length of X2 (its first dimension), i.e. how many stage-2 samples will be predicted.
len(X2)

176820

In [13]:
# Encode the raw training labels Y1 as integer class ids with a LabelEncoder.
# The fitted encoder stays available as `le`; the encoded labels are stored in `y`.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(Y1)
y  # display the encoded label array as the cell output

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
# Split the labelled data into a training part and a held-out part:
# test_size=0.3 reserves 30% of the rows for evaluation and random_state=1
# fixes the shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=1)

In [15]:
# Train an XGBoost classifier on the training split.
from sklearn.utils import class_weight
from xgboost.sklearn import XGBClassifier

# 'balanced' per-sample weights compensate for the class imbalance between
# class 0 and class 1. train_sample is used for fitting; test_sample is kept
# for the class-balanced metrics computed in the evaluation cells.
train_sample = class_weight.compute_sample_weight('balanced', y1_train)
test_sample = class_weight.compute_sample_weight('balanced', y1_test)

# Fixed seed so the fitted model is reproducible.
# Predictions for X1_test are produced by the dedicated predict cell that follows,
# so this cell only fits; the trailing ';' keeps the estimator repr out of the output.
clf3 = XGBClassifier(seed=1)
clf3.fit(X1_train, y1_train, sample_weight=train_sample);





In [16]:
# Predict class labels for the held-out split X1_test with the fitted classifier clf3.
y1_pred = clf3.predict(X1_test)

In [17]:
# Class-balanced accuracy on the held-out split: every sample is weighted
# by its entry in test_sample before the accuracy is computed.
from sklearn.metrics import accuracy_score

accuracy_score(y1_test, y1_pred, sample_weight=test_sample)

0.9250848158536251

In [18]:
# Print two reports for the held-out split: first unweighted, then with the
# class-balanced sample weights in test_sample.
from sklearn.metrics import classification_report

print(
    classification_report(y1_test, y1_pred, digits=4),
    classification_report(y1_test, y1_pred, digits=4, sample_weight=test_sample),
    sep="\n",
)

              precision    recall  f1-score   support

           0     0.9993    0.9978    0.9986     93157
           1     0.6348    0.8524    0.7276       420

    accuracy                         0.9971     93577
   macro avg     0.8170    0.9251    0.8631     93577
weighted avg     0.9977    0.9971    0.9973     93577

              precision    recall  f1-score   support

           0     0.8711    0.9978    0.9302 46788.499999922635
           1     0.9974    0.8524    0.9192 46788.50000000031

    accuracy                         0.9251 93576.99999992295
   macro avg     0.9343    0.9251    0.9247 93576.99999992295
weighted avg     0.9343    0.9251    0.9247 93576.99999992295



In [19]:
# Predict class labels for the stage-2 feature matrix X2 with the fitted classifier clf3.
Y2 = clf3.predict(X2)

In [25]:
# Attach the stage-2 predictions to the stage-2 pair table and keep only the
# rows predicted as class 1.
stage2_res_df = pd.read_csv("stage2_result_labels.csv")

# The predictions are matched to the table row by row, so there must be exactly
# one prediction per row. Without this check a length mismatch would be hidden
# by index alignment: unmatched rows become NaN and the `== 1` filter silently
# drops them.
assert len(stage2_res_df) == len(Y2), (
    f"stage2_result_labels.csv has {len(stage2_res_df)} rows "
    f"but Y2 holds {len(Y2)} predictions"
)

stage2_results = pd.Series(Y2)
stage2_res_df['Result'] = stage2_results
stage2_res_df = stage2_res_df.loc[stage2_res_df['Result'] == 1]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.000000,1
382,1001,889,1.000000,1
406,1001,960,0.750000,1
1354,1008,128,1.000000,1
1545,1008,573,1.000000,1
...,...,...,...,...
174736,991,1036,0.666667,1
175109,991,908,1.000000,1
175135,991,990,1.000000,1
176162,996,269,0.923077,1


In [30]:
# Drop weak matches: retain only rows whose Similarity is 0.78 or higher,
# then display the filtered table.
stage2_res_df = stage2_res_df[stage2_res_df["Similarity"].ge(0.78)]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.000000,1
382,1001,889,1.000000,1
1354,1008,128,1.000000,1
1545,1008,573,1.000000,1
1546,1008,574,1.000000,1
...,...,...,...,...
171323,960,889,1.000000,1
175109,991,908,1.000000,1
175135,991,990,1.000000,1
176162,996,269,0.923077,1


In [31]:
# Export the remaining (Test, Reference) pairs to CSV, without the index column.
stage2_res_df.to_csv(
    "csv_results/XBOOST_similarities_78.csv",
    columns=["Test", "Reference"],
    index=False,
)

In [32]:
# save the model
# Persist the fitted classifier clf3 to disk with joblib; the cell output is
# joblib.dump's return value (the list of written filenames).
filename = 'XBOOST_similarities_78.sav'
joblib.dump(clf3, filename)

['XBOOST_similarities_78.sav']

## 5. XGBoost feature times models
- features order
    - similarity(au_score), jaccard_once, jaccard_multi, tf_cosine_once_keyword, tfidf_cosine_once_keyword, tf_cosine_multi_keyword, tfidf_cosine_multi_keyword, sim_hash
    - article_vectors (times)

In [33]:
# Load the full training features (X1), the training labels (Y1) and the
# full test features (X2) from the FeatureExtracting folder.
X1, Y1, X2 = (
    np.load(npy_path)
    for npy_path in (
        "../FeatureExtracting/X1_all.npy",
        "../FeatureExtracting/Y1.npy",
        "../FeatureExtracting/X2_all.npy",
    )
)

In [35]:
# Sanity check: length of X2 (its first dimension), i.e. how many stage-2 samples will be predicted.
len(X2)

176820

In [36]:
# Encode the raw training labels Y1 as integer class ids with a LabelEncoder.
# The fitted encoder stays available as `le`; the encoded labels are stored in `y`.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(Y1)
y  # display the encoded label array as the cell output

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
# Split the labelled data into a training part and a held-out part:
# test_size=0.3 reserves 30% of the rows for evaluation and random_state=1
# fixes the shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=1)

In [38]:
# Train an XGBoost classifier on the training split.
from sklearn.utils import class_weight
from xgboost.sklearn import XGBClassifier

# 'balanced' per-sample weights compensate for the class imbalance between
# class 0 and class 1. train_sample is used for fitting; test_sample is kept
# for the class-balanced metrics computed in the evaluation cells.
train_sample = class_weight.compute_sample_weight('balanced', y1_train)
test_sample = class_weight.compute_sample_weight('balanced', y1_test)

# Fixed seed so the fitted model is reproducible.
# Predictions for X1_test are produced by the dedicated predict cell that follows,
# so this cell only fits; the trailing ';' keeps the estimator repr out of the output.
clf3 = XGBClassifier(seed=1)
clf3.fit(X1_train, y1_train, sample_weight=train_sample);





In [39]:
# Predict class labels for the held-out split X1_test with the fitted classifier clf3.
y1_pred = clf3.predict(X1_test)

In [40]:
# Class-balanced accuracy on the held-out split: every sample is weighted
# by its entry in test_sample before the accuracy is computed.
from sklearn.metrics import accuracy_score

accuracy_score(y1_test, y1_pred, sample_weight=test_sample)

0.9613089934708278

In [41]:
# Print two reports for the held-out split: first unweighted, then with the
# class-balanced sample weights in test_sample.
from sklearn.metrics import classification_report

print(
    classification_report(y1_test, y1_pred, digits=4),
    classification_report(y1_test, y1_pred, digits=4, sample_weight=test_sample),
    sep="\n",
)

              precision    recall  f1-score   support

           0     0.9997    0.9988    0.9992     93157
           1     0.7776    0.9238    0.8444       420

    accuracy                         0.9985     93577
   macro avg     0.8886    0.9613    0.9218     93577
weighted avg     0.9987    0.9985    0.9985     93577

              precision    recall  f1-score   support

           0     0.9291    0.9988    0.9627 46788.499999922635
           1     0.9987    0.9238    0.9598 46788.50000000031

    accuracy                         0.9613 93576.99999992295
   macro avg     0.9639    0.9613    0.9613 93576.99999992295
weighted avg     0.9639    0.9613    0.9613 93576.99999992295



In [42]:
# Predict class labels for the stage-2 feature matrix X2 with the fitted classifier clf3.
Y2 = clf3.predict(X2)

In [43]:
# Attach the stage-2 predictions to the stage-2 pair table and keep only the
# rows predicted as class 1.
stage2_res_df = pd.read_csv("stage2_result_labels.csv")

# The predictions are matched to the table row by row, so there must be exactly
# one prediction per row. Without this check a length mismatch would be hidden
# by index alignment: unmatched rows become NaN and the `== 1` filter silently
# drops them.
assert len(stage2_res_df) == len(Y2), (
    f"stage2_result_labels.csv has {len(stage2_res_df)} rows "
    f"but Y2 holds {len(Y2)} predictions"
)

stage2_results = pd.Series(Y2)
stage2_res_df['Result'] = stage2_results
stage2_res_df = stage2_res_df.loc[stage2_res_df['Result'] == 1]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.0,1
382,1001,889,1.0,1
1354,1008,128,1.0,1
1545,1008,573,1.0,1
1546,1008,574,1.0,1
...,...,...,...,...
175684,994,1353,0.0,1
175908,994,765,0.0,1
175911,994,772,0.0,1
175917,994,80,0.0,1


In [51]:
# Preview (without reassigning) the rows that would survive a Similarity >= 0.78 cut.
stage2_res_df[stage2_res_df["Similarity"].ge(0.78)]

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.0,1
382,1001,889,1.0,1
1354,1008,128,1.0,1
1545,1008,573,1.0,1
1546,1008,574,1.0,1
...,...,...,...,...
171094,960,177,1.0,1
171323,960,889,1.0,1
174945,991,407,1.0,1
175016,991,615,1.0,1


In [52]:
# Drop weak matches: retain only rows whose Similarity is 0.78 or higher,
# then display the filtered table.
stage2_res_df = stage2_res_df[stage2_res_df["Similarity"].ge(0.78)]
stage2_res_df

Unnamed: 0,Test,Reference,Similarity,Result
155,1001,19,1.0,1
382,1001,889,1.0,1
1354,1008,128,1.0,1
1545,1008,573,1.0,1
1546,1008,574,1.0,1
...,...,...,...,...
171094,960,177,1.0,1
171323,960,889,1.0,1
174945,991,407,1.0,1
175016,991,615,1.0,1


In [53]:
# Export the remaining (Test, Reference) pairs to CSV, without the index column.
stage2_res_df.to_csv(
    "csv_results/XBOOST_feature_times_78.csv",
    columns=["Test", "Reference"],
    index=False,
)

In [54]:
# save the model
# Persist the fitted classifier clf3 to disk with joblib; the cell output is
# joblib.dump's return value (the list of written filenames).
filename = '5_XGBOST_feature_times_model.sav'
joblib.dump(clf3, filename)

['5_XGBOST_feature_times_model.sav']