In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier

## 3. Negative classifier

In [None]:
# Sentence encoder used to embed job descriptions for the negative classifier.
bert_model = SentenceTransformer('bert_negative_model')

# Hand-labelled jobs; rows without a cleaned description cannot be embedded.
label_df = pd.read_csv('labelled_data/all_labelled_jobs.csv')
# reset_index() renumbers the rows 0..n-1 after the filter (the previous index is
# kept as an 'index' column).
label_df = label_df[~label_df['clean_job_description'].isna()].reset_index()

# Annotation code -> binary training label (each code listed exactly once).
code2label = {
    'YES': 1,
    'YES?': 1,
    'YES/VENDOR': 1,
    'VENDOR': 1,
    'VENDOR?': 1,
    'MAYBE': 0,
    'NO': 0,
    'NO?': 0,
}
label_df['label'] = label_df['COMPLIANCE?'].map(code2label)

# Codes missing from code2label (including empty cells) fall back to 0 below;
# report them so typos or newly introduced codes do not go unnoticed.
unmapped_codes = label_df.loc[label_df['label'].isna(), 'COMPLIANCE?']
if not unmapped_codes.empty:
    print('unmapped COMPLIANCE? codes treated as 0:',
          unmapped_codes.value_counts(dropna=False).to_dict())

# map() leaves NaN for unmapped codes, which makes the column float; store the
# final class labels as integers.
label_df['label'] = label_df['label'].fillna(0).astype(int)

### Step 1. Train XGBoost

In [None]:
# Embed every labelled description. A plain list is passed so the encoder never
# depends on the Series' index labels.
X = bert_model.encode(label_df['clean_job_description'].tolist())
y = label_df['label']

# Train an ensemble: each member is fitted on its own random undersample of the
# labelled data, so the members see different subsets of the data.
n_repeat = 10
xgboost_models = []
print('training models...')
for i in tqdm(range(n_repeat)):
    # Seed with the loop index: every member gets a different subsample, yet a
    # Restart & Run All reproduces exactly the same ensemble.
    model = XGBClassifier(random_state=i)
    undersampler = RandomUnderSampler(random_state=i)
    X_sub, y_sub = undersampler.fit_resample(X, y)
    model.fit(X_sub, y_sub)
    xgboost_models.append(model)

### Step 2. Apply XGBoost

In [None]:
# A row is flagged when the mean of the ensemble's predicted labels is at least
# this value.
THRESHOLD = 0.8

print('use models...')
negative_df = pd.read_csv('data_for_filtering/positive_identifier_v2_07252022.csv')

# Keep only rows that have a cleaned description and whose
# positive_identifier_v2_label equals 1.
has_description = ~negative_df['clean_job_description'].isna()
passed_positive = negative_df['positive_identifier_v2_label'] == 1
# NOTE(review): reset_index() keeps the old index as an 'index' column, which is
# then written to the output CSV below -- confirm downstream consumers expect it.
negative_df = negative_df[has_description & passed_positive].reset_index()
new_X = bert_model.encode(negative_df['clean_job_description'])

# Add up the predicted labels of every ensemble member, then average them.
negative_df['negative_predict'] = 0
for model_idx in range(n_repeat):
    member_votes = xgboost_models[model_idx].predict(new_X)
    negative_df['negative_predict'] = negative_df['negative_predict'] + member_votes
negative_df['negative_predict'] = negative_df['negative_predict'] / n_repeat

negative_df['negative_identifier_label'] = negative_df['negative_predict'] >= THRESHOLD

print('negative_df.shape:', negative_df.shape)
print('results:', negative_df['negative_identifier_label'].value_counts())

# Results go to the TEST file; the dated output path used previously is kept
# below, commented out, for reference.
negative_df.to_csv('data_for_filtering/negative_identifier_v3_TEST.csv', index=False)
# negative_df.to_csv('data_for_filtering/negative_identifier_v3_08032022.csv', index=False)