In [3]:
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from sentence_transformers import SentenceTransformer

## 2. Positive classifier

In [6]:
# Keyword-filtered job postings; rows with label == 1 are used as positives.
full_df = pd.read_csv('../data_for_filtering/job_with_compliance_keywords.csv')
is_positive = full_df['label'] == 1
positive_df = full_df.loc[is_positive, ['label', 'clean_job_description']]

# Rows from the negative-identifier output file are stacked under the positives
# to form the training set. reset_index() keeps the old index as an 'index' column.
# NOTE(review): presumably this file carries 'label' and 'clean_job_description'
# columns as well -- confirm against the file.
negative_df = pd.read_csv('../negative_identifier_output_6232022.csv')
job_df = pd.concat([positive_df, negative_df]).reset_index()

# Rows to be scored later: label == 0 and a non-missing description.
has_description = full_df['clean_job_description'].notna()
is_unlabelled = full_df['label'] == 0
label_df = full_df[has_description & is_unlabelled].reset_index()

In [None]:
tqdm.pandas()

SEED = 42  # base seed; model i uses SEED + i so reruns reproduce while models still differ
VOTE_THRESHOLD = 0.3  # minimum fraction of ensemble votes required to flag a row as positive


def train_undersampled_ensemble(features, labels, n_models, seed=None):
    """Train ``n_models`` XGBoost classifiers, each on its own random subsample.

    For every model the data is randomly under-sampled with
    ``RandomUnderSampler``, then split 50/50 with ``train_test_split``; the
    model is fitted on the training half only and the other half is discarded.

    Args:
        features: Feature matrix accepted by ``RandomUnderSampler.fit_resample``
            (dense array or sparse matrix).
        labels: Target labels aligned with ``features``.
        n_models: Number of classifiers to train.
        seed: Optional base seed. Model ``i`` uses ``seed + i`` for both the
            under-sampling and the split; ``None`` leaves them unseeded.

    Returns:
        List of fitted ``XGBClassifier`` instances, in training order.
    """
    models = []
    for i in tqdm(range(n_models), total=n_models):
        model_seed = None if seed is None else seed + i
        undersampler = RandomUnderSampler(random_state=model_seed)
        X_sub, y_sub = undersampler.fit_resample(features, labels)
        # The held-out half is intentionally unused; the split only serves to
        # give each ensemble member a different random half of the subsample.
        X_train, _, y_train, _ = train_test_split(
            X_sub, y_sub, test_size=0.5, random_state=model_seed
        )
        model = XGBClassifier()
        model.fit(X_train, y_train)
        models.append(model)
    return models


n_features = 100
print('n features:', n_features)
tf_idf = TfidfVectorizer(max_features=n_features)
# Alternatives: paraphrase-MiniLM-L3-v2; all-mpnet-base-v2; all-MiniLM-L12-v2
# (see https://sbert.net/docs/pretrained_models.html)
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
print('start encoding X...')
X = bert_model.encode(job_df['clean_job_description'])
y = job_df['label']
X_1 = tf_idf.fit_transform(job_df['clean_job_description'])

n_repeat = 10
print('training models...')
bert_models = train_undersampled_ensemble(X, y, n_repeat, seed=SEED)
tfidf_models = train_undersampled_ensemble(X_1, y, n_repeat, seed=SEED)

print('start encoding X...')
new_X = bert_model.encode(label_df['clean_job_description'])
# Use transform(), not fit_transform(): the vectorizer must keep the vocabulary
# and IDF weights learned on job_df, otherwise the columns no longer line up
# with the features the TF-IDF models were trained on.
new_X_1 = tf_idf.transform(label_df['clean_job_description'])

# Sum the per-model predictions from both ensembles, then convert the total to
# the fraction of the 2 * n_repeat models that voted for each row.
votes = 0
for model in bert_models:
    votes = votes + model.predict(new_X)
for model in tfidf_models:
    votes = votes + model.predict(new_X_1)
label_df['y_predict'] = votes / (n_repeat * 2)
label_df['positive_identifier_v2_label'] = label_df['y_predict'] >= VOTE_THRESHOLD

# NOTE(review): the input CSV was read from '../data_for_filtering/', while this
# writes to 'data_for_filtering/' relative to the working directory -- confirm
# the output location is intended.
label_df.to_csv('data_for_filtering/positive_identifier_v2_TEST.csv', index=False)
# label_df.to_csv('data_for_filtering/positive_identifier_v2_07252022.csv', index=False)
print('label_df shape:', label_df.shape, 'prediction results:', label_df['positive_identifier_v2_label'].value_counts())

n features: 100
start encoding X...
