In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
import numpy as np
import torch
import transformers as ppb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import ast


In [2]:
df = pd.read_csv('Scraping_Data_1000.csv')
work_model_mapping = {
    'Remote': 2,
    'Hybrid': 1,
    'On-site': 0
}

df['label'] = df['work_model'].map(work_model_mapping)
df = df.rename(columns={'dict.about': 'job_description'})

df.head(20)

Unnamed: 0,url,work_model,dict.job_title,dict.company_name,dict.location,job_description,dict.seniority_level,dict.employment_type,label
0,https://www.linkedin.com/jobs/view/coordinator...,Remote,"Coordinator, Digital Content",Cleveland Cavaliers,"Cleveland, Ohio, Estados Unidos",Cavaliers Holdings LLC is committed to deliver...,,,2
1,https://www.linkedin.com/jobs/view/instruction...,Remote,Instructional Designer,Zillow,Estados Unidos,About The TeamThe Zillow Group Talent Manageme...,,,2
2,https://www.linkedin.com/jobs/view/health-educ...,Remote,Health Educator/Training + Design Specialist,Essential Access Health,"Los Angeles, Califórnia, Estados Unidos",Job DescriptionGENERAL DUTIES:The Training + D...,,,2
3,https://www.linkedin.com/jobs/view/director-pe...,On-site,"Director, People Success Partner, GTM",Toast,Estados Unidos,"Director, People Success Partner (HRBP)Toast i...",,,0
4,https://www.linkedin.com/jobs/view/human-resou...,On-site,Human Resources (HR) Assistant,Intellyk Inc.,"Los Angeles, Califórnia, Estados Unidos",Job DescriptionJob Title:HR AssistantShift: 1s...,,,0
5,https://www.linkedin.com/jobs/view/legal-assis...,On-site,Legal Assistant,"Basheer Law Firm, P.C.","Sugar Land, Texas, Estados Unidos",Job DescriptionThis is a full-time on-site rol...,,,0
6,https://www.linkedin.com/jobs/view/administrat...,On-site,Not available,Not available,Not available,Not available,,,0
7,https://www.linkedin.com/jobs/view/operations-...,On-site,Operations Manager,Pepper Foster Consulting,Portland e Região,About UsPepper Foster Consulting is a strategy...,,,0
8,https://www.linkedin.com/jobs/view/junior-reac...,Hybrid,Junior React Developer,Team Remotely Inc,"Charlotte, Carolina do Norte, Estados Unidos","Junior React Developer(1 year experience, hybr...",,,1
9,https://www.linkedin.com/jobs/view/hr-operatio...,Remote,HR Operations Assistant,"Cprime, Inc",Estados Unidos,"A Goldman Sachs | Everstone company, Cprime is...",,,2


In [3]:
df['word_count'] = df['job_description'].apply(lambda x: len(x.split()))
mean_word_count = df['word_count'].mean()
print(f"The mean number of words in 'job_description' is: {mean_word_count}")

The mean number of words in 'job_description' is: 397.164314516129


In [None]:
# Load BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
def summarize_text(text):
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=30, max_length=200, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


df['about_sum'] = df['job_description'].apply(summarize_text)
print(df[['job_description', 'about_sum']])

In [6]:
df['word_count'] = df['about_sum'].apply(lambda x: len(x.split()))
mean_word_count = df['word_count'].mean()
print(f"The mean number of words in 'about_sum' is: {mean_word_count}")

The mean number of words in 'about_sum' is: 38.685483870967744


In [7]:
X = df['about_sum']
y = df['label']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train)

# Train Support Vector Machine model
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

print("Logistic Regression Model:")
evaluate_model(lr_model, X_test_tfidf, y_test)

print("Random Forest Model:")
evaluate_model(rf_model, X_test_tfidf, y_test)

print("Support Vector Machine Model:")
evaluate_model(svm_model, X_test_tfidf, y_test)


Logistic Regression Model:
Accuracy: 0.93
Random Forest Model:
Accuracy: 0.96
Support Vector Machine Model:
Accuracy: 0.95


In [13]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

tokenized = df['about_sum'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
max_len = max(map(len, tokenized))
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

labels = torch.tensor(df['label'])
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(np.where(padded != 0, 1, 0))

train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_mask, input_ids, random_state=42, test_size=0.2)

batch_size = 32

# Create DataLoader for training and testing sets
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    true_labels_train = []
    predicted_labels_train = []
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        logits = outputs.logits
        _, predicted_labels = torch.max(logits, 1)
        true_labels_train.extend(b_labels.cpu().numpy())
        predicted_labels_train.extend(predicted_labels.cpu().numpy())

    avg_train_loss = total_train_loss / len(train_dataloader)

    train_accuracy = accuracy_score(true_labels_train, predicted_labels_train)
    train_f1 = f1_score(true_labels_train, predicted_labels_train, average='macro')
    
    model.eval()
    predictions = []
    true_labels = []
    
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.append(np.argmax(logits, axis=1))
        true_labels.append(label_ids)

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)

    # Calculate accuracy and F1 score 
    test_accuracy = accuracy_score(true_labels, predictions)
    test_f1 = f1_score(true_labels, predictions, average='macro')

    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Training F1 Score: {train_f1:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}')

    
tokenized_original = df['about_sum'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

padded_original = np.array([i + [0]*(max_len-len(i)) for i in tokenized_original.values])

input_ids_original = torch.tensor(padded_original)
attention_mask_original = torch.tensor(np.where(padded_original != 0, 1, 0))

original_data = TensorDataset(input_ids_original, attention_mask_original)
original_sampler = SequentialSampler(original_data)
original_dataloader = DataLoader(original_data, sampler=original_sampler, batch_size=batch_size)

probabilities_per_row = []

# Predict probabilities for each row
model.eval()
for batch in original_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()
    probabilities_per_row.extend(probabilities)

# Add the probabilities to the original DataFrame
df['probabilities'] = probabilities_per_row
print(df[['about_sum', 'probabilities']])

# Sanity check
example_text = "Remote work allows individuals to perform their job duties and tasks from a location outside of a traditional office environment, often using digital communication tools and technologies to collaborate with colleagues and complete work assignments efficiently."
encoded_text = tokenizer.encode_plus(example_text, add_special_tokens=True, max_length=max_len, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
input_ids = encoded_text['input_ids'].to(device)
attention_mask = encoded_text['attention_mask'].to(device)
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1).squeeze().cpu().numpy()
print("Confidence for being compatible for remote work:")
print(f"On-site: {probabilities[0]}")
print(f"Hybrid: {probabilities[1]}")
print(f"Remote: {probabilities[2]}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Training Loss: 0.7608, Training Accuracy: 0.6494, Training F1 Score: 0.3220, Test Accuracy: 0.6432, Test F1 Score: 0.2610
Epoch 2/10, Training Loss: 0.5291, Training Accuracy: 0.8285, Training F1 Score: 0.6105, Test Accuracy: 0.8995, Test F1 Score: 0.7177
Epoch 3/10, Training Loss: 0.3219, Training Accuracy: 0.9319, Training F1 Score: 0.7883, Test Accuracy: 0.9246, Test F1 Score: 0.8371
Epoch 4/10, Training Loss: 0.1950, Training Accuracy: 0.9609, Training F1 Score: 0.8359, Test Accuracy: 0.9347, Test F1 Score: 0.8920
Epoch 5/10, Training Loss: 0.1134, Training Accuracy: 0.9760, Training F1 Score: 0.8962, Test Accuracy: 0.9548, Test F1 Score: 0.9288
Epoch 6/10, Training Loss: 0.0747, Training Accuracy: 0.9823, Training F1 Score: 0.9243, Test Accuracy: 0.9648, Test F1 Score: 0.9711
Epoch 7/10, Training Loss: 0.0442, Training Accuracy: 0.9924, Training F1 Score: 0.9705, Test Accuracy: 0.9598, Test F1 Score: 0.9522
Epoch 8/10, Training Loss: 0.0332, Training Accuracy: 0.9950, 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


                                             about_sum  \
0    Cavaliers Holdings LLC is committed to deliver...   
1    The Instructional Designer partners closely wi...   
2    The Training + Design Specialist will be a mem...   
3    Director, People Success Partner (HRBP) Toast ...   
4    The Human Resources Assistant will directly as...   
..                                                 ...   
987  Leidos is seeking full-timeFHA Customer Servic...   
988  Magnet Medical is seeking a travel nurse RN Me...   
989  Actalent Services is seeking a CAD Designer wi...   
990  Actalent is hiring drafters and designers for ...   
991  Sunstates Security is currently hiring Securit...   

                                probabilities  
0      [0.009946599, 0.008302159, 0.98175126]  
1        [0.003748431, 0.004763612, 0.991488]  
2      [0.0036416485, 0.005002637, 0.9913557]  
3    [0.99462175, 0.0029507857, 0.0024274262]  
4     [0.99503917, 0.0026336508, 0.002327112]  
..             



Confidence for being compatible for remote work:
On-site: 0.05056440830230713
Hybrid: 0.00716816633939743
Remote: 0.9422674179077148


In [17]:
df['probabilities'] = df['probabilities'].apply(lambda x: list(map(float, x.strip('[]').split())))
df['On-site Score'] = df['probabilities'].apply(lambda x: round(x[0], 4))
df['Hybrid Score'] = df['probabilities'].apply(lambda x: round(x[1], 4))
df['Remote Score'] = df['probabilities'].apply(lambda x: round(x[2], 4))
df.drop('probabilities', axis=1, inplace=True)
df

Unnamed: 0,url,work_model,dict.job_title,dict.company_name,dict.location,job_description,dict.seniority_level,dict.employment_type,label,about_sum,word_count,On-site Score,Hybrid Score,Remote Score
0,https://www.linkedin.com/jobs/view/coordinator...,Remote,"Coordinator, Digital Content",Cleveland Cavaliers,"Cleveland, Ohio, Estados Unidos",Cavaliers Holdings LLC is committed to deliver...,,,2,Cavaliers Holdings LLC is committed to deliver...,47,0.173362,0.181677,0.644962
1,https://www.linkedin.com/jobs/view/instruction...,Remote,Instructional Designer,Zillow,Estados Unidos,About The TeamThe Zillow Group Talent Manageme...,,,2,The Instructional Designer partners closely wi...,47,0.198470,0.142221,0.659308
2,https://www.linkedin.com/jobs/view/health-educ...,Remote,Health Educator/Training + Design Specialist,Essential Access Health,"Los Angeles, Califórnia, Estados Unidos",Job DescriptionGENERAL DUTIES:The Training + D...,,,2,The Training + Design Specialist will be a mem...,44,0.133654,0.198731,0.667615
3,https://www.linkedin.com/jobs/view/director-pe...,On-site,"Director, People Success Partner, GTM",Toast,Estados Unidos,"Director, People Success Partner (HRBP)Toast i...",,,0,"Director, People Success Partner (HRBP) Toast ...",40,0.630245,0.174565,0.195190
4,https://www.linkedin.com/jobs/view/human-resou...,On-site,Human Resources (HR) Assistant,Intellyk Inc.,"Los Angeles, Califórnia, Estados Unidos",Job DescriptionJob Title:HR AssistantShift: 1s...,,,0,The Human Resources Assistant will directly as...,47,0.613476,0.195400,0.191124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,https://www.linkedin.com/jobs/view/fha-mortgag...,Remote,FHA Mortgage Origination - Customer Service,Leidos,United States,DescriptionLeidos is seeking full-timeFHA Cust...,Entry level,Full-time,2,Leidos is seeking full-timeFHA Customer Servic...,25,0.210612,0.184939,0.604449
988,https://www.linkedin.com/jobs/view/travel-rn-m...,On-site,Travel RN - Med Surg / Telemetry,Magnet Medical,"Manchester, CT",Magnet Medical is seeking a travel nurse RN Me...,Mid-Senior level,Part-time,0,Magnet Medical is seeking a travel nurse RN Me...,33,0.741238,0.152513,0.106249
989,https://www.linkedin.com/jobs/view/cad-drafter...,On-site,Cad Drafter,Actalent,"Bartlett, IL",Description:Actalent Services is seeking a CAD...,Entry level,Full-time,0,Actalent Services is seeking a CAD Designer wi...,29,0.668199,0.135343,0.196458
990,https://www.linkedin.com/jobs/view/cad-drafter...,On-site,Cad Drafter,Actalent,"Bartlett, IL",Actalent is currently hiring drafters and desi...,Entry level,Full-time,0,Actalent is hiring drafters and designers for ...,25,0.589532,0.188401,0.222067


In [18]:
df.to_csv('Remote_scored_data.csv', index=False)
