In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import time
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nltk.download('punkt')
nltk.download('stopwords')

model = SentenceTransformer('all-mpnet-base-v2')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/trevor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/trevor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def html_to_string(html_content):
    try:
        # Kiểm tra nếu đầu vào là NaN hoặc không phải chuỗi
        if pd.isna(html_content) or not isinstance(html_content, str):
            return ""
        # Tạo đối tượng BeautifulSoup từ chuỗi HTML
        soup = BeautifulSoup(html_content, 'html.parser')
        # Trích xuất text, loại bỏ thẻ HTML
        text = soup.get_text(separator=' ', strip=True)
        return text
    except Exception as e:
        return f"Đã có lỗi xảy ra: {str(e)}"

In [5]:
def preprocess_text(text):
    if pd.isna(text) or text == '':
        return ''
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

In [None]:
labeled_data = pd.read_csv('data/labeled_data.csv')
job_posting = pd.read_csv('data/job_posting.csv', nrows=4306)
job_posting['description'] = job_posting['description'].apply(html_to_string)

labeled_data['experience_info'] = (
    '- Position:\n' + job_posting['title'].apply(preprocess_text) + '\n\n- Requirements: \n' +
    job_posting['description'].apply(preprocess_text)
)

In [36]:
job_posting_id = job_posting['id']
job_posting_id_to_idx = {key: value for key, value in enumerate(job_posting_id)}
job_posting_idx_to_id = {value: key for key, value in enumerate(job_posting_id)}

In [29]:
job_posting_id_to_idx[:5]

[{0: 950571233},
 {1: 1694753589},
 {2: 2020176787},
 {3: 2148428602},
 {4: 2315295279}]

In [7]:
experience_info_embeddings = model.encode(labeled_data['experience_info'].tolist(), show_progress_bar=True, batch_size=32)
experience_location_embeddings = model.encode(job_posting['location'].fillna('').tolist(), show_progress_bar=True, batch_size=32)

Batches: 100%|██████████| 135/135 [04:15<00:00,  1.90s/it]
Batches: 100%|██████████| 135/135 [00:06<00:00, 21.11it/s]


In [9]:
experience_info_embeddings.shape

(4306, 768)

In [10]:
labeled_data['profile_info'] = (
    '- Previous positions:\n' + labeled_data['headline'].apply(preprocess_text) + '\n\n- Experiences:\n' +
    labeled_data['summary'].apply(preprocess_text)
)

profile_info_embeddings = model.encode(labeled_data['profile_info'].tolist(), show_progress_bar=True, batch_size=32)
candidate_location_embeddings = model.encode(labeled_data['location_prof'].fillna('').tolist(), show_progress_bar=True, batch_size=32)

Batches: 100%|██████████| 135/135 [02:15<00:00,  1.00s/it]
Batches: 100%|██████████| 135/135 [00:08<00:00, 16.60it/s]


In [37]:
profile_id_to_public_id = {key: value for key, value in enumerate(labeled_data['public_id'])}
profile_public_id_to_id = {value: key for key, value in enumerate(labeled_data['public_id'])}

In [28]:
profile_id_to_public_id[:5]

[{0: 'karenpcantor'},
 {1: 'kathe-j-vera-804566113'},
 {2: 'andycarfax'},
 {3: 'carlos-eduardo-valentim'},
 {4: 'carlos-eduardo-valentim'}]

In [11]:

w1_scores = util.cos_sim(profile_info_embeddings, experience_info_embeddings).numpy()
w2_scores = util.cos_sim(candidate_location_embeddings, experience_location_embeddings).numpy()

w1_weight = 0.8
w2_weight = 0.2
final_scores = w1_weight * w1_scores + w2_weight * w2_scores

indices = np.arange(len(labeled_data))
final_similarity_scores = final_scores[indices, indices]

labeled_data['final_similarity_score'] = final_similarity_scores

# 4.5: Normalize Final Similarity Scores to [0, 1] using Min-Max Scaling
print("\nStarting Step 4.5: Normalize Final Similarity Scores to [0, 1]...")
start_time = time.time()

# Compute min and max of final_similarity_score
s_min = labeled_data['final_similarity_score'].min()
s_max = labeled_data['final_similarity_score'].max()

# Avoid division by zero
if s_max == s_min:
    normalized_scores = np.ones(len(labeled_data)) * 0.5  # If all scores are the same, set to 0.5
else:
    normalized_scores = (labeled_data['final_similarity_score'] - s_min) / (s_max - s_min)

labeled_data['normalized_score'] = normalized_scores

# Debug: Check the range of normalized scores
print(f"Min normalized score: {labeled_data['normalized_score'].min():.4f}")
print(f"Max normalized score: {labeled_data['normalized_score'].max():.4f}")

end_time = time.time()
print(f"Step 4.5 completed in {end_time - start_time:.2f} seconds")


Starting Step 4.5: Normalize Final Similarity Scores to [0, 1]...
Min normalized score: 0.0000
Max normalized score: 1.0000
Step 4.5 completed in 0.00 seconds


In [41]:
normalized_scores[0]

np.float32(0.68951225)

In [44]:
import json

top_k = 10
recommendations = []
output_file = 'result.json'

# Duyệt qua từng ứng viên
for candidate_idx in range(final_scores.shape[0]):
    # Lấy public_id của ứng viên
    candidate_id = profile_public_id_to_id.get(candidate_idx, profile_id_to_public_id[candidate_idx])
    
    # Lấy điểm số của ứng viên
    candidate_scores = final_scores[candidate_idx]
    
    # Lấy chỉ số của top-k công việc
    top_k_indices = np.argsort(candidate_scores)[::-1][:top_k]
    
    # Lấy job_id và similarity_score
    top_k_jobs = [
        {
            'job_id': job_posting_idx_to_id.get(j, job_posting_id_to_idx[j]),
            'similarity_score': float(candidate_scores[j])  # Chuyển sang float để lưu JSON
        }
        for j in top_k_indices
    ]
    
    # Thêm vào danh sách recommendations
    recommendations.append({
        'public_id': candidate_id,
        'recommendations': top_k_jobs
    })

# Lưu vào file JSON
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(recommendations, f, ensure_ascii=False, indent=4)

print(f"Đã lưu kết quả vào {output_file}")

Đã lưu kết quả vào result.json
