### IMPORT LIBRARY

In [5]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
import time
import openpyxl
from openpyxl.styles import Alignment, PatternFill
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# TF-IDF

In [6]:
def preprocess_text_simple(text):
    """Normalize free text for vectorization.

    Missing values (anything ``pd.isna`` reports as missing) become an empty
    string. Any other value is converted to ``str``, lower-cased, stripped of
    every character in ``string.punctuation`` and trimmed of surrounding
    whitespace.

    Args:
        text: Raw text; non-string scalars (e.g. numbers read from a CSV)
            are accepted and stringified instead of raising.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # string.punctuation already contains '*', so a separate asterisk-removal
    # pass is unnecessary once the translation table has been applied.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return str(text).lower().translate(punctuation_table).strip()

def remove_asterisks(text):
    """Delete every ``*`` from ``text``; missing values are returned untouched."""
    return text if pd.isna(text) else re.sub(r'\*+', '', text)

def load_and_preprocess_job_data(file_path):
    """Read the job CSV and build the cleaned ``Combined`` text column.

    The title is stripped of asterisks, then title, ``description_x`` and
    ``skills_desc`` are joined with single spaces (missing parts count as
    empty strings) and normalized with ``preprocess_text_simple``. Every
    remaining missing value in the frame is replaced by ``"Unknown"``.

    Returns:
        The processed DataFrame with a fresh 0..n-1 index.
    """
    jobs = pd.read_csv(file_path)
    jobs['title'] = jobs['title'].apply(remove_asterisks)

    # Pull the three text sources only after the title has been cleaned.
    title, description, skills = (
        jobs[column].fillna('') for column in ('title', 'description_x', 'skills_desc')
    )
    jobs['Combined'] = (title + ' ' + description + ' ' + skills).apply(preprocess_text_simple)

    return jobs.fillna("Unknown").reset_index(drop=True)

def vectorize_text(df):
    """Fit a TF-IDF vectorizer (English stop words removed) on ``df['Combined']``.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted vectorizer and the
        matrix produced by ``fit_transform`` on the ``Combined`` column.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf, tfidf.fit_transform(df['Combined'])

def recommend_job(user_input, df, vectorizer, tfidf_matrix, experience_levels=None, work_types=None, name=None, percentile=95):
    """Return the jobs most similar to ``user_input`` under TF-IDF cosine similarity.

    Args:
        user_input: Free-text description of what the user is looking for.
        df: Job DataFrame; row ``i`` is assumed to correspond to row ``i`` of
            ``tfidf_matrix`` (same order as when the matrix was built).
        vectorizer: Fitted vectorizer used to transform the query.
        tfidf_matrix: Document matrix with one row per row of ``df``.
        experience_levels: Optional collection of allowed
            ``formatted_experience_level`` values; falsy means no filter.
        work_types: Optional collection of allowed ``formatted_work_type``
            values; falsy means no filter.
        name: Optional exact ``name`` value to keep; falsy or ``'All'``
            means no filter.
        percentile: Percentile (0-100) of the strictly positive similarities
            used as the cut-off. Defaults to 95, the previous fixed value.

    Returns:
        A DataFrame of the jobs whose similarity is at or above the cut-off,
        sorted by descending similarity, re-indexed from 0 and carrying a
        ``cosine_similarity`` column; ``None`` when no job passes the filters
        or no job has a positive similarity.
    """
    # Build one boolean mask instead of copying and re-slicing the DataFrame.
    keep = np.ones(len(df), dtype=bool)
    if experience_levels:
        keep &= df['formatted_experience_level'].isin(experience_levels).to_numpy()
    if work_types:
        keep &= df['formatted_work_type'].isin(work_types).to_numpy()
    if name and name != 'All':
        keep &= (df['name'] == name).to_numpy()

    # Row *positions* (not index labels) are what the matrix must be sliced by,
    # so this stays correct even if df does not carry a 0..n-1 index.
    row_positions = np.flatnonzero(keep)
    if row_positions.size == 0:
        return None

    user_tfidf = vectorizer.transform([preprocess_text_simple(user_input)])
    similarities = cosine_similarity(user_tfidf, tfidf_matrix[row_positions]).flatten()

    positive = similarities > 0
    if not positive.any():
        return None

    # The cut-off is computed over positive scores only so that the many
    # zero-similarity documents do not drag the percentile down.
    threshold = np.percentile(similarities[positive], percentile)
    selected = np.flatnonzero(similarities >= threshold)
    selected = selected[np.argsort(similarities[selected])[::-1]]

    top_jobs = df.iloc[row_positions[selected]].copy()
    top_jobs.reset_index(drop=True, inplace=True)
    top_jobs['cosine_similarity'] = similarities[selected]

    return top_jobs

def process_test_cases_and_save(test_cases_file, job_data_file, output_file):
    """Run every test case through the TF-IDF recommender and write an Excel report.

    The test-case CSV must have a ``Case`` column. For each case, up to ten
    recommended job titles and their rounded similarity scores are written to
    a sheet named "Recommendations"; each case always occupies ten rows, with
    the case number and text merged vertically across them.

    Args:
        test_cases_file: Path of the CSV holding the test cases.
        job_data_file: Path of the job CSV to load and vectorize.
        output_file: Path of the ``.xlsx`` file to write.
    """
    cases = pd.read_csv(test_cases_file)

    df = load_and_preprocess_job_data(job_data_file)
    vectorizer, tfidf_matrix = vectorize_text(df)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Recommendations"

    for col, header in enumerate(('No', 'User Preferences', 'Recommendations', 'Scores'), start=1):
        ws.cell(row=1, column=col, value=header)

    rows_per_case = 10
    block_start = 2
    for idx, case in cases.iterrows():
        user_input = case['Case']

        # Empty filter lists and 'All' mean: search the complete job set.
        recommendations = recommend_job(user_input, df, vectorizer, tfidf_matrix, [], [], 'All')

        if recommendations is None or recommendations.empty:
            rec_titles, scores = ["No relevant jobs found"], ["N/A"]
        else:
            rec_titles = recommendations['title'].tolist()[:rows_per_case]
            scores = recommendations['cosine_similarity'].round(4).tolist()[:rows_per_case]

        ws.cell(row=block_start, column=1, value=idx + 1)
        ws.cell(row=block_start, column=2, value=user_input)

        # Pad with empty strings so every case fills exactly ten rows.
        filler = [""] * (rows_per_case - len(rec_titles))
        for offset, (rec, score) in enumerate(zip(rec_titles + filler, scores + filler)):
            ws.cell(row=block_start + offset, column=3, value=rec)
            ws.cell(row=block_start + offset, column=4, value=score)
        block_start += rows_per_case

    # Merge the number and preference cells over each ten-row block.
    for first_row in range(2, ws.max_row, 10):
        for column in (1, 2):
            ws.merge_cells(start_row=first_row, start_column=column, end_row=first_row + 9, end_column=column)

    header_fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
    for header_cell in ws[1]:
        header_cell.fill = header_fill
        header_cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

    for sheet_row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        for body_cell in sheet_row:
            body_cell.alignment = Alignment(vertical='top', wrap_text=True)

    # Adjust column widths
    for letter, width in (('A', 5), ('B', 50), ('C', 50), ('D', 15)):
        ws.column_dimensions[letter].width = width

    wb.save(output_file)
    print(f"Recommendations saved to '{output_file}'")

# Entry point for the TF-IDF experiment: reads the test cases and the job
# data, then writes the recommendations to an Excel workbook.
if __name__ == "__main__":
    # Input CSVs (Kaggle dataset paths) and the workbook to create.
    test_cases_file = '/kaggle/input/cleans/cleaned_test_case_dataset.csv'
    job_data_file = '/kaggle/input/linkedin4/Tahap1_LinkedIn4.csv'
    output_file = 'recommendations.xlsx'
    
    # Loading, vectorizing, recommending and saving all happen inside this call.
    process_test_cases_and_save(test_cases_file, job_data_file, output_file)

Recommendations saved to 'recommendations.xlsx'


# Word2Vec

In [7]:
# Only `word_tokenize` is imported from nltk in the import cell, so the
# package itself has to be imported before `nltk.download` can be called.
import nltk

# Fetch the 'punkt' tokenizer data that `word_tokenize` relies on.
nltk.download('punkt')

def preprocess_text_simple(text):
    """Normalize free text before tokenization and embedding.

    Missing values (anything ``pd.isna`` reports as missing) become an empty
    string. Any other value is converted to ``str``, lower-cased, stripped of
    every character in ``string.punctuation`` and trimmed of surrounding
    whitespace.

    Args:
        text: Raw text; non-string scalars (e.g. numbers read from a CSV)
            are accepted and stringified instead of raising.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # string.punctuation already contains '*', so a separate asterisk-removal
    # pass is unnecessary once the translation table has been applied.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return str(text).lower().translate(punctuation_table).strip()

def remove_asterisks(text):
    """Strip all asterisks from ``text``, passing missing values through as-is."""
    is_missing = pd.isna(text)
    if not is_missing:
        return re.sub(r'\*+', '', text)
    return text

def load_and_preprocess_job_data(file_path):
    """Read the job CSV, de-duplicate it and derive the text columns for Word2Vec.

    Rows that repeat the same company, title, description, location and URL
    are dropped. The title is stripped of asterisks; title, ``description_x``
    and ``skills_desc`` are joined with single spaces (missing parts count as
    empty strings) and normalized into ``Combined``, which is then tokenized
    into ``Tokenized``. Remaining missing values become ``"Unknown"``.

    Returns:
        The processed DataFrame with a fresh 0..n-1 index.
    """
    duplicate_keys = ['company_id', 'title', 'description_x', 'location', 'url']
    jobs = pd.read_csv(file_path).drop_duplicates(subset=duplicate_keys)
    jobs['title'] = jobs['title'].apply(remove_asterisks)

    # Pull the three text sources only after the title has been cleaned.
    title, description, skills = (
        jobs[column].fillna('') for column in ('title', 'description_x', 'skills_desc')
    )
    jobs['Combined'] = (title + ' ' + description + ' ' + skills).apply(preprocess_text_simple)
    jobs['Tokenized'] = jobs['Combined'].apply(word_tokenize)

    return jobs.fillna("Unknown").reset_index(drop=True)

def train_word2vec(df):
    """Train a Word2Vec model on the token lists in ``df['Tokenized']``.

    Uses 100-dimensional vectors, a context window of 5, keeps every word
    (``min_count=1``) and trains with 4 worker threads.
    """
    hyperparameters = dict(vector_size=100, window=5, min_count=1, workers=4)
    return Word2Vec(sentences=df['Tokenized'], **hyperparameters)

def get_document_vector(doc, model):
    """Embed ``doc`` as the mean of the Word2Vec vectors of its known tokens.

    Tokens that are absent from ``model.wv`` are ignored. When no token is
    known, a zero vector of length ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in word_tokenize(doc):
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def vectorize_text(df, model):
    """Return an array with one Word2Vec document vector per ``df['Combined']`` entry."""
    embeddings = []
    for text in df['Combined']:
        embeddings.append(get_document_vector(text, model))
    return np.array(embeddings)

def recommend_job(user_input, df, model, doc_vectors, experience_levels=None, work_types=None, name=None, percentile=95):
    """Return the jobs most similar to ``user_input`` under Word2Vec cosine similarity.

    Args:
        user_input: Free-text description of what the user is looking for.
        df: Job DataFrame; row ``i`` is assumed to correspond to row ``i`` of
            ``doc_vectors`` (same order as when the vectors were built).
        model: Word2Vec model used to embed the query.
        doc_vectors: Array with one document vector per row of ``df``.
        experience_levels: Optional collection of allowed
            ``formatted_experience_level`` values; falsy means no filter.
        work_types: Optional collection of allowed ``formatted_work_type``
            values; falsy means no filter.
        name: Optional exact ``name`` value to keep; falsy or ``'All'``
            means no filter.
        percentile: Percentile (0-100) of the strictly positive similarities
            used as the cut-off. Defaults to 95, the previous fixed value.

    Returns:
        A DataFrame of the jobs whose similarity is at or above the cut-off,
        sorted by descending similarity, re-indexed from 0 and carrying a
        ``cosine_similarity`` column; ``None`` when no job passes the filters
        or no job has a positive similarity.
    """
    # Build one boolean mask instead of copying and re-slicing the DataFrame.
    keep = np.ones(len(df), dtype=bool)
    if experience_levels:
        keep &= df['formatted_experience_level'].isin(experience_levels).to_numpy()
    if work_types:
        keep &= df['formatted_work_type'].isin(work_types).to_numpy()
    if name and name != 'All':
        keep &= (df['name'] == name).to_numpy()

    # Row *positions* (not index labels) are what the vector array must be
    # sliced by, so this stays correct even without a 0..n-1 index on df.
    row_positions = np.flatnonzero(keep)
    if row_positions.size == 0:
        return None

    user_vector = get_document_vector(preprocess_text_simple(user_input), model)
    similarities = cosine_similarity([user_vector], doc_vectors[row_positions]).flatten()

    positive = similarities > 0
    if not positive.any():
        return None

    # The cut-off is computed over positive scores only.
    threshold = np.percentile(similarities[positive], percentile)
    selected = np.flatnonzero(similarities >= threshold)
    selected = selected[np.argsort(similarities[selected])[::-1]]

    top_jobs = df.iloc[row_positions[selected]].copy()
    top_jobs.reset_index(drop=True, inplace=True)
    top_jobs['cosine_similarity'] = similarities[selected]

    return top_jobs

def process_test_cases_and_save(test_cases_file, job_data_file, output_file, model, doc_vectors):
    """Run every test case through the Word2Vec recommender and write an Excel report.

    The test-case CSV must have a ``Case`` column. For each case, up to ten
    recommended job titles and their rounded similarity scores are written to
    a sheet named "Recommendations"; each case always occupies ten rows, with
    the case number and text merged vertically across them.

    Args:
        test_cases_file: Path of the CSV holding the test cases.
        job_data_file: Path of the job CSV; it is loaded again here, so
            ``doc_vectors`` must have been built from the same file.
        output_file: Path of the ``.xlsx`` file to write.
        model: Word2Vec model used to embed each query.
        doc_vectors: Precomputed document vectors for the job data.
    """
    cases = pd.read_csv(test_cases_file)

    df = load_and_preprocess_job_data(job_data_file)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Recommendations"

    for col, header in enumerate(('No', 'User Preferences', 'Recommendations', 'Scores'), start=1):
        ws.cell(row=1, column=col, value=header)

    rows_per_case = 10
    block_start = 2
    for idx, case in cases.iterrows():
        user_input = case['Case']

        # Empty filter lists and 'All' mean: search the complete job set.
        recommendations = recommend_job(user_input, df, model, doc_vectors, [], [], 'All')

        if recommendations is None or recommendations.empty:
            rec_titles, scores = ["No relevant jobs found"], ["N/A"]
        else:
            rec_titles = recommendations['title'].tolist()[:rows_per_case]
            scores = recommendations['cosine_similarity'].round(4).tolist()[:rows_per_case]

        ws.cell(row=block_start, column=1, value=idx + 1)
        ws.cell(row=block_start, column=2, value=user_input)

        # Pad with empty strings so every case fills exactly ten rows.
        filler = [""] * (rows_per_case - len(rec_titles))
        for offset, (rec, score) in enumerate(zip(rec_titles + filler, scores + filler)):
            ws.cell(row=block_start + offset, column=3, value=rec)
            ws.cell(row=block_start + offset, column=4, value=score)
        block_start += rows_per_case

    # Merge the number and preference cells over each ten-row block.
    for first_row in range(2, ws.max_row, 10):
        for column in (1, 2):
            ws.merge_cells(start_row=first_row, start_column=column, end_row=first_row + 9, end_column=column)

    header_fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
    for header_cell in ws[1]:
        header_cell.fill = header_fill
        header_cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

    for sheet_row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        for body_cell in sheet_row:
            body_cell.alignment = Alignment(vertical='top', wrap_text=True)

    for letter, width in (('A', 5), ('B', 50), ('C', 50), ('D', 15)):
        ws.column_dimensions[letter].width = width

    wb.save(output_file)
    print(f"Recommendations saved to '{output_file}'")

# Entry point for the Word2Vec experiment: trains the embedding model on the
# job data, embeds every job, then writes the recommendations to Excel.
if __name__ == "__main__":
    # Input CSVs (Kaggle dataset paths) and the workbook to create.
    test_cases_file = '/kaggle/input/cleans/cleaned_test_case_dataset.csv'
    job_data_file = '/kaggle/input/joblinkedin/linkedin.csv'
    output_file = 'recommendationsword2vec.xlsx'
    
    # Train on the tokenized jobs and embed each job's Combined text.
    df = load_and_preprocess_job_data(job_data_file)
    word2vec_model = train_word2vec(df)
    doc_vectors = vectorize_text(df, word2vec_model)
    
    # The callee reloads job_data_file itself; doc_vectors is reused as-is.
    process_test_cases_and_save(test_cases_file, job_data_file, output_file, word2vec_model, doc_vectors)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Recommendations saved to 'recommendationsword2vec.xlsx'


# BERT

In [8]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder once at module
# level; get_bert_embedding() below reads both as globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def preprocess_text_simple(text):
    """Normalize free text before it is embedded with BERT.

    Missing values (anything ``pd.isna`` reports as missing) become an empty
    string. Any other value is converted to ``str``, lower-cased, stripped of
    every character in ``string.punctuation`` and trimmed of surrounding
    whitespace.

    Args:
        text: Raw text; non-string scalars (e.g. numbers read from a CSV)
            are accepted and stringified instead of raising.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # string.punctuation already contains '*', so a separate asterisk-removal
    # pass is unnecessary once the translation table has been applied.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return str(text).lower().translate(punctuation_table).strip()

def remove_asterisks(text):
    """Return ``text`` without any ``*`` characters (missing values unchanged)."""
    if not pd.isna(text):
        text = re.sub(r'\*+', '', text)
    return text

def load_and_preprocess_job_data(file_path):
    """Read the job CSV, de-duplicate it and build the cleaned ``Combined`` column.

    Rows that repeat the same company, title, description, location and URL
    are dropped. The title is stripped of asterisks; title, ``description_x``
    and ``skills_desc`` are joined with single spaces (missing parts count as
    empty strings) and normalized with ``preprocess_text_simple``. Remaining
    missing values become ``"Unknown"``.

    Returns:
        The processed DataFrame with a fresh 0..n-1 index.
    """
    duplicate_keys = ['company_id', 'title', 'description_x', 'location', 'url']
    jobs = pd.read_csv(file_path).drop_duplicates(subset=duplicate_keys)
    jobs['title'] = jobs['title'].apply(remove_asterisks)

    # Pull the three text sources only after the title has been cleaned.
    title, description, skills = (
        jobs[column].fillna('') for column in ('title', 'description_x', 'skills_desc')
    )
    jobs['Combined'] = (title + ' ' + description + ' ' + skills).apply(preprocess_text_simple)

    return jobs.fillna("Unknown").reset_index(drop=True)

def get_bert_embedding(text):
    """Embed ``text`` as the mean of BERT's last-layer token states.

    The text is tokenized with truncation to 512 tokens and passed through
    the module-level ``model`` without gradient tracking; the token states
    are averaged over the sequence dimension and returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

def vectorize_text(df):
    """Return an array with one BERT embedding per ``df['Combined']`` entry.

    Progress is reported through a tqdm bar because each document requires a
    separate forward pass.
    """
    documents = tqdm(df['Combined'], desc="Vectorizing documents")
    return np.array([get_bert_embedding(text) for text in documents])

def recommend_job(user_input, df, doc_vectors, experience_levels=None, work_types=None, name=None, percentile=95):
    """Return the jobs most similar to ``user_input`` under BERT cosine similarity.

    Args:
        user_input: Free-text description of what the user is looking for.
        df: Job DataFrame; row ``i`` is assumed to correspond to row ``i`` of
            ``doc_vectors`` (same order as when the vectors were built).
        doc_vectors: Array with one document embedding per row of ``df``.
        experience_levels: Optional collection of allowed
            ``formatted_experience_level`` values; falsy means no filter.
        work_types: Optional collection of allowed ``formatted_work_type``
            values; falsy means no filter.
        name: Optional exact ``name`` value to keep; falsy or ``'All'``
            means no filter.
        percentile: Percentile (0-100) of the strictly positive similarities
            used as the cut-off. Defaults to 95, the previous fixed value.

    Returns:
        A DataFrame of the jobs whose similarity is at or above the cut-off,
        sorted by descending similarity, re-indexed from 0 and carrying a
        ``cosine_similarity`` column; ``None`` when no job passes the filters
        or no job has a positive similarity.
    """
    # Build one boolean mask instead of copying and re-slicing the DataFrame.
    keep = np.ones(len(df), dtype=bool)
    if experience_levels:
        keep &= df['formatted_experience_level'].isin(experience_levels).to_numpy()
    if work_types:
        keep &= df['formatted_work_type'].isin(work_types).to_numpy()
    if name and name != 'All':
        keep &= (df['name'] == name).to_numpy()

    # Row *positions* (not index labels) are what the vector array must be
    # sliced by, so this stays correct even without a 0..n-1 index on df.
    row_positions = np.flatnonzero(keep)
    if row_positions.size == 0:
        return None

    user_vector = get_bert_embedding(preprocess_text_simple(user_input))
    similarities = cosine_similarity([user_vector], doc_vectors[row_positions]).flatten()

    positive = similarities > 0
    if not positive.any():
        return None

    # The cut-off is computed over positive scores only.
    threshold = np.percentile(similarities[positive], percentile)
    selected = np.flatnonzero(similarities >= threshold)
    selected = selected[np.argsort(similarities[selected])[::-1]]

    top_jobs = df.iloc[row_positions[selected]].copy()
    top_jobs.reset_index(drop=True, inplace=True)
    top_jobs['cosine_similarity'] = similarities[selected]

    return top_jobs

def process_test_cases_and_save(test_cases_file, job_data_file, output_file, doc_vectors):
    """Run every test case through the BERT recommender and write an Excel report.

    The test-case CSV must have a ``Case`` column. For each case, up to ten
    recommended job titles and their rounded similarity scores are written to
    a sheet named "Recommendations"; each case always occupies ten rows, with
    the case number and text merged vertically across them.

    Args:
        test_cases_file: Path of the CSV holding the test cases.
        job_data_file: Path of the job CSV; it is loaded again here, so
            ``doc_vectors`` must have been built from the same file.
        output_file: Path of the ``.xlsx`` file to write.
        doc_vectors: Precomputed BERT embeddings for the job data.
    """
    cases = pd.read_csv(test_cases_file)

    df = load_and_preprocess_job_data(job_data_file)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Recommendations"

    for col, header in enumerate(('No', 'User Preferences', 'Recommendations', 'Scores'), start=1):
        ws.cell(row=1, column=col, value=header)

    rows_per_case = 10
    block_start = 2
    progress = tqdm(cases.iterrows(), total=len(cases), desc="Processing test cases")
    for idx, case in progress:
        user_input = case['Case']

        # Empty filter lists and 'All' mean: search the complete job set.
        recommendations = recommend_job(user_input, df, doc_vectors, [], [], 'All')

        if recommendations is None or recommendations.empty:
            rec_titles, scores = ["No relevant jobs found"], ["N/A"]
        else:
            rec_titles = recommendations['title'].tolist()[:rows_per_case]
            scores = recommendations['cosine_similarity'].round(4).tolist()[:rows_per_case]

        ws.cell(row=block_start, column=1, value=idx + 1)
        ws.cell(row=block_start, column=2, value=user_input)

        # Pad with empty strings so every case fills exactly ten rows.
        filler = [""] * (rows_per_case - len(rec_titles))
        for offset, (rec, score) in enumerate(zip(rec_titles + filler, scores + filler)):
            ws.cell(row=block_start + offset, column=3, value=rec)
            ws.cell(row=block_start + offset, column=4, value=score)
        block_start += rows_per_case

    # Merge the number and preference cells over each ten-row block.
    for first_row in range(2, ws.max_row, 10):
        for column in (1, 2):
            ws.merge_cells(start_row=first_row, start_column=column, end_row=first_row + 9, end_column=column)

    header_fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")
    for header_cell in ws[1]:
        header_cell.fill = header_fill
        header_cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

    for sheet_row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        for body_cell in sheet_row:
            body_cell.alignment = Alignment(vertical='top', wrap_text=True)

    for letter, width in (('A', 5), ('B', 50), ('C', 50), ('D', 15)):
        ws.column_dimensions[letter].width = width

    wb.save(output_file)
    print(f"Recommendations saved to '{output_file}'")

# Entry point for the BERT experiment: embeds every job with BERT, then
# writes the recommendations for each test case to Excel.
if __name__ == "__main__":
    # Input CSVs (Kaggle dataset paths) and the workbook to create.
    test_cases_file = '/kaggle/input/cleans/cleaned_test_case_dataset.csv'
    job_data_file = '/kaggle/input/joblinkedin/linkedin.csv'
    output_file = 'recommendations_bert.xlsx'
    
    # One embedding per job, computed from the Combined text column.
    df = load_and_preprocess_job_data(job_data_file)
    doc_vectors = vectorize_text(df)
    
    # The callee reloads job_data_file itself; doc_vectors is reused as-is.
    process_test_cases_and_save(test_cases_file, job_data_file, output_file, doc_vectors)

Vectorizing documents: 100%|██████████| 15309/15309 [2:00:16<00:00,  2.12it/s]  
Processing test cases: 100%|██████████| 50/50 [00:13<00:00,  3.77it/s]


Recommendations saved to 'recommendations_bert.xlsx'
