In [83]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

In [90]:
# Load Data
job_df = pd.read_csv('jobs_data.csv')
course_df = pd.read_csv('courses_data.csv')

### Vectorization Using Word2Vec

In [91]:
# Preprocessing job tokens
job_df['Skill_Set'] = job_df['Skill_Set'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else []
)
job_df['Skill_Set'] = job_df['Skill_Set'].apply(
    lambda x: [s.strip().lower() for s in x if isinstance(s, str)]
)
job_df['Subcategory'] = job_df['Subcategory'].fillna('').str.lower().str.split()
job_df['tokens'] = job_df['Skill_Set'] + job_df['Subcategory']
job_df['tokens'] = job_df['tokens'].apply(
    lambda tokens: [t for t in tokens if re.match(r'^[a-zA-Z\s\-]+$', t.strip()) and t.strip()]
)

In [None]:
# Preprocessing course tokens
course_df['Skills'] = course_df['Skills'].fillna('')
course_df['tokens'] = course_df.apply(
    lambda row: [s.strip().lower() for s in row['Skills'].split(',') if s.strip()],
    axis=1
)
course_df['tokens'] = course_df['tokens'].apply(
    lambda tokens: [t for t in tokens if re.match(r'^[a-zA-Z\s\-]+$', t.strip()) and t.strip()]
)

In [94]:
# Word2Vec Training
sentences = job_df['tokens'].tolist() + course_df['tokens'].tolist()
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def get_mean_vector(tokens, model):
    valid_tokens = [token for token in tokens if token in model.wv]
    if not valid_tokens:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[valid_tokens], axis=0)

In [None]:
# Embedding Vectorization
# Convert tokenized text in both datasets into numerical vector representations using the mean of Word2Vec embeddings
job_df['embedding'] = job_df['tokens'].apply(lambda x: get_mean_vector(x, w2v_model))
course_df['embedding'] = course_df['tokens'].apply(lambda x: get_mean_vector(x, w2v_model))

job_df_out = job_df.copy()
job_df_out['embedding'] = job_df_out['embedding'].apply(lambda x: ','.join(map(str, x)))
job_df_out.to_csv("job_vectorization.csv", index=False)

course_df_out = course_df.copy()
course_df_out['embedding'] = course_df_out['embedding'].apply(lambda x: ','.join(map(str, x)))
course_df_out.to_csv("course_vectorization.csv", index=False)

### Similarity Mapping

In [None]:
# Similarity Mapping (Top 20 for each Course)
similarity_results = []

for _, course_row in course_df.iterrows():
    course_title = course_row['Title']
    course_vec = course_row['embedding'].reshape(1, -1)

    job_embeddings = np.vstack(job_df['embedding'].values)
    similarities = cosine_similarity(course_vec, job_embeddings)[0]

    top_indices = similarities.argsort()[::-1][:20]

    for job_idx in top_indices:
        similarity_results.append({
            'Course_Title': course_title,
            'Job_Title': job_df.iloc[job_idx]['Title'],
            'Job_Company': job_df.iloc[job_idx].get('Company', ''),
            'Job_Category': job_df.iloc[job_idx]['Category'],
            'Job_Subcategory': ' '.join(job_df.iloc[job_idx]['Subcategory']) if isinstance(job_df.iloc[job_idx]['Subcategory'], list) else job_df.iloc[job_idx]['Subcategory'],
            'Job_Skills': '; '.join(job_df.iloc[job_idx]['Skill_Set']),
            'Similarity_Score': round(similarities[job_idx], 4)
        })

similarity_df = pd.DataFrame(similarity_results)
similarity_df.to_csv("course_job_similarity_mapping_top20.csv", index=False)

# Similarity Mapping (All Pair)
similarity_results = []

for _, course_row in course_df.iterrows():
    course_title = course_row['Title']
    course_vec = course_row['embedding'].reshape(1, -1)

    job_embeddings = np.vstack(job_df['embedding'].values)
    similarities = cosine_similarity(course_vec, job_embeddings)[0]

    for job_idx, sim_score in enumerate(similarities):
        similarity_results.append({
            'Course_Title': course_title,
            'Job_Title': job_df.iloc[job_idx]['Title'],
            'Job_Category': job_df.iloc[job_idx]['Category'],
            'Job_Subcategory': job_df.iloc[job_idx]['Subcategory'],
            'Job_Skills': '; '.join(job_df.iloc[job_idx]['Skill_Set']),
            'Similarity_Score': round(sim_score, 4)
        })
similarity_df = pd.DataFrame(similarity_results)
similarity_df.to_csv("course_job_similarity_mapping_all.csv", index=False)

### Recommendation System

In [98]:
def recommend_jobs_from_course(course_title, top_n=5, selected_category=None, use_category_filter=True):
    course_row = course_df[course_df['Title'] == course_title]
    if course_row.empty:
        print(f"Course '{course_title}' not found!")
        return None

    course_vec = course_row.iloc[0]['embedding'].reshape(1, -1)

    if use_category_filter and selected_category:
        filtered_jobs = job_df[job_df['Category'].str.lower() == selected_category.lower()]
        if filtered_jobs.empty:
            print(f"No jobs found in the category '{selected_category}'!")
            return None
    else:
        filtered_jobs = job_df

    job_embeddings = np.vstack(filtered_jobs['embedding'].values)
    similarities = cosine_similarity(course_vec, job_embeddings)[0]

    top_indices = similarities.argsort()[::-1][:top_n]
    recommended_jobs = filtered_jobs.iloc[top_indices].copy()
    recommended_jobs['Similarity_Score'] = similarities[top_indices]
    recommended_jobs['Similarity_Score'] = recommended_jobs['Similarity_Score'].round(4)

    recommended_jobs['Course_Title'] = course_title

    return recommended_jobs[[
        'Course_Title',
        'Title',
        'Company',
        'Category'
    ]]

### Example 1: Without Category Filter

In [110]:
result = recommend_jobs_from_course(
    course_title="Computer Security and Systems Management",
    top_n=5,
    use_category_filter=False
)
result

Unnamed: 0,Course_Title,Title,Company,Category
1084,Computer Security and Systems Management,Dosen Teknologi Informasi,Politeknik Gajah Tunggal,Pendidikan Dan Pelatihan
328,Computer Security and Systems Management,IT Programmer,Gui Group,IT / Information Technology
1634,Computer Security and Systems Management,Staff Backend Programmer,PT Jaringan Automasi Digital Indonesia,IT / Information Technology
1425,Computer Security and Systems Management,Staff Teknisi CCTV,PT Fokus Vision Teknologi,IT / Information Technology
455,Computer Security and Systems Management,Guru Teknik Komputer dan Jaringan,SMK Media Informatika,Pendidikan Dan Pelatihan


### Example 2: Using Category Filter

In [113]:
result = recommend_jobs_from_course(
    course_title="Digital Marketing",
    selected_category="Marketing / Pemasaran",
    top_n=5,
    use_category_filter=True
)
result

Unnamed: 0,Course_Title,Title,Company,Category
1656,Digital Marketing,Marketing Executive,PT Labodia Prima,Marketing / Pemasaran
984,Digital Marketing,Social Media Specialist,CV Habeha Multimedia,Marketing / Pemasaran
1581,Digital Marketing,Marketing,PT Indogc Ceramic International,Marketing / Pemasaran
2009,Digital Marketing,SEO & SEM Advertiser,PT Centrum Bazaar Nusantara,Marketing / Pemasaran
484,Digital Marketing,Marketing Konstruksi,Royal Stone Jakarta,Marketing / Pemasaran
