In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
!pip install datasets
!pip install sentence_transformers datasets
!pip install -U sentence-transformers
!pip install tensorflow_hub tensorflow_text pandas scikit-learn



In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Import dataset

#### load data

In [14]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/dataset/jobstreet_all_job_dataset.csv")
print(df.head(10))

       job_id                                     job_title  \
0  74630583.0              Procurement Executive (Contract)   
1  74660602.0                  Account Executive/ Assistant   
2  74655679.0  Data Analyst - Asset Management, SPX Express   
3  74657624.0                              Service Engineer   
4  74679363.0                          Purchasing Executive   
5  74661023.0                            Accounts Executive   
6  74677110.0                            Accounts Executive   
7  74657915.0                          PURCHASING EXECUTIVE   
8  74662794.0                       Accounts Assistant (AP)   
9  74602737.0                               Admin Assistant   

                                  company  \
0   Coca-Cola Bottlers (Malaysia) Sdn Bhd   
1      Acoustic & Lighting System Sdn Bhd   
2          Shopee Mobile Malaysia Sdn Bhd   
3             Sun Medical Systems Sdn Bhd   
4  Magnet Security & Automation Sdn. Bhd.   
5                 HPMT Industries Sd

In [15]:
# data kotor
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62237 entries, 0 to 62236
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   job_id        62237 non-null  float64
 1   job_title     62237 non-null  object 
 2   company       62237 non-null  object 
 3   descriptions  62237 non-null  object 
 4   location      62237 non-null  object 
 5   category      62237 non-null  object 
 6   subcategory   62237 non-null  object 
 7   role          60187 non-null  object 
 8   type          62237 non-null  object 
 9   salary        28141 non-null  object 
 10  listingDate   62237 non-null  object 
dtypes: float64(1), object(10)
memory usage: 5.2+ MB


Unnamed: 0,0
job_id,0
job_title,0
company,0
descriptions,0
location,0
category,0
subcategory,0
role,2050
type,0
salary,34096


#### cleaning

In [16]:
# Hapus baris yang memiliki nilai kosong pada kolom penting
df_cleaned = df.dropna(subset=['job_id', 'job_title', 'company', 'descriptions', 'location', 'category', 'subcategory', 'type', 'listingDate'])

# Untuk kolom 'role' dan 'salary' yang banyak kosong, bisa isi dengan nilai default atau median (jika numerik)
df_cleaned['role'].fillna('Unknown', inplace=True)
df_cleaned['salary'].fillna('Not specified', inplace=True)

# Mengubah tipe data dari 'listingDate' menjadi 'datetime'
df_cleaned['listingDate'] = pd.to_datetime(df['listingDate'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['role'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['salary'].fillna('Not specified', inplace=True)


In [17]:
# Cek hasil
df_cleaned.info()
df_cleaned.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62237 entries, 0 to 62236
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   job_id        62237 non-null  float64            
 1   job_title     62237 non-null  object             
 2   company       62237 non-null  object             
 3   descriptions  62237 non-null  object             
 4   location      62237 non-null  object             
 5   category      62237 non-null  object             
 6   subcategory   62237 non-null  object             
 7   role          62237 non-null  object             
 8   type          62237 non-null  object             
 9   salary        62237 non-null  object             
 10  listingDate   62237 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(1), object(9)
memory usage: 5.2+ MB


Unnamed: 0,0
job_id,0
job_title,0
company,0
descriptions,0
location,0
category,0
subcategory,0
role,0
type,0
salary,0


In [18]:
# simpan
df_cleaned.to_csv("data_bersih.csv", index=False)

### Machine Learning

In [19]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Import dataset rekomendasi
df_recomendation = pd.read_csv('/content/drive/MyDrive/dataset/dataset_rekomendasi_pekerjaan.csv')

# Bersihkan dan gabungkan teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

df_recomendation['combined_text'] = (df_recomendation['user_text'] + ' ' + df_recomendation['job_text']).apply(clean_text)

# TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.9, min_df=5)
X = vectorizer.fit_transform(df_recomendation['combined_text'])
y = df_recomendation['label']

# Split data dan latih model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# STEP 4: Evaluasi model
y_pred = model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# STEP 5: Fungsi prediksi baru
def predict_match(user_text, job_text):
    combined = clean_text(user_text + ' ' + job_text)
    vector = vectorizer.transform([combined])
    prediction = model.predict(vector)[0]
    probability = model.predict_proba(vector)[0][1]  # Probabilitas cocok
    return prediction, probability

Akurasi: 0.9804772234273319

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.95      0.97       364
         1.0       0.97      1.00      0.98       558

    accuracy                           0.98       922
   macro avg       0.98      0.98      0.98       922
weighted avg       0.98      0.98      0.98       922


Confusion Matrix:
 [[346  18]
 [  0 558]]


In [27]:
def recommend_jobs(user_text, top_n=5):
    # Menyiapkan semua deskripsi pekerjaan
    job_texts = df_recomendation['job_text'].tolist()
    job_ids = df_recomendation.index.tolist()

    # Simpan hasil prediksi
    predictions = []
    for i, job in enumerate(job_texts):
        combined = clean_text(user_text + ' ' + job)
        vector = vectorizer.transform([combined])
        prob = model.predict_proba(vector)[0][1]  # Probabilitas cocok
        predictions.append((i, job, prob))

    # Ambil top-N pekerjaan dengan skor tertinggi
    predictions = sorted(predictions, key=lambda x: x[2], reverse=True)
    top_jobs = predictions[:top_n]

    # Tampilkan hasil
    print(f"\n=== Rekomendasi Pekerjaan Teratas untuk User ===")
    for i, (idx, job_desc, prob) in enumerate(top_jobs, 1):
        title = df_recomendation.loc[idx, 'job_text'][:100].replace('\n', ' ') + "..."
        print(f"{i}. Skor Kecocokan: {prob:.2f}")
        print(f"   Job: {title}\n")

In [29]:
user_input = """
Saya adalah seorang desainer grafis yang mahir menggunakan Adobe Illustrator, Photoshop, dan Canva.
Saya memiliki portofolio desain logo, brosur, dan konten media sosial.
Saya juga mampu bekerja dengan tim kreatif dan memahami tren desain terkini.
"""

recommend_jobs(user_input, top_n=5)


=== Rekomendasi Pekerjaan Teratas untuk User ===
1. Skor Kecocokan: 0.69
   Job: We are seeking a highly motivated, passionate, and tech-savvy Data Support and Analyst who will be s...

2. Skor Kecocokan: 0.69
   Job: We are seeking a highly motivated, passionate, and tech-savvy Data Support and Analyst who will be s...

3. Skor Kecocokan: 0.69
   Job: We are seeking a highly motivated, passionate, and tech-savvy Data Support and Analyst who will be s...

4. Skor Kecocokan: 0.69
   Job: We are seeking a highly motivated, passionate, and tech-savvy Data Support and Analyst who will be s...

5. Skor Kecocokan: 0.69
   Job: We are seeking a highly motivated, passionate, and tech-savvy Data Support and Analyst who will be s...



In [30]:
# Simpan model dan vectorizer
joblib.dump(model, 'model_workfinder.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']