In [None]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Paths to the main CSV file and to the folders holding the detail/mapping tables.
# The three folder paths end with "/" and are combined with file names via os.path.join.
csv_path = "/content/drive/MyDrive/dataset-6/linkedIn_posting/lowongan_kerja.csv"
detail_pekerjaan_folder = "/content/drive/MyDrive/dataset-6/linkedIn_posting/detail_pekerjaan/"
detail_perusahaan_folder = "/content/drive/MyDrive/dataset-6/linkedIn_posting/detail_perusahaan/"
mapping_folder = "/content/drive/MyDrive/dataset-6/linkedIn_posting/mapping/"

In [None]:
# Load the main job-postings table.
# Reuse csv_path instead of repeating the literal so the location is configured in one place.
lowongan_kerja = pd.read_csv(csv_path)

In [None]:
# Load the four per-job detail tables from the detail_pekerjaan folder
benefit, industri_pekerjaan, skill_pekerjaan, upah = (
    pd.read_csv(os.path.join(detail_pekerjaan_folder, nama_file))
    for nama_file in ("benefit.csv", "industri_pekerjaan.csv", "skill_pekerjaan.csv", "upah.csv")
)

In [None]:
# Load the four per-company detail tables from the detail_perusahaan folder
industri_perusahaan, jumlah_karyawan, perusahaan, spesialisasi_perusahaan = (
    pd.read_csv(os.path.join(detail_perusahaan_folder, nama_file))
    for nama_file in (
        "industri_perusahaan.csv",
        "jumlah_karyawan.csv",
        "perusahaan.csv",
        "spesialisasi_perusahaan.csv",
    )
)

In [None]:
# Load the two lookup tables (industry names, skill names) from the mapping folder
industri, kemampuan = (
    pd.read_csv(os.path.join(mapping_folder, nama_file))
    for nama_file in ("industri.csv", "kemampuan.csv")
)

MERGE DATA

In [None]:
# Attach the per-job detail tables to the postings on id_pekerjaan.
# Left joins keep every posting; a posting that matches several detail rows
# is repeated once per match.
data = (
    lowongan_kerja
    .merge(benefit, on='id_pekerjaan', how='left')
    .merge(industri_pekerjaan, on='id_pekerjaan', how='left')
    .merge(skill_pekerjaan, on='id_pekerjaan', how='left')
    .merge(upah, on='id_pekerjaan', how='left')
)

In [None]:
# Attach the per-company detail tables on id_perusahaan (left joins, same order as listed)
data = (
    data
    .merge(industri_perusahaan, on='id_perusahaan', how='left')
    .merge(jumlah_karyawan, on='id_perusahaan', how='left')
    .merge(perusahaan, on='id_perusahaan', how='left')
    .merge(spesialisasi_perusahaan, on='id_perusahaan', how='left')
)

In [None]:
# Resolve industry ids and skill abbreviations to readable names.
# The lookup columns are renamed with a "_lowongan" suffix before joining so the
# joined name columns are called nama_industri_lowongan / nama_kemampuan_lowongan.
industri_lowongan = industri.rename(
    columns={'id_industri': 'id_industri_lowongan', 'nama_industri': 'nama_industri_lowongan'}
)
kemampuan_lowongan = kemampuan.rename(
    columns={'singkatan_kemampuan': 'singkatan_kemampuan_lowongan', 'nama_kemampuan': 'nama_kemampuan_lowongan'}
)

data = data.merge(industri_lowongan, left_on='id_industri', right_on='id_industri_lowongan', how='left')
data = data.merge(kemampuan_lowongan, left_on='singkatan_kemampuan', right_on='singkatan_kemampuan_lowongan', how='left')

In [None]:
# Columns retained for the recommender, grouped by theme.
# NOTE(review): the "_y" suffix presumably comes from pandas suffixing columns that
# exist on both sides of a merge — confirm which source table these values come from.
kolom_gaji = ['gaji_maksimal_y', 'gaji_median', 'gaji_minimal_y', 'periode_pembayaran_y']
kolom_lokasi = ['lokasi', 'negara_bagian', 'negara', 'kota', 'kode_pos']
columns_to_keep = [
    'id_pekerjaan',
    *kolom_gaji,
    *kolom_lokasi,
    'benefit', 'jenis',
    'nama_kemampuan_lowongan',
    'spesialisasi',
    'nama_industri_lowongan',
]

In [None]:
# Keep only the selected columns, in the order given by columns_to_keep
data = data.loc[:, columns_to_keep]

In [None]:
# Persist the merged table to datafix.csv in the working directory (index not written)
data.to_csv('datafix.csv', index=False)

In [None]:
# Reload the merged table from the CSV written above; all later cells work on df
df = pd.read_csv("datafix.csv")

In [None]:
# Preview the reloaded table (the notebook renders a truncated head/tail view)
df

Unnamed: 0,id_pekerjaan,gaji_maksimal_y,gaji_median,gaji_minimal_y,periode_pembayaran_y,lokasi,negara_bagian,negara,kota,kode_pos,benefit,jenis,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan
0,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,Digital Hearing Instruments,Medical Equipment Manufacturing
1,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,behind ear,Medical Equipment Manufacturing
2,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,in ear,Medical Equipment Manufacturing
3,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,open ear,Medical Equipment Manufacturing
4,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,audiologist,Medical Equipment Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,80000.0,,35000.0,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,salon,Personal Care Product Manufacturing
3296779,3958427,80000.0,,35000.0,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,extensions,Personal Care Product Manufacturing
3296780,3958427,80000.0,,35000.0,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,mens hair,Personal Care Product Manufacturing
3296781,3958427,80000.0,,35000.0,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,olaplex,Personal Care Product Manufacturing


In [None]:
# Salary normalisation helper
def convert_column(df, gaji_column, periode_column='periode_pembayaran_y'):
    """Rescale a salary column to a per-month amount, modifying ``df`` in place.

    The raw values are preserved in a companion column named
    ``f'{gaji_column}_m_rp'``. The conversion always starts from that preserved
    copy, so calling the function again (for example by re-running the cell)
    yields the same result instead of rescaling already converted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``gaji_column`` and ``periode_column``.
    gaji_column : str
        Name of the salary column to convert.
    periode_column : str, optional
        Name of the pay-period column. Defaults to ``'periode_pembayaran_y'``.

    Returns
    -------
    None
        ``df`` is mutated: ``gaji_column`` is overwritten with the converted
        values and the companion column is created if it does not exist yet.

    Notes
    -----
    Rows are rescaled by pay period: ``HOURLY`` is multiplied by ``8 * 20``,
    ``YEARLY`` is divided by 12 and ``WEEKLY`` is multiplied by 4.34524. Every
    other period value is left untouched.
    NOTE(review): if the data contains further periods (e.g. a bi-weekly one),
    they are currently treated like monthly values — confirm against the dataset.
    """
    backup_column = f'{gaji_column}_m_rp'

    # Only snapshot the raw values once; a second call must not overwrite the
    # snapshot with already converted numbers.
    if backup_column not in df.columns:
        df[backup_column] = df[gaji_column].copy()

    # Restart from the raw values so repeated calls do not compound the factors.
    df[gaji_column] = df[backup_column].copy()

    periode = df[periode_column]
    df.loc[periode == 'HOURLY', gaji_column] *= 8 * 20
    df.loc[periode == 'YEARLY', gaji_column] /= 12
    df.loc[periode == 'WEEKLY', gaji_column] *= 4.34524

# Salary columns that have to be normalised by pay period
gaji_columns = ['gaji_maksimal_y', 'gaji_median', 'gaji_minimal_y']

# Convert each salary column in place (convert_column mutates df and returns nothing)
for gaji_column in gaji_columns:
    convert_column(df, gaji_column)

# Show part of the result for verification
df

Unnamed: 0,id_pekerjaan,gaji_maksimal_y,gaji_median,gaji_minimal_y,periode_pembayaran_y,lokasi,negara_bagian,negara,kota,kode_pos,benefit,jenis,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,gaji_maksimal_y_m_rp,gaji_median_m_rp,gaji_minimal_y_m_rp
0,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,Digital Hearing Instruments,Medical Equipment Manufacturing,,5250.0,
1,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,behind ear,Medical Equipment Manufacturing,,5250.0,
2,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,in ear,Medical Equipment Manufacturing,,5250.0,
3,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,open ear,Medical Equipment Manufacturing,,5250.0,
4,3757940104,,5250.0,,MONTHLY,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,audiologist,Medical Equipment Manufacturing,,5250.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,6666.666667,,2916.666667,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,salon,Personal Care Product Manufacturing,80000.0,,35000.0
3296779,3958427,6666.666667,,2916.666667,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,extensions,Personal Care Product Manufacturing,80000.0,,35000.0
3296780,3958427,6666.666667,,2916.666667,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,mens hair,Personal Care Product Manufacturing,80000.0,,35000.0
3296781,3958427,6666.666667,,2916.666667,YEARLY,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,olaplex,Personal Care Product Manufacturing,80000.0,,35000.0


In [None]:
# Sanity check: smallest value of the converted maximum-salary column
df['gaji_maksimal_y'].min()

1.25

In [None]:
# Sanity check: largest value of the converted maximum-salary column
# NOTE(review): an extreme maximum here would point at outliers — inspect before bucketing.
df['gaji_maksimal_y'].max()

583340833.3333334

In [None]:
# Sanity check: smallest value of the converted minimum-salary column
df['gaji_minimal_y'].min()

1.0

In [None]:
# Sanity check: largest value of the converted minimum-salary column
df['gaji_minimal_y'].max()

24000000.0

In [None]:
def find_range(input_data, range_list, interval, missing_label=None):
    """Return the label of the half-open bucket ``start <= input_data < end``.

    Parameters
    ----------
    input_data : number or None
        Value to classify.
    range_list : iterable of str
        Bucket labels of the form ``"<start>-<end>"`` with integer bounds;
        they are tested in the given order and the first match wins.
    interval : int
        Unused by the lookup; kept so existing callers keep working.
    missing_label : str, optional
        Label returned when ``input_data`` is missing (``None`` or NaN).
        When left as ``None`` the legacy behaviour is kept and missing values
        receive the overflow label ``"> 1000000"``.

    Returns
    -------
    str
        The matching label from ``range_list``, or ``"> 1000000"`` when no
        bucket contains the value.

    Notes
    -----
    A NaN never satisfies ``start <= x < end``, so without ``missing_label`` a
    missing salary is indistinguishable from a salary above every bucket.
    Pass ``missing_label`` to separate the two cases; any code that parses the
    labels afterwards must then accept that extra label.
    """
    overflow_label = "> 1000000"

    # Handle missing input explicitly instead of letting it fall through the loop
    # (None would otherwise raise on the comparison below).
    if input_data is None or pd.isna(input_data):
        return overflow_label if missing_label is None else missing_label

    for range_str in range_list:
        start_text, end_text = range_str.split('-')
        if int(start_text) <= input_data < int(end_text):
            return range_str
    return overflow_label

In [None]:
# Bucket configuration: lower bound, upper bound and bucket width
min_data, max_data, interval = 0, 1000000, 5000

# Build the "<start>-<end>" labels, one per bucket, in ascending order
range_list = [
    "-".join((str(awal), str(awal + interval)))
    for awal in range(min_data, max_data, interval)
]

In [None]:
# Add the range_gaji column: the salary bucket of each row's minimum salary.
# Only gaji_minimal_y is needed, so map over that single column instead of
# df.apply(axis=1), which builds a full row object for each of the millions of rows.
df['range_gaji'] = df['gaji_minimal_y'].map(lambda gaji: find_range(gaji, range_list, interval))

In [None]:
# Drop the pay period, the three converted salary columns and their raw "_m_rp"
# copies; range_gaji is the only salary information kept from here on
kolom_gaji_dibuang = [
    'periode_pembayaran_y',
    'gaji_minimal_y', 'gaji_maksimal_y', 'gaji_median',
    'gaji_maksimal_y_m_rp', 'gaji_median_m_rp', 'gaji_minimal_y_m_rp',
]
df = df.drop(columns=kolom_gaji_dibuang)

df

Unnamed: 0,id_pekerjaan,lokasi,negara_bagian,negara,kota,kode_pos,benefit,jenis,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,range_gaji
0,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,Digital Hearing Instruments,Medical Equipment Manufacturing,> 1000000
1,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,behind ear,Medical Equipment Manufacturing,> 1000000
2,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,in ear,Medical Equipment Manufacturing,> 1000000
3,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,open ear,Medical Equipment Manufacturing,> 1000000
4,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,audiologist,Medical Equipment Manufacturing,> 1000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,salon,Personal Care Product Manufacturing,0-5000
3296779,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,extensions,Personal Care Product Manufacturing,0-5000
3296780,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,mens hair,Personal Care Product Manufacturing,0-5000
3296781,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,olaplex,Personal Care Product Manufacturing,0-5000


In [None]:
# Preview of the table after the salary columns were replaced by range_gaji
df

Unnamed: 0,id_pekerjaan,lokasi,negara_bagian,negara,kota,kode_pos,benefit,jenis,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,range_gaji
0,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,Digital Hearing Instruments,Medical Equipment Manufacturing,> 1000000
1,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,behind ear,Medical Equipment Manufacturing,> 1000000
2,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,in ear,Medical Equipment Manufacturing,> 1000000
3,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,open ear,Medical Equipment Manufacturing,> 1000000
4,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,1.0,Medical insurance,Other,audiologist,Medical Equipment Manufacturing,> 1000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,salon,Personal Care Product Manufacturing,0-5000
3296779,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,extensions,Personal Care Product Manufacturing,0-5000
3296780,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,mens hair,Personal Care Product Manufacturing,0-5000
3296781,3958427,"Chicago, IL",Illinois,US,Chicago,60647,0.0,Medical insurance,Information Technology,olaplex,Personal Care Product Manufacturing,0-5000


In [None]:
# Turn the benefit flag of a row into a readable description
def deskripsi_benefit(row):
    """Describe the benefit information of one row.

    ``row`` must support ``row['benefit']`` and ``row['jenis']``. A flag equal
    to 1 yields ``"terdapat benefit yaitu <jenis>"``, a flag equal to 0 yields
    ``"tidak ada benefit"`` and anything else (including NaN) yields
    ``"tidak diketahui"``.

    NOTE(review): the notebook output shows rows with benefit == 0 that still
    carry a ``jenis`` value — confirm that 0 really means "no benefit".
    """
    flag = row['benefit']
    if flag == 1:
        return f"terdapat benefit yaitu {row['jenis']}"
    if flag == 0:
        return "tidak ada benefit"
    return "tidak diketahui"

# Add the readable description as a new last column, then remove the two
# source columns it was derived from
df = (
    df
    .assign(deskripsi_benefit=df.apply(deskripsi_benefit, axis=1))
    .drop(columns=['benefit', 'jenis'])
)

In [None]:
# Preview of the table with deskripsi_benefit in place of benefit/jenis
df

Unnamed: 0,id_pekerjaan,lokasi,negara_bagian,negara,kota,kode_pos,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,range_gaji,deskripsi_benefit
0,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,Digital Hearing Instruments,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
1,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,behind ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
2,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,in ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
3,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,open ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
4,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,audiologist,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,salon,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296779,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,extensions,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296780,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,mens hair,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296781,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,olaplex,Personal Care Product Manufacturing,0-5000,tidak ada benefit


DATA CLEAN

In [None]:
# Count the missing values in every column
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 id_pekerjaan                   0
lokasi                         0
negara_bagian               4834
negara                      3863
kota                        3866
kode_pos                    6730
nama_kemampuan_lowongan    19022
spesialisasi               53056
nama_industri_lowongan      3015
range_gaji                     0
deskripsi_benefit              0
dtype: int64


In [None]:
# Discard rows where any of the location columns is missing
kolom_lokasi_wajib = ['negara_bagian', 'negara', 'kota', 'kode_pos']
df = df.dropna(subset=kolom_lokasi_wajib, how='any')

In [None]:
# Fill missing values with the mode (the most frequent value) of the group
def fill_with_mode(group):
    """Return ``group`` with its missing entries replaced.

    The replacement is the first entry of ``group.mode()``; when the group has
    no mode at all (``mode()`` is empty) the text ``"Tidak Diketahui"`` is used.
    """
    kandidat = group.mode()
    pengganti = "Tidak Diketahui" if kandidat.empty else kandidat.iloc[0]
    return group.fillna(pengganti)

In [None]:
# Discard rows without a skill name or without an industry name
df = df.dropna(subset=['nama_kemampuan_lowongan', 'nama_industri_lowongan'])

# Fill missing spesialisasi values per industry, using fill_with_mode on each group
spesialisasi_terisi = df.groupby('nama_industri_lowongan')['spesialisasi'].transform(fill_with_mode)
df = df.assign(spesialisasi=spesialisasi_terisi)

In [None]:
# Re-count the missing values after cleaning
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 id_pekerjaan               0
lokasi                     0
negara_bagian              0
negara                     0
kota                       0
kode_pos                   0
nama_kemampuan_lowongan    0
spesialisasi               0
nama_industri_lowongan     0
range_gaji                 0
deskripsi_benefit          0
dtype: int64


In [None]:
# Remove fully duplicated rows (first occurrence is kept, index labels are not reset)
df = df.drop_duplicates()

In [None]:
# Number of rows and columns left after cleaning and de-duplication
df.shape

(711063, 11)

In [None]:
# Preview of the cleaned table
df

Unnamed: 0,id_pekerjaan,lokasi,negara_bagian,negara,kota,kode_pos,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,range_gaji,deskripsi_benefit
0,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,Digital Hearing Instruments,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
1,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,behind ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
2,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,in ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
3,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,open ear,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
4,3757940104,"Little River, SC",New Jersey,US,Somerset,08873,Other,audiologist,Medical Equipment Manufacturing,> 1000000,terdapat benefit yaitu Medical insurance
...,...,...,...,...,...,...,...,...,...,...,...
3296778,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,salon,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296779,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,extensions,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296780,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,mens hair,Personal Care Product Manufacturing,0-5000,tidak ada benefit
3296781,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,olaplex,Personal Care Product Manufacturing,0-5000,tidak ada benefit


In [None]:
# Colab-only helper used below to trigger a browser download
from google.colab import files

# Save the dataframe as a CSV file in Colab's local storage (index not written)
df.to_csv('final_data.csv', index=False)

# Download the CSV file to the user's computer
files.download('final_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Combine the columns of each row into one space-separated text document.
# Derived columns are excluded so that re-running this cell does not feed a
# previously built combined_features (or a computed average_salary) back into the text.
kolom_fitur = [kolom for kolom in df.columns if kolom not in ('combined_features', 'average_salary')]
df['combined_features'] = df[kolom_fitur].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Initialise the TF-IDF vectoriser
vectorizer = TfidfVectorizer()

# Learn the vocabulary and vectorise every row; rows of tfidf_matrix follow the row order of df
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Job recommendation based on TF-IDF cosine similarity
def recommend_jobs(user_input, df, tfidf_matrix, top_n=10):
    """Return the rows of ``df`` most similar to a free-text query.

    Parameters
    ----------
    user_input : str
        Query text; it is vectorised with the module-level ``vectorizer``.
    df : pandas.DataFrame
        Job table; rows are selected by position, so row ``i`` of ``df`` must
        correspond to row ``i`` of ``tfidf_matrix``.
    tfidf_matrix : sparse matrix
        TF-IDF matrix produced by ``vectorizer.fit_transform``.
    top_n : int, optional
        Maximum number of rows to return. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows ordered from most to least similar. Rows whose
        similarity is zero are left out, so a query that shares no term with
        the vocabulary (for instance a misspelled word) returns an empty frame
        instead of arbitrary rows.
    """
    # Vectorise the query with the already fitted vocabulary
    user_tfidf = vectorizer.transform([user_input])

    # Cosine similarity between the query and every job row
    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Positions of the top_n highest similarities, best first
    similar_indices = cosine_similarities.argsort()[::-1][:top_n]

    # A zero similarity means "no overlap at all"; such rows are not recommendations
    similar_indices = similar_indices[cosine_similarities[similar_indices] > 0]

    return df.iloc[similar_indices]

In [None]:
# Ask the user for a free-text job query.
# NOTE: input() blocks until something is typed, so this cell stops an unattended "Run All".
user_input = input("Masukkan informasi pekerjaan yang Anda cari (contoh: 'Dallas Python Data Science Technology 9000 Health insurance'): ")

# Get the recommendations for that query
recommendations = recommend_jobs(user_input, df, tfidf_matrix)

recommendations

Masukkan informasi pekerjaan yang Anda cari (contoh: 'Dallas Python Data Science Technology 9000 Health insurance'): healht


Unnamed: 0,id_pekerjaan,lokasi,negara_bagian,negara,kota,kode_pos,nama_kemampuan_lowongan,spesialisasi,nama_industri_lowongan,range_gaji,deskripsi_benefit,combined_features
3296782,3958427,"Chicago, IL",Illinois,US,Chicago,60647,Information Technology,women cuts,Personal Care Product Manufacturing,0-5000,tidak ada benefit,"3958427 Chicago, IL Illinois US Chicago 60647 ..."
1138230,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,HR,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138238,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Talent Acquisition,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138237,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Time and Attendance,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138236,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,SaaS,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138235,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Software-as-a-Service,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138234,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Benefits Administration,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138233,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,HRIS,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138232,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Tax Credits,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."
1138231,3755592400,"Miami, FL",Oklahoma,US,Oklahoma City,73142,Project Management,Human Resources,Financial Services,> 1000000,terdapat benefit yaitu 401(k),"3755592400 Miami, FL Oklahoma US Oklahoma City..."


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# Combine the columns of each row into one space-separated text document.
# combined_features already exists when this cell runs after the earlier TF-IDF
# cell, so it (and average_salary, added further down) must be excluded;
# otherwise the previous combined text is joined into itself.
kolom_fitur = [kolom for kolom in df.columns if kolom not in ('combined_features', 'average_salary')]
df['combined_features'] = df[kolom_fitur].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Initialise the TF-IDF vectoriser
vectorizer = TfidfVectorizer()

# Learn the vocabulary and vectorise every row; rows of tfidf_matrix follow the row order of df
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Job recommendation restricted to postings that list a benefit
def recommend_jobs(user_input, df, tfidf_matrix, top_n=10):
    """Return the most similar rows of ``df`` that have a known benefit.

    Parameters
    ----------
    user_input : str
        Query text; it is vectorised with the module-level ``vectorizer``.
    df : pandas.DataFrame
        Job table with a ``deskripsi_benefit`` column; rows are selected by
        position, so row ``i`` of ``df`` must correspond to row ``i`` of
        ``tfidf_matrix``.
    tfidf_matrix : sparse matrix
        TF-IDF matrix produced by ``vectorizer.fit_transform``.
    top_n : int, optional
        Maximum number of rows to return. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows ordered from most to least similar.

    Notes
    -----
    Rows whose ``deskripsi_benefit`` contains "tidak ada benefit" or
    "tidak diketahui" (or is missing) are excluded *before* the ``top_n`` cut.
    Filtering after the cut, as this function previously did, returns fewer
    than ``top_n`` rows whenever some of the best matches lack a benefit.
    Rows with zero similarity are excluded as well.
    """
    # Vectorise the query with the already fitted vocabulary
    user_tfidf = vectorizer.transform([user_input])

    # Cosine similarity between the query and every job row
    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Rows without a usable benefit description; a missing description counts as unknown
    tanpa_benefit = (
        df['deskripsi_benefit']
        .str.contains("tidak ada benefit|tidak diketahui", na=True)
        .to_numpy(dtype=bool)
    )
    layak = ~tanpa_benefit & (cosine_similarities > 0)

    # Rank all rows best-first, keep only the eligible ones, then cut to top_n
    urutan = cosine_similarities.argsort()[::-1]
    urutan = urutan[layak[urutan]][:top_n]

    return df.iloc[urutan]

# Example user query (fixed text, so this cell runs without interaction)
user_input = "Dallas Python Data Science Technology 9000 Health insurance"

# Get the recommendations for that query
recommendations = recommend_jobs(user_input, df, tfidf_matrix)

# Representative salary of a range_gaji label
def calculate_average_salary(range_gaji):
    """Convert a salary-range label into a single number.

    A label of the form ``"<lower>-<upper>"`` (integer bounds) gives the
    midpoint of the two bounds. A label starting with ``">"`` gives the number
    that follows the ``">"`` as a float.
    """
    if not range_gaji.startswith('>'):
        batas_bawah, batas_atas = (int(bagian) for bagian in range_gaji.split('-'))
        return (batas_bawah + batas_atas) / 2
    # Open-ended label: strip the leading '>' and parse the remainder
    return float(range_gaji[1:])

# Representative salary for every job in the full table
df['average_salary'] = df['range_gaji'].map(calculate_average_salary)

# Add the same figure to the recommendations (assign works on a copy, so the
# slice of df is not modified) and list the best-paid recommendations first
recommendations = (
    recommendations
    .assign(average_salary=recommendations['range_gaji'].map(calculate_average_salary))
    .sort_values(by='average_salary', ascending=False)
    .head(10)
)

# Placeholder ground truth for the whole dataset: a job counts as relevant when it
# lists a benefit and its representative salary is above 5000.
# In a real evaluation these dummy labels must be replaced with actual relevance data.
# The benefit test matches the "terdapat benefit" prefix; a plain substring test for
# 'benefit' is also true for "tidak ada benefit" and would mark no-benefit jobs as relevant.
punya_benefit = df['deskripsi_benefit'].str.startswith("terdapat benefit")
gaji_di_atas_batas = df['range_gaji'].map(calculate_average_salary) > 5000
true_labels = (punya_benefit & gaji_di_atas_batas).astype(int).tolist()

# Prediction: 1 for the rows that ended up in the recommendations, 0 for all others
predicted_labels = df.index.isin(recommendations.index).astype(int).tolist()

# Compute precision, recall and F1-score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Report the recommendations followed by the three evaluation scores.
# Recall is computed against every positively labelled row of the dataset while at
# most 10 rows are predicted positive, so it is expected to be close to zero.
print("Top rekomendasi pekerjaan dengan gaji dan benefit terbaik:")
print(recommendations)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


Top rekomendasi pekerjaan dengan gaji dan benefit terbaik:
         id_pekerjaan      lokasi negara_bagian negara    kota kode_pos  \
1678145    3701371097  Dallas, TX         Texas     US  Dallas    75201   
1678132    3701371097  Dallas, TX         Texas     US  Dallas    75201   

        nama_kemampuan_lowongan  spesialisasi nama_industri_lowongan  \
1678145           Manufacturing  Data Science              Insurance   
1678132              Management  Data Science              Insurance   

         range_gaji                        deskripsi_benefit  \
1678145  5000-10000  terdapat benefit yaitu Vision insurance   
1678132  5000-10000  terdapat benefit yaitu Vision insurance   

                                         combined_features  average_salary  
1678145  3701371097 Dallas, TX Texas US Dallas 75201 Ma...          7500.0  
1678132  3701371097 Dallas, TX Texas US Dallas 75201 Ma...          7500.0  

Precision: 1.0
Recall: 7.99888015677805e-06
F1-Score: 1.599763235041214e-