<a href="https://colab.research.google.com/github/semaHbo/job-clustering-salary-prediction/blob/main/predict_jobs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar -xzf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

findspark.init()


In [27]:
from pyspark.sql import SparkSession

# Start the Spark session used by this notebook (or pick up a running one).
spark = (
    SparkSession.builder
    .appName("JobSalaryPrediction")
    .getOrCreate()
)

In [28]:
# Mount Google Drive at /content/drive; the data and model paths used in the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
import os
# Sanity check: show which files are present in the raw-data folder on the
# mounted Drive. The bare expression is displayed as the cell's output.
os.listdir("/content/drive/MyDrive/job-clustering-salary-prediction/data/raw")


['jobs_sample.csv']

In [30]:
from pyspark.sql.functions import col

# Schema inspection helper. `df` is only created by the prediction cell further
# down, so on a fresh kernel (Restart & Run All) it does not exist yet when this
# cell runs. Guard against that instead of failing with a NameError; nothing
# downstream depends on this cell because the prediction cell rebuilds `df`
# from the CSV itself.
if "df" in globals():
    # Casting an already-integer column to int again is a no-op, so re-running
    # this cell is safe.
    df = df.withColumn("Yil", col("Yil").cast("int"))
    df.printSchema()
else:
    print("`df` is not defined yet - run the prediction cell below first, then re-run this cell.")


root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- Deneyim_Seviyesi: string (nullable = true)
 |-- Calisma_Tipi: string (nullable = true)
 |-- Sirket_Buyuklugu: string (nullable = true)
 |-- Sirket_Ulke: string (nullable = true)
 |-- Remote_Tipi: string (nullable = true)
 |-- Kita: string (nullable = true)
 |-- Yil: integer (nullable = true)
 |-- pozisyon_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pozisyon_vec: vector (nullable = true)
 |-- Deneyim_Seviyesi_Encoded: double (nullable = false)
 |-- Calisma_Tipi_Encoded: double (nullable = false)
 |-- Sirket_Buyuklugu_Encoded: double (nullable = false)
 |-- Sirket_Ulke_Encoded: double (nullable = false)
 |-- Remote_Tipi_Encoded: double (nullable = false)
 |-- Kita_Encoded: double (nullable = false)



In [35]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec, StringIndexer, VectorAssembler
from pyspark.ml.regression import GBTRegressionModel
from pyspark.sql.functions import lower, regexp_replace
import pandas as pd

# Start Spark. getOrCreate() hands back an already-running session when there
# is one, so re-running this cell does not spawn a second session.
spark = (
    SparkSession.builder
    .appName("PredictSalaries")
    .getOrCreate()
)

# Path of the input CSV on the mounted Drive
csv_path = "/content/drive/MyDrive/job-clustering-salary-prediction/data/raw/jobs_sample.csv"

# Read the data, using the first row as the column names
df = spark.read.csv(csv_path, header=True)

# Cleaning: normalise the descriptions by replacing anything that looks like
# an HTML tag with a space and lower-casing the remaining text
df = df.withColumn(
    "description",
    lower(
        regexp_replace("description", "<[^>]+>", " ")
    ),
)

# Split the cleaned description text into tokens (stored as "pozisyon_token")
tokenizer = Tokenizer(
    inputCol="description",
    outputCol="pozisyon_token",
)
df = tokenizer.transform(df)

# Fit a Word2Vec model on these tokens and add a 10-dimensional
# "pozisyon_vec" column.
# NOTE(review): the model is re-fit on the prediction data here rather than
# loaded from training; if the GBT model was trained with a different Word2Vec
# model, these vectors may not be comparable - confirm.
w2v = Word2Vec(
    vectorSize=10,
    minCount=1,
    seed=42,
    inputCol="pozisyon_token",
    outputCol="pozisyon_vec",
)
w2v_model = w2v.fit(df)
df = w2v_model.transform(df)

# Categorical columns that are turned into numeric "<name>_Encoded" columns
kategorik_sutunlar = [
    "Deneyim_Seviyesi", "Calisma_Tipi", "Sirket_Buyuklugu",
    "Sirket_Ulke", "Remote_Tipi", "Kita"
]
from pyspark.sql.functions import col

# The CSV is read without a schema, so the year arrives as a string; cast it to
# int so it can be used as a numeric feature.
df = df.withColumn("Yil", col("Yil").cast("int"))


# Apply a StringIndexer to every categorical column.
# The loop variable is deliberately NOT called `col`: doing so would rebind the
# `col` function imported above to a plain string, and any later `col(...)`
# call in the kernel would fail with "'str' object is not callable".
# NOTE(review): the indexers are fit on the prediction data, so the
# label -> index mapping may differ from the one used in training - confirm.
for sutun in kategorik_sutunlar:
    indexer = StringIndexer(inputCol=sutun, outputCol=f"{sutun}_Encoded", handleInvalid="keep")
    df = indexer.fit(df).transform(df)

# Combine all features into a single vector column with VectorAssembler.
# Feature order: text embedding first, then the encoded categorical columns
# (in the order of kategorik_sutunlar), then the year.
# NOTE(review): presumably this order has to match the one used when the GBT
# model was trained - confirm against the training notebook.
ozellikler = (
    ["pozisyon_vec"]
    + [f"{sutun_adi}_Encoded" for sutun_adi in kategorik_sutunlar]
    + ["Yil"]
)

assembler = VectorAssembler(
    inputCols=ozellikler,
    outputCol="features_enriched",
)
df = assembler.transform(df)

import os

import numpy as np

# Load the trained GBT model (the model produced in model_training_2)
model_path = "/content/drive/MyDrive/job-clustering-salary-prediction/models/gbt_salary_model"
model = GBTRegressionModel.load(model_path)

# Predict
predictions = model.transform(df)

# Collect the id, title and prediction columns into pandas for saving
results = predictions.select("id", "title", "prediction").toPandas()

# NOTE(review): exponentiating assumes the model was trained on log(salary);
# confirm against the training notebook (use np.expm1 if log1p was used).
results["gercek_maas_usd"] = np.exp(results["prediction"])

output_path = "/content/drive/MyDrive/job-clustering-salary-prediction/data/output/predicted_salaries.csv"
# to_csv does not create missing parent folders; make sure the output
# directory exists so the save does not fail on a fresh Drive layout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results.to_csv(output_path, index=False)

print(" Tahmin tamamlandı. Sonuç kaydedildi:")
print(output_path)


 Tahmin tamamlandı. Sonuç kaydedildi:
/content/drive/MyDrive/job-clustering-salary-prediction/data/output/predicted_salaries.csv
