<a href="https://colab.research.google.com/github/semaHbo/job-clustering-salary-prediction/blob/main/model_gbt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar -xzf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark


In [3]:
import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("GBT_Model").getOrCreate()


In [5]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
!ls /content/drive/MyDrive/datasets


archive.zip


In [8]:
train_df = spark.read.parquet("/content/drive/MyDrive/datasets/train_df.parquet")
test_df = spark.read.parquet("/content/drive/MyDrive/datasets/test_df.parquet")


In [9]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Modeli tanımlama
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="Log_Maas_USD",
    predictionCol="prediction",
    maxIter=100,  # toplam ağaç sayısı
    maxDepth=5,   # ağaçların derinliği
    seed=42
)

# Eğitimi başlatma
gbt_model = gbt.fit(train_df)

# Test seti üzerinde tahmin yapma
predictions = gbt_model.transform(test_df)

# R² skorunu hesaplama
evaluator = RegressionEvaluator(
    labelCol="Log_Maas_USD",
    predictionCol="prediction",
    metricName="r2"
)

r2_score = evaluator.evaluate(predictions)
print(" GBT R² Skoru:", round(r2_score, 4))


 GBT R² Skoru: 0.5453


In [10]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, PCA
from pyspark.ml import Pipeline


In [11]:
# Tokenizer: metni kelimelere ayırmak icin
tokenizer = Tokenizer(inputCol="Pozisyon", outputCol="pozisyon_token")

# HashingTF: kelimeleri sayısal temsile dönüştürmek icin
hashingTF = HashingTF(inputCol="pozisyon_token", outputCol="pozisyon_tf", numFeatures=100)

# IDF: kelimelerin önem ağırlıklarını hesaplamak icin
idf = IDF(inputCol="pozisyon_tf", outputCol="pozisyon_tfidf")

# PCA: Boyutu azaltma (10 bileşene indiriyoruz)
pca = PCA(k=10, inputCol="pozisyon_tfidf", outputCol="pozisyon_vec")

# Pipeline: tüm adımları zincirlemak icin
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, pca])


In [12]:
# train_df ve test_df'yi birleştirme dönüşüm tek seferde yapılsın
df_all = train_df.union(test_df)

# Pipelineı eğitme ve uygulama
model = pipeline.fit(df_all)
df_transformed = model.transform(df_all)


In [14]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["pozisyon_vec"],  # daha sonra diğer feature'ları da ekleyebiliriz
    outputCol="features_new"     # önceki 'features' ile çakışmaması için yeni ad
)

df_ready = assembler.transform(df_transformed)


In [15]:
from pyspark.sql.functions import monotonically_increasing_id

df_ready_indexed = df_ready.withColumn("row_id", monotonically_increasing_id())


In [17]:
train_df_indexed = train_df.withColumn("row_id", monotonically_increasing_id())
test_df_indexed = test_df.withColumn("row_id", monotonically_increasing_id())
#setlere indeks ekleme

In [21]:
df_features = df_ready_indexed.select("row_id", "features_new")


In [19]:
train_df_final = train_df_indexed.join(df_features, on=["row_id"], how="inner")
test_df_final = test_df_indexed.join(df_features, on=["row_id"], how="inner")
#Merge işlemi: train_df_indexed ve df_features birleştirilir

In [23]:
train_df_indexed_clean = train_df_indexed.drop("Log_Maas_USD")
test_df_indexed_clean = test_df_indexed.drop("Log_Maas_USD")
#Hem train_df_indexed hem test_df_indexed'den Log_Maas_USD’yi cikarma

In [24]:
df_features = df_ready_indexed.select("row_id", "features_new", "Log_Maas_USD")


In [25]:
train_df_final = train_df_indexed_clean.join(df_features, on="row_id", how="inner")
test_df_final = test_df_indexed_clean.join(df_features, on="row_id", how="inner")


In [26]:
#model egitimi
gbt = GBTRegressor(
    featuresCol="features_new",
    labelCol="Log_Maas_USD",
    predictionCol="prediction",
    maxIter=100,
    maxDepth=5,
    seed=42
)

model = gbt.fit(train_df_final)
predictions = model.transform(test_df_final)

evaluator = RegressionEvaluator(
    labelCol="Log_Maas_USD",
    predictionCol="prediction",
    metricName="r2"
)

r2_score = evaluator.evaluate(predictions)
print(" TF-IDF + PCA GBT R² Skoru (stratified veriyle):", round(r2_score, 4))


 TF-IDF + PCA GBT R² Skoru (stratified veriyle): 0.0351


sonuc cok kotu yeni vektör olusturup özellikler dahil edilerek tekrar denenmeli sonuclar analiz edilmeli

In [27]:
assembler = VectorAssembler(
    inputCols=[
        "pozisyon_vec",
        "Deneyim_Seviyesi_Encoded",
        "Calisma_Tipi_Encoded",
        "Sirket_Ulke_Encoded",
        "Sirket_Buyuklugu_Encoded"
    ],
    outputCol="features_new"
)
