<a href="https://colab.research.google.com/github/rininuruls/Algoritma-Genetika/blob/main/classification_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, CountVectorizer, IDF,
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("BeasiswaBAZNASClassification")
    .getOrCreate()
)

# Read the scholarship CSV; the first row is the header and column types are inferred.
df = spark.read.csv(
    "/content/Dataset_Beasiswa_BAZNAS_Indonesia.csv",
    inferSchema=True,
    header=True,
)

# Preview the first five rows.
df.show(5)


+---+-------+-------------------+-----------------+----+--------+----+----------------+--------------------+-------------------+-------------+
| ID|   Nama|Pendapatan_Keluarga|Jumlah_Tanggungan| IPK|Semester|Usia|Status_Orang_Tua|Aktivitas_Organisasi|Pekerjaan_Sampingan|Skor_Motivasi|
+---+-------+-------------------+-----------------+----+--------+----+----------------+--------------------+-------------------+-------------+
|  1|   Gita|            3213573|                5|3.94|       4|  19|           Yatim|               Aktif|                Ada|           91|
|  2|   Tari|            1525893|                4|3.23|       1|  18|     Yatim Piatu|         Tidak Aktif|                Ada|           91|
|  3|   Omar|            4610102|                4|2.66|       7|  24|           Yatim|               Aktif|          Tidak Ada|           69|
|  4|Kartini|            3169455|                3|3.32|       7|  22|           Yatim|         Tidak Aktif|                Ada|           58|

In [4]:
# 1. Derive the scholarship target column: 1 = eligible ("Layak"), 0 = not eligible ("Tidak Layak").
#    A row is labelled 1 only when IPK is at least 3.2 AND family income is below 4,000,000.
ipk_memenuhi = df["IPK"] >= 3.2
pendapatan_memenuhi = df["Pendapatan_Keluarga"] < 4000000
df = df.withColumn("label", when(ipk_memenuhi & pendapatan_memenuhi, 1).otherwise(0))

In [6]:
# Names of the numeric columns in the dataset.
fitur_numerik = [
    "Pendapatan_Keluarga",
    "Jumlah_Tanggungan",
    "IPK",
    "Semester",
    "Usia",
    "Skor_Motivasi",
]


In [7]:
# Indexing and one-hot encoding of the categorical columns.
def _index_and_encode(input_col, index_col, encoded_col):
    """Build the StringIndexer / OneHotEncoder pair for one categorical column.

    Args:
        input_col: name of the raw categorical column.
        index_col: name of the intermediate index column written by the indexer.
        encoded_col: name of the one-hot vector column written by the encoder.

    Returns:
        A ``(indexer, encoder)`` tuple of unfitted stages. The indexer is
        created with ``handleInvalid="keep"``.
    """
    indexer = StringIndexer(inputCol=input_col, outputCol=index_col, handleInvalid="keep")
    encoder = OneHotEncoder(inputCol=index_col, outputCol=encoded_col)
    return indexer, encoder


status_indexer, status_encoder = _index_and_encode(
    "Status_Orang_Tua", "status_index", "status_encoded"
)

organisasi_indexer, organisasi_encoder = _index_and_encode(
    "Aktivitas_Organisasi", "organisasi_index", "organisasi_encoded"
)

pekerjaan_indexer, pekerjaan_encoder = _index_and_encode(
    "Pekerjaan_Sampingan", "pekerjaan_index", "pekerjaan_encoded"
)

In [8]:
from pyspark.ml.feature import StandardScaler

# Pack the numeric columns into a single vector column before scaling.
# The column list is the `fitur_numerik` list defined in an earlier cell, so the
# numeric feature set is declared in exactly one place instead of being repeated here.
numerik_assembler = VectorAssembler(
    inputCols=fitur_numerik,
    outputCol="fitur_numerik_vec",
)

# Scale the assembled numeric vector (StandardScaler with its default settings).
numerik_scaler = StandardScaler(
    inputCol="fitur_numerik_vec",
    outputCol="fitur_numerik_scaled",
)

In [9]:
# Concatenate the scaled numeric vector and the three one-hot encoded
# categorical vectors into the single "features" column.
feature_assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "fitur_numerik_scaled",
        "status_encoded",
        "organisasi_encoded",
        "pekerjaan_encoded",
    ],
)

In [10]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the assembled "features" column, predicting "label".
# The seed is fixed so repeated runs build the same forest.
classifier = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=5,
    seed=42,
)

In [13]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF

# Tokenise the motivation score as text.
# Tokenizer needs a string input column, while Skor_Motivasi is read as an integer,
# so the stage reads the string copy "Skor_Motivasi_Str". That column must be created
# with a cast to string before any pipeline containing this stage is fitted.
tokenizer = Tokenizer(inputCol="Skor_Motivasi_Str", outputCol="words")

# Remove stop words from the token list.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Turn the remaining tokens into term-count vectors.
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="rawFeatures")

# Re-weight the term counts with IDF to obtain TF-IDF features.
idf = IDF(inputCol="rawFeatures", outputCol="motivation_features")


In [24]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
# `col` is imported here because this cell uses it; without this import the cell
# raises NameError when the notebook is run top to bottom on a fresh kernel.
from pyspark.sql.functions import col

# 1. Add a string copy of the numeric motivation score so it can be tokenised as text.
#    withColumn replaces a same-named column, so re-running this cell is safe.
df = df.withColumn("Skor_Motivasi_Str", col("Skor_Motivasi").cast("string"))

# 2. Text preprocessing: tokenise -> drop stop words -> term counts -> TF-IDF.
tokenizer = Tokenizer(inputCol="Skor_Motivasi_Str", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="title_features")

# 3. Categorical encoding (here: Status_Orang_Tua only).
status_indexer = StringIndexer(inputCol="Status_Orang_Tua", outputCol="status_index", handleInvalid="keep")
status_encoder = OneHotEncoder(inputCol="status_index", outputCol="status_encoded")

# 4. Combine features. This rebinds `feature_assembler`: only the TF-IDF vector and
#    the encoded parent status feed the model from this point on.
feature_assembler = VectorAssembler(
    inputCols=["title_features", "status_encoded"],  # add further feature columns here if needed
    outputCol="features"
)

# 5. Random forest. This rebinds `classifier`; only the seed is set explicitly here.
classifier = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)

# 6. Full pipeline: stages run in the listed order.
pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    vectorizer,
    idf,
    status_indexer,
    status_encoder,
    feature_assembler,
    classifier
])


In [16]:
from pyspark.sql.functions import col

# Add a string-typed copy of Skor_Motivasi so it can be processed as text.
df = df.withColumn(
    "Skor_Motivasi_Str",
    col("Skor_Motivasi").cast("string"),
)

In [17]:
# Text stages over the string copy of the motivation score:
# tokenise -> drop stop words -> term counts -> TF-IDF.
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="Skor_Motivasi_Str",
)
remover = StopWordsRemover(
    outputCol="filtered_words",
    inputCol="words",
)
vectorizer = CountVectorizer(
    outputCol="rawFeatures",
    inputCol="filtered_words",
)
idf = IDF(
    outputCol="title_features",
    inputCol="rawFeatures",
)


In [22]:
# Randomly split the rows into training and test sets with weights 0.8 / 0.2;
# the seed is fixed so the split is repeatable.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)



In [25]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=train_df)


In [27]:
# Score the held-out test split with the fitted pipeline.
predictions = model.transform(test_df)

# Evaluate model accuracy by comparing "prediction" against "label".
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Each keyword argument is comma-separated; the missing comma after
# labelCol previously made this cell a SyntaxError.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(predictions)
print(f"Akurasi Model: {accuracy}")


Akurasi Model: 0.5432098765432098


In [30]:

from pyspark.sql.functions import when

def _eligibility_name(kolom):
    """Translate a 0/1 eligibility column into its readable name.

    The label built earlier in this notebook is binary (1 = "Layak",
    0 = "Tidak Layak"), so the names follow that scheme rather than a
    three-level Low/Medium/High scale.

    Args:
        kolom: a Spark Column holding the label or the prediction.

    Returns:
        A Column that is "Layak" where ``kolom == 1`` and "Tidak Layak" otherwise.
    """
    return when(kolom == 1, "Layak").otherwise("Tidak Layak")


# Readable name of the true label.
predictions = predictions.withColumn(
    "label_name",
    _eligibility_name(predictions.label),
)

# Readable name of the model's prediction.
predictions = predictions.withColumn(
    "predicted_label_name",
    _eligibility_name(predictions.prediction),
)

In [None]:
# Keep the applicant identifiers together with the true and predicted labels.
# The previous column list (judul, harga, terjual, lokasi) does not exist in this dataset.
save = predictions.select("ID", "Nama", "label_name", "predicted_label_name", "prediction")

# Overwrite any earlier export so the cell can be re-run without a "path already exists" error.
save.write.mode("overwrite").csv("predictions.csv", header=True)