In [24]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, NaiveBayes, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time
import gdown
import pandas as pd
#import pickle

In [25]:
# Google Drive share link for the dataset and the local file name it is saved under.
url = 'https://drive.google.com/uc?id=13ZePGua4XOzuTxI9omH5QnhOpt0LPRLc'
output = 'file.csv'
gdown.download(url, output, quiet=False)

# The CSV is read back from the same path the download wrote to.
file_path = output

# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("InsuranceClassification")
    .getOrCreate()
)

# Load the dataset: the first row holds column names and column types are inferred.
df = spark.read.csv(path=file_path, inferSchema=True, header=True)

# Split the columns into features and label.
# Assumes the last column of the CSV is the label — confirm against the dataset.
feature_columns = df.columns[:-1]
label_column = df.columns[-1]

# Look the column types up once instead of rebuilding the dict for every column.
column_types = dict(df.dtypes)
string_feature_columns = {
    column for column in feature_columns if column_types[column] == 'string'
}

# Fit one StringIndexer per string-typed feature so it becomes a numeric index column.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    for column in feature_columns if column in string_feature_columns
]

# Each fitted indexer appends its "<column>_index" column to df.
for indexer in indexers:
    df = indexer.transform(df)

# Feature list for the assembler: indexed name for string columns, original name otherwise.
indexed_feature_columns = [
    column + "_index" if column in string_feature_columns else column
    for column in feature_columns
]

assembler = VectorAssembler(inputCols=indexed_feature_columns, outputCol="features")
# The label is selected through the DataFrame's own column accessor: `col` from
# pyspark.sql.functions is not imported in this notebook, so calling it here
# raised NameError on a fresh kernel.
assembled = assembler.transform(df)
data = assembled.select("features", assembled[label_column].alias("label"))

# Convert the label into a numeric index and keep only what the models need.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
data = label_indexer.transform(data).select("features", "indexedLabel")

# 80/20 train/test split, seeded with 42.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# All three classifiers read the same feature-vector and indexed-label columns.
shared_columns = {"labelCol": "indexedLabel", "featuresCol": "features"}

# Logistic Regression, capped at 10 iterations.
lr = LogisticRegression(maxIter=10, **shared_columns)

# Naive Bayes with its default settings.
nb = NaiveBayes(**shared_columns)

# Random Forest with 10 trees.
rf = RandomForestClassifier(numTrees=10, **shared_columns)

# One dict of metrics per evaluated model is appended here.
results = []

# Fit one classifier, score it on the held-out split, and record its metrics.
def train_and_evaluate(model, model_name):
    """Train ``model`` on ``train_data`` and append its test metrics to ``results``.

    Args:
        model: Unfitted estimator; ``model.fit(train_data)`` must return an object
            with a ``transform`` method that adds a "prediction" column.
        model_name: Text stored under the "Model" key of the recorded row.

    Returns:
        None. A dict with the training time, accuracy, F1, weighted precision and
        weighted recall is appended to the module-level ``results`` list.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    trained_model = model.fit(train_data)
    training_time = time.perf_counter() - start_time

    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")

    # The predictions are scored four times below; caching avoids recomputing
    # the model's transform over the test split for every metric.
    predictions = trained_model.transform(test_data).cache()
    try:
        accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
        f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
        precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
        recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    finally:
        # Release the cached predictions even if an evaluation fails.
        predictions.unpersist()

    # Record one summary row for this model.
    results.append({
        "Model": model_name,
        "Training Time (s)": training_time,
        "Accuracy": accuracy,
        "F1-score": f1,
        "Precision": precision,
        "Recall" : recall
    })

# Run every candidate classifier through the same train/evaluate routine.
for candidate, display_name in (
    (lr, "Log. Regression"),
    (nb, "Naive Bayes"),
    (rf, "Random Forest"),
):
    train_and_evaluate(candidate, display_name)

# Collect the per-model rows into one table and print it under a heading.
results_df = pd.DataFrame(results)
print("\nSummary of Results:", results_df, sep="\n")

# The Spark session is deliberately left running at this point.

Downloading...
From: https://drive.google.com/uc?id=13ZePGua4XOzuTxI9omH5QnhOpt0LPRLc
To: c:\Users\User\AppData\Local\Programs\Microsoft VS Code\file.csv
100%|██████████| 21.4M/21.4M [00:08<00:00, 2.55MB/s]



Summary of Results:
             Model  Training Time (s)  Accuracy  F1-score  Precision    Recall
0  Log. Regression           4.033404  0.876957  0.819468   0.769053  0.876957
1      Naive Bayes           1.279230  0.518574  0.599483   0.788506  0.518574
2    Random Forest           4.955732  0.876957  0.819468   0.769053  0.876957


In [26]:
# Directory the fitted Naive Bayes model is written to.
model_path = "naive_bayes_model"

# Fit Naive Bayes on the training split and save it with the model's own
# writer, replacing any model already stored at that path.
nb_model = nb.fit(train_data)
nb_model.write().overwrite().save(model_path)

# Shut down the Spark session now that the model has been saved.
spark.stop()