In [None]:
!pip install pyspark pandas matplotlib seaborn scikit-learn

In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler # Mungkin tidak perlu semua, tapi untuk jaga-jaga

# Spark/Hive settings applied to the session builder, in this order.
# NOTE(review): the metastore user name and password are hard-coded here;
# consider loading them from the environment or a secrets store.
_spark_settings = {
    "spark.sql.warehouse.dir": "hdfs://namenode:9000/user/hive/warehouse",
    "spark.hadoop.javax.jdo.option.ConnectionURL": "jdbc:postgresql://postgres-hive-metastore-db:5432/metastore",
    "spark.hadoop.javax.jdo.option.ConnectionDriverName": "org.postgresql.Driver",
    "spark.hadoop.javax.jdo.option.ConnectionUserName": "hiveuser",
    "spark.hadoop.javax.jdo.option.ConnectionPassword": "hivepassword",
    "spark.sql.catalogImplementation": "hive",
    "spark.driver.host": "localhost",
}

# Build (or reuse) a Hive-enabled SparkSession configured with the settings above.
_session_builder = SparkSession.builder.appName("MedanTrafficModelEvalViz_HadoopEnv")
for _setting_key, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_key, _setting_value)
spark = _session_builder.enableHiveSupport().getOrCreate()

print("SparkSession berhasil diinisialisasi untuk lingkungan Hadoop.")

# Gold-layer output path on HDFS
hdfs_gold_output_path = "hdfs://namenode:9000/data/gold/traffic_prediction_results.parquet"

# --- Cell 3: Load the prediction results (Gold layer) from HDFS ---
print(f"Membaca data hasil prediksi dari HDFS: {hdfs_gold_output_path}")

try:
    final_gold_df = spark.read.parquet(hdfs_gold_output_path)
    print("Data hasil prediksi (Gold Layer) berhasil dibaca dari HDFS.")
    # Print the schema and the first five rows as a quick sanity check.
    final_gold_df.printSchema()
    final_gold_df.show(5)
except Exception as e:
    print(f"Error membaca data hasil prediksi dari HDFS: {e}")
    print("Pastikan layanan HDFS dan Spark Master di Docker berjalan dan Spark Job ETL & ML telah selesai.")
    print("Jika Anda menjalankan ini di Colab tanpa koneksi ke Docker, bagian ini akan gagal.")
    spark.stop()
    # Re-raise so execution stops here with the real cause. Without this the
    # script would continue with `final_gold_df` undefined and a stopped
    # session, failing later with an unrelated NameError.
    raise

# --- Cell 4: Re-evaluate the model (optional) ---
print("Melakukan evaluasi ulang model...")

# Both evaluators compare the same label/prediction column pair; only the
# metric differs.
_evaluator_columns = {
    "labelCol": "actual_avg_speed_kmh",
    "predictionCol": "predicted_avg_speed_kmh",
}
evaluator_rmse = RegressionEvaluator(metricName="rmse", **_evaluator_columns)
evaluator_mae = RegressionEvaluator(metricName="mae", **_evaluator_columns)

# Recompute each metric over the loaded Gold-layer predictions and report it.
rmse = evaluator_rmse.evaluate(final_gold_df)
print(f"Root Mean Squared Error (RMSE) yang dihitung ulang = {rmse}")

mae = evaluator_mae.evaluate(final_gold_df)
print(f"Mean Absolute Error (MAE) yang dihitung ulang = {mae}")

# --- Cell 5: Visualise the prediction results ---
print("Membuat visualisasi hasil prediksi...")

# Collect the Spark DataFrame onto the driver as a pandas DataFrame for plotting.
results_df_pd = final_gold_df.toPandas()

# Line chart: predicted average speed per hour for the five roads that have
# the most rows in the results.
plt.figure(figsize=(18, 10))
_rows_per_road = results_df_pd['road_name'].value_counts()
top_roads = _rows_per_road.nlargest(5).index
_is_top_road = results_df_pd['road_name'].isin(top_roads)
filtered_results_pd = results_df_pd[_is_top_road]

sns.lineplot(
    data=filtered_results_pd,
    x='hour',
    y='predicted_avg_speed_kmh',
    hue='road_name',
    marker='o',
    palette='tab10',
)
plt.title('Predicted Average Speed by Road and Hour (Top 5 Roads)')
plt.xlabel('Hour of Day')
plt.ylabel('Predicted Average Speed (km/h)')
plt.grid(True, linestyle='--', alpha=0.7)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title='Road Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Count plot: distribution of the predicted congestion levels, with the bars
# shown in the fixed order High, Medium, Low.
congestion_levels = ['High', 'Medium', 'Low']
plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal
# announced for 0.14). Assigning the x variable to `hue` with a matching
# `hue_order` and suppressing the redundant legend keeps the same per-bar
# colouring without the FutureWarning.
sns.countplot(
    data=results_df_pd,
    x='predicted_congestion_level',
    hue='predicted_congestion_level',
    order=congestion_levels,
    hue_order=congestion_levels,
    palette='viridis',
    legend=False,
)
plt.title('Predicted Congestion Level Distribution')
plt.xlabel('Congestion Level')
plt.ylabel('Count')
plt.show()

# Scatter plot: actual vs. predicted average speed, with a y = x reference line.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=results_df_pd,
    x='actual_avg_speed_kmh',
    y='predicted_avg_speed_kmh',
    alpha=0.5,
)
plt.title('Actual vs Predicted Average Speed')
plt.xlabel('Actual Average Speed (km/h)')
plt.ylabel('Predicted Average Speed (km/h)')

# The reference diagonal spans the observed range of the actual speeds; the
# same end points are used for both axes so the line is exactly y = x.
_actual_speed = results_df_pd['actual_avg_speed_kmh']
_diagonal_ends = [_actual_speed.min(), _actual_speed.max()]
plt.plot(
    _diagonal_ends,
    _diagonal_ends,
    color='red',
    linestyle='--',
    linewidth=2,
    label='Perfect Prediction',
)
plt.legend()
plt.grid(True)
plt.show()

# --- Final cell: stop the SparkSession ---
# All evaluation and plotting above is finished, so the session is shut down
# and a confirmation message is printed.
spark.stop()
print("Sesi Spark dihentikan.")