In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


spark = SparkSession.builder \
    .appName("US Accidents Severity Prediction") \
    .config("spark.driver.memory", "8g") \
    .master("local[*]") \
    .getOrCreate()

In [2]:
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("US_Accidents_March23.csv")

In [3]:
df2 = df \
    .drop("ID","Source", "Zipcode", "Timezone", "Airport_Code", "Amenity",
          "Bump", "Give_Way", "No_Exit", "Railway", "Description", "County",
          "Roundabout", "Station", "Stop", "Nautical_Twilight", "Astronomical_Twilight", "Country")

In [4]:
df3 = df2 \
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss")) \
    .withColumn("End_TS", to_timestamp(col("End_Time"), "yyyy-MM-dd HH:mm:ss")) \
    .withColumn("Duration", ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60).cast(DoubleType())) \
    .drop("Start_TS", "End_TS", "Start_Time", "End_Time")

df3.show()

+--------+------------------+------------------+-------+-------+------------+--------------------+------------+-----+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+--------+--------+---------------+--------------+------------+--------------+--------------+--------+
|Severity|         Start_Lat|         Start_Lng|End_Lat|End_Lng|Distance(mi)|              Street|        City|State|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Crossing|Junction|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Duration|
+--------+------------------+------------------+-------+-------+------------+--------------------+------------+-----+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-------------

In [5]:
selected_cols = [ "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)", "Crossing", "Traffic_Signal",
      "Wind_Speed(mph)", "Precipitation(in)", "Weather_Condition", "Wind_Direction", "Junction", "Duration","Severity",
      "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Wind_Chill(F)",  "Street_Cleaned" ]


# Hangi sütunları işleyeceğimizi tanımla
columns_to_clean = ["City", "Street"]
top_n = 128


# Sık geçen değerleri belirleyip "Other" ile gruplayan fonksiyon
def clean_column(df, column_name, top_n=128):
    top_values_df = df.groupBy(column_name).count().orderBy(col("count").desc()).limit(top_n)
    top_values_list = [row[column_name] for row in top_values_df.collect()]
   
    cleaned_col_name = f"{column_name}_Cleaned"
    df = df.withColumn(
        cleaned_col_name,
        when(col(column_name).isin(top_values_list), col(column_name)).otherwise("Other")
    )
    return df


In [6]:
# Her sütun için işlemi uygula
for col_name in columns_to_clean:
    df3 = clean_column(df3, col_name, top_n=top_n)
    
df_selected = df3.select(*selected_cols)

categorical_cols = [
    "Weather_Condition", "Wind_Direction", "Civil_Twilight",
    "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned"
]


indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_Idx", handleInvalid="keep")
    for col in categorical_cols
]

feature_cols = ["Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition_Idx", "Wind_Direction_Idx", "Civil_Twilight_Idx",
    "Sunrise_Sunset_Idx", "State_Idx", "City_Cleaned_Idx", "Street_Cleaned_Idx"
]


assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)


# 7) Random Forest sınıflayıcı (parametrelere daha sonra grid ile dokunacağız)
rf = RandomForestClassifier(
    labelCol="Severity",
    featuresCol="features",
    seed=42
)

df_selected.show()

+--------------+-----------+------------+--------------+--------+--------------+---------------+-----------------+-----------------+--------------+--------+--------+--------+--------------+--------------+-----+------------+-------------+--------------+
|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Crossing|Traffic_Signal|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Wind_Direction|Junction|Duration|Severity|Civil_Twilight|Sunrise_Sunset|State|City_Cleaned|Wind_Chill(F)|Street_Cleaned|
+--------------+-----------+------------+--------------+--------+--------------+---------------+-----------------+-----------------+--------------+--------+--------+--------+--------------+--------------+-----+------------+-------------+--------------+
|          36.9|       91.0|       29.68|          10.0|   false|         false|           NULL|             0.02|       Light Rain|          Calm|   false|   314.0|       3|         Night|         Night|   OH|      Dayton|         NULL|    

In [7]:
# 9. NA temizliği ve veri bölme
df_no_na = df_selected.dropna().cache()
train, test = df_no_na.randomSplit([0.8, 0.2], seed=42)

df_no_na.show()

+--------------+-----------+------------+--------------+--------+--------------+---------------+-----------------+-----------------+--------------+--------+--------+--------+--------------+--------------+-----+------------+-------------+--------------+
|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Crossing|Traffic_Signal|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Wind_Direction|Junction|Duration|Severity|Civil_Twilight|Sunrise_Sunset|State|City_Cleaned|Wind_Chill(F)|Street_Cleaned|
+--------------+-----------+------------+--------------+--------+--------------+---------------+-----------------+-----------------+--------------+--------+--------+--------+--------------+--------------+-----+------------+-------------+--------------+
|          37.9|       97.0|       29.63|           7.0|   false|         false|            3.5|             0.03|       Light Rain|           SSW|   false|    30.0|       3|           Day|           Day|   OH|       Other|         35.5|    

In [8]:
# 8. Pipeline
pipeline = Pipeline(stages=indexers + [assembler, rf])

In [9]:
# 9) Hiperparametre ızgarası
paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [20, 50, 100])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .addGrid(rf.maxBins,  [216, 256])
    .build()
)

In [11]:
# 1) Evaluator'ı tanımla
evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity",
    predictionCol="prediction",
    metricName="f1"    # ya da "accuracy", "weightedPrecision", vs.
)

In [12]:
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=2
)

In [13]:
# 11) Veri böl (önce eğitim, sonra CV.fit)
train, test = df_selected.randomSplit([0.8,0.2], seed=42)

cvModel = cv.fit(train)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\aslay\bert_env\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\aslay\AppData\Local\Temp\ipykernel_15612\3014251591.py", line 4, in <module>
    cvModel = cv.fit(train)
              ^^^^^^^^^^^^^
  File "C:\spark\python\pyspark\ml\base.py", line 205, in fit
    return self._fit(dataset)
           ^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\pyspark\ml\tuning.py", line 847, in _fit
    for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
  File "C:\Users\aslay\AppData\Local\Programs\Python\Python312\Lib\multiprocessing\pool.py", line 873, in next
    raise value
  File "C:\Users\aslay\AppData\Local\Programs\Python\Python312\Lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\pys

ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [None]:
# 12) En iyi parametreler
bestRF = cvModel.bestModel.stages[-1]
print("=== En İyi RF Parametreleri ===")
print(f" numTrees = {bestRF.getNumTrees}")
print(f" maxDepth = {bestRF.getOrDefault('maxDepth')}")

In [None]:
model.write().overwrite().save("models/us_accidents_severity_rf")

In [None]:
# 11. Tahmin üret
preds = cvModel.transform(test)
df_no_na.unpersist()

evaluator = MulticlassClassificationEvaluator(labelCol="Severity", predictionCol="prediction")

accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
f1 = evaluator.setMetricName("f1").evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Weighted Precision: {precision:.4f}")
print(f"Weighted Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8523
Weighted Precision: 0.8104
Weighted Recall: 0.8523
F1 Score: 0.7844


In [8]:
df_no_na.printSchema()  # Display the schema of the cleaned DataFrame

ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [2]:
# %% [code]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType

# 1) SparkSession oluştur
spark = SparkSession.builder \
    .appName("US Accidents Severity CV") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# 2) CSV'i oku ve gereksiz sütunları at
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)
df = df.drop(
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity",
    "Bump","Give_Way","No_Exit","Railway","Description","County",
    "Roundabout","Station","Stop","Nautical_Twilight",
    "Astronomical_Twilight","Country"
)

# 3) Başlangıç / Bitiş zamanlarını timestamp'e çevir, Duration (dakika) hesapla
df = (
    df.withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
      .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
      .withColumn(
          "Duration",
          ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
          .cast(DoubleType())
      )
      .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 4) City/Street için cardinality'yi top 32 ile sınırlayan fonksiyon
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n).collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City", "Street"]:
    df = clean_column(df, c, top_n=32)

# 5) Modelde kullanacağımız sütunları tanımla
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Crossing", "Traffic_Signal", "Wind_Speed(mph)", "Precipitation(in)",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned",
    "Wind_Chill(F)", "Street_Cleaned"
]

# 6) Kategorik sütunlar ve VectorAssembler için feature listesi
categorical_cols = [
    "Weather_Condition", "Wind_Direction",
    "Civil_Twilight",     "Sunrise_Sunset",
    "State",              "City_Cleaned",
    "Street_Cleaned"
]

feature_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition_Idx", "Wind_Direction_Idx",
    "Civil_Twilight_Idx",     "Sunrise_Sunset_Idx",
    "State_Idx",              "City_Cleaned_Idx",
    "Street_Cleaned_Idx"
]

# 7) Son olarak seç, NA'ları at ve cachele
df_selected = df.select(*selected_cols)
df_no_na   = df_selected.dropna().cache()


In [None]:
# %% [code]
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline

# 8) Pipeline + RF + Hiperparametre Izgarası
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Idx", handleInvalid="keep")
    for c in categorical_cols
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

rf = RandomForestClassifier(
    labelCol="Severity",
    featuresCol="features"
)

paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [20, 50])
    .addGrid(rf.maxDepth, [5, 10])
    .addGrid(rf.maxBins, [256])
    .build()
)

evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity",
    predictionCol="prediction",
    metricName="f1"
)

pipeline = Pipeline(stages=indexers + [assembler, rf])

tvs = TrainValidationSplit(
    estimator          = pipeline,
    estimatorParamMaps = paramGrid,
    evaluator          = evaluator,
    trainRatio         = 0.8,
    parallelism        = 1
)

# 9) Eğitim/Test böl ve CV'yi çalıştır
train, test = df_no_na.randomSplit([0.8, 0.2], seed=42)
tvsModel = tvs.fit(train)
bestModel = tvsModel.bestModel

print("🏆 En iyi CV F1 skoru:", max(cvModel.avgMetrics))

# 10) Test kümesinde değerlendir
preds = cvModel.transform(test)
test_f1 = evaluator.evaluate(preds)
test_acc = MulticlassClassificationEvaluator(
    labelCol="Severity",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(preds)

print(f"Test F1 Skoru:    {test_f1:.4f}")
print(f"Test Doğruluk:    {test_acc:.4f}")

Py4JJavaError: An error occurred while calling o5138.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 251.0 failed 1 times, most recent failure: Lost task 12.0 in stage 251.0 (TID 3612) (MSI executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$6451/694568505.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:418)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$6438/1019762470.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$Lambda$2859/626766273.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2605/184492444.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:738)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:737)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
	at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:168)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:139)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:47)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$6451/694568505.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:418)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$6438/1019762470.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$Lambda$2859/626766273.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2605/184492444.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aslay\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] Varolan bir bağlantı uzaktaki bir ana bilgisayar tarafından zorla kapatıldı

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j

In [None]:
# %% [code]
# 11) Eğitilmiş Modeli Kaydet
# En iyi pipeline modeli (PipelineModel) cvModel.bestModel
best_pipeline_model = cvModel.bestModel

# Modeli disk'e yaz
best_pipeline_model.write() \
    .overwrite() \
    .save("models/us_accidents_severity_rf_cv_best")

print("✅ Model başarıyla kaydedildi: models/us_accidents_severity_rf_cv_best")


In [1]:
# %% [markdown]
# ## 1) SparkSession’ı daha yüksek bellek konfigürasyonuyla aç

import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("US Accidents Severity CV - Test") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()


In [2]:
# %% [markdown]
# ## 2) Veri Hazırlığı

from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType

# CSV’yi oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# Gereksiz sütunları at
df = df.drop(
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity",
    "Bump","Give_Way","No_Exit","Railway","Description","County",
    "Roundabout","Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
)

# Timestamp → Duration (dakika)
df = df.withColumn("Start_TS",  to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss")) \
       .withColumn("End_TS",    to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss")) \
       .withColumn("Duration", ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60).cast(DoubleType())) \
       .drop("Start_TS","End_TS","Start_Time","End_Time")

# Cardinality’si yüksek City/Street’i top_n=32 dışındakileri "Other" yap
def clean_column(df, column_name, top_n=32):
    top_vals = [r[column_name] for r in
                df.groupBy(column_name).count()
                  .orderBy(col("count").desc())
                  .limit(top_n).collect()]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df = clean_column(df, c, top_n=32)

# Kullanacağımız sütunlar
selected_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Crossing","Traffic_Signal","Wind_Speed(mph)","Precipitation(in)",
    "Weather_Condition","Wind_Direction","Junction","Duration","Severity",
    "Civil_Twilight","Sunrise_Sunset","State","City_Cleaned",
    "Wind_Chill(F)","Street_Cleaned"
]
df_selected = df.select(*selected_cols)

# NA’ları at ve cache
df_no_na = df_selected.dropna().cache()

In [3]:
# %% [markdown]
# ## 3) Pipeline + RF + ParamGrid + TrainValidationSplit

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml import Pipeline

# Kategorikleri indexle
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
indexers = [
    StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
    for c in categorical_cols
]

# Assembler
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Weather_Condition_Idx","Wind_Direction_Idx","Civil_Twilight_Idx",
    "Sunrise_Sunset_Idx","State_Idx","City_Cleaned_Idx","Street_Cleaned_Idx"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# RandomForest (parametreler TrainValidationSplit’ten gelecek)
rf = RandomForestClassifier(labelCol="Severity", featuresCol="features")

# Daraltılmış parametre ızgarası
paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [20, 50])      # 100 kaldırıldı
    .addGrid(rf.maxDepth, [5, 10])       # 15 kaldırıldı
    .addGrid(rf.maxBins, [256])          # 512 kaldırıldı
    .build()
)

# F1 metriğiyle evaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity",
    predictionCol="prediction",
    metricName="f1"
)

# Pipeline
pipeline = Pipeline(stages=indexers + [assembler, rf])

# TrainValidationSplit (paralellik=1, Windows'ta güvenli)
tvs = TrainValidationSplit(
    estimator          = pipeline,
    estimatorParamMaps = paramGrid,
    evaluator          = evaluator,
    trainRatio         = 0.8,
    parallelism        = 1
)


In [5]:
# %% [markdown]
# ## 4) Küçük veri örneğiyle fit & değerlendir & kaydet (ilerlemeyi takip edelim)

# Spark loglarını INFO seviyesinde göster
spark.sparkContext.setLogLevel("INFO")

# 4.1) Split
print("🔀 4.1) Veriyi train/test olarak bölüyoruz…")
train, test = df_no_na.randomSplit([0.8, 0.2], seed=42)
print(f"    → train: {train.count()} satır, test: {test.count()} satır")

# 4.2) Sadece %20 örnek al
print("🔄 4.2) Train’den %20 örnek alınıyor (küçük eğitim seti)…")
small_train = train.sample(fraction=0.2, seed=42)
print(f"    → small_train: {small_train.count()} satır")

# 4.3) Fit
print("⚙️ 4.3) TrainValidationSplit ile model eğitimi başlıyor…")
tvsModel = tvs.fit(small_train)
print("    ✅ Eğtirim tamamlandı")

bestModel = tvsModel.bestModel

# Hangi parametreler en iyiye denk düştü?
bestParams = tvsModel.getEstimatorParamMaps()[tvsModel.validationMetrics.index(max(tvsModel.validationMetrics))]
print(f"    🔍 En iyi parametreler: numTrees={bestParams[rf.numTrees]}, "
      f"maxDepth={bestParams[rf.maxDepth]}, maxBins={bestParams[rf.maxBins]}")
print(f"    🏆 En iyi TVS F1 skoru (küçük veri): {max(tvsModel.validationMetrics):.4f}")

# 4.4) Test kümesinde değerlendir
print("📊 4.4) Test kümesinde değerlendirme yapılıyor…")
preds = bestModel.transform(test)
f1_test = evaluator.evaluate(preds)
acc_test = MulticlassClassificationEvaluator(
    labelCol="Severity",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(preds)

print(f"    Test F1    : {f1_test:.4f}")
print(f"    Test Acc   : {acc_test:.4f}")

# 4.5) Modeli kaydet
print("💾 4.5) Modeli kaydediyoruz…")
bestModel.write().overwrite().save("models/us_accidents_rf_tvs_small")
print("    ✅ Model kaydedildi → models/us_accidents_rf_tvs_small")


🔀 4.1) Veriyi train/test olarak bölüyoruz…
    → train: 4175117 satır, test: 1042802 satır
🔄 4.2) Train’den %20 örnek alınıyor (küçük eğitim seti)…
    → small_train: 835471 satır
⚙️ 4.3) TrainValidationSplit ile model eğitimi başlıyor…
    ✅ Eğtirim tamamlandı
    🔍 En iyi parametreler: numTrees=50, maxDepth=10, maxBins=256
    🏆 En iyi TVS F1 skoru (küçük veri): 0.7914
📊 4.4) Test kümesinde değerlendirme yapılıyor…
    Test F1    : 0.7919
    Test Acc   : 0.8544
💾 4.5) Modeli kaydediyoruz…
    ✅ Model kaydedildi → models/us_accidents_rf_tvs_small


In [4]:
# ——————————————————————————————————————
# A) Özellik-öncesi pipeline’ı çalıştır ve cache’le
print("☑️ Aşama A: Özelliklerin üretilmesi (index + assemble)…")
pipeline_features = Pipeline(stages=indexers + [assembler])
df_feats = pipeline_features.fit(df_no_na).transform(df_no_na) \
    .select("features", "Severity") \
    .cache()
# materialize cache
print(f"   → Toplam örnek: {df_feats.count()} satır\n")

☑️ Aşama A: Özelliklerin üretilmesi (index + assemble)…
   → Toplam örnek: 5217919 satır



In [5]:
print("☑️ Aşama B: RF final model eğitimi başlıyor…")
rf_final = RandomForestClassifier(
    labelCol="Severity",
    featuresCol="features",
    numTrees=50,
    maxDepth=10,
    maxBins=256
)
model_full = rf_final.fit(df_feats)
print("   ✅ Final model eğitimi tamamlandı\n")

# ——————————————————————————————————————
# C) Modeli kaydet
print("💾 Final modeli kaydediyoruz…")
model_full.write().overwrite().save("models/us_accidents_rf_final")
print("   ✅ Model kaydedildi → models/us_accidents_rf_final")

☑️ Aşama B: RF final model eğitimi başlıyor…


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\aslay\bert_env\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\aslay\AppData\Local\Temp\ipykernel_28620\3752981426.py", line 9, in <module>
    model_full = rf_final.fit(df_feats)
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\pyspark\ml\base.py", line 205, in fit
    return self._fit(dataset)
           ^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\pyspark\ml\wrapper.py", line 381, in _fit
    java_model = self._fit_java(dataset)
                 ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\pyspark\ml\wrapper.py", line 378, in _fit_java
    return self._java_obj.fit(dataset._jdf)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
                   ^^^^

ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier
)
# If you have XGBoost4J‑Spark installed, uncomment:
from sparkxgb import XGBoostClassifier

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

# SparkSession
spark = SparkSession.builder \
    .appName("US Accidents Severity – Full Pipeline") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

In [2]:
# 2.1) Read CSV & drop unwanted columns
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df = df.drop(*drop_cols)

# 2.2) Compute Duration (in minutes) from Start_Time / End_Time
df = (
    df
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) Reduce cardinality of City & Street (keep top 32, rest → "Other")
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df = clean_column(df, c, top_n=32)

# 2.4) Select only the features we want
selected_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Weather_Condition","Wind_Direction","Junction","Duration","Severity",
    "Civil_Twilight","Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
df_selected = df.select(*selected_cols)

# 2.5) Drop any rows with nulls & cache
df_no_na = df_selected.dropna().cache()
print(f"✅ df_no_na ready: {df_no_na.count():,} rows, {len(df_no_na.columns)} columns")


✅ df_no_na ready: 5,217,919 rows, 18 columns


In [3]:
# 3.1) Categorical columns & indexers
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
indexers = [
    StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
    for c in categorical_cols
]

# 3.2) Numeric + indexed categorical → feature vector
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal"
] + [c + "_Idx" for c in categorical_cols]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)


In [4]:
# 4.1) Full train/test split
train_full, test_full = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Full split → train: {train_full.count():,}, test: {test_full.count():,}")

# 4.2) From train_full, get small_train & small_val (each ~20% of full)
small_train, small_val, _ = train_full.randomSplit([0.2,0.2,0.6], seed=42)
print(f"🔄 small_train: {small_train.count():,} rows")
print(f"🔄 small_val  : {small_val.count():,} rows")


🔀 Full split → train: 4,175,117, test: 1,042,802
🔄 small_train: 835,471 rows
🔄 small_val  : 836,531 rows


In [8]:
# 5.1) Define the classifiers to compare
from pyspark.ml.classification import OneVsRest, NaiveBayes
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
    OneVsRest,
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) Define models, upping maxBins for trees & OvR parallelism
models_to_try = {
    "LogisticRegression": LogisticRegression(
        labelCol="Severity", featuresCol="features",
        maxIter=20, family="multinomial"
    ),

    "DecisionTree": DecisionTreeClassifier(
        labelCol="Severity", featuresCol="features",
        maxDepth=10, maxBins=512
    ),

    "RandomForest": RandomForestClassifier(
        labelCol="Severity", featuresCol="features",
        numTrees=20, maxBins=512
    ),

    "GBT_OvR": OneVsRest(
        classifier=GBTClassifier(
            labelCol="Severity", featuresCol="features",
            maxIter=20, maxBins=512
        ),
        labelCol="Severity", featuresCol="features"
    ),

    # (optional) if you have XGBoost4J-Spark
    # "XGBoost_OvR": OneVsRest(
    #     classifier=XGBoostClassifier(
    #         labelCol="Severity", featuresCol="features",
    #         numRound=50
    #     ),
    #     labelCol="Severity", featuresCol="features"
    # ),

    "NaiveBayes": NaiveBayes(
        labelCol="Severity", featuresCol="features",
        smoothing=1.0, modelType="multinomial"
    ),
}

# 2) Evaluators
f1_evaluator  = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="f1"
)
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="accuracy"
)

# 3) Loop over each model on small_train / small_val
results = []
for name, clf in models_to_try.items():
    print(f"\n⚙️  Training & evaluating → {name}")
    pipe  = Pipeline(stages=indexers + [assembler, clf])
    model = pipe.fit(small_train)

    preds = model.transform(small_val)
    f1, acc = (
        f1_evaluator.evaluate(preds),
        acc_evaluator.evaluate(preds)
    )

    path = f"models/us_accidents_{name.lower()}_small"
    model.write().overwrite().save(path)

    print(f"    ▶ small_val  F1 = {f1:.4f}, Acc = {acc:.4f}")
    print(f"    💾 Saved → {path}")
    results.append((name, f1, acc, path))

# 4) Pick best by F1
best_name, best_f1, best_acc, best_path = max(results, key=lambda x: x[1])
print(f"\n🏆 Best small model → {best_name} (F1={best_f1:.4f}, Acc={best_acc:.4f})")



⚙️  Training & evaluating → LogisticRegression
    ▶ small_val  F1 = 0.7842, Acc = 0.8517
    💾 Saved → models/us_accidents_logisticregression_small

⚙️  Training & evaluating → DecisionTree
    ▶ small_val  F1 = 0.7961, Acc = 0.8537
    💾 Saved → models/us_accidents_decisiontree_small

⚙️  Training & evaluating → RandomForest
    ▶ small_val  F1 = 0.7865, Acc = 0.8526
    💾 Saved → models/us_accidents_randomforest_small

⚙️  Training & evaluating → GBT_OvR


Py4JJavaError: An error occurred while calling o4428.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 1675.0 failed 1 times, most recent failure: Lost task 9.0 in stage 1675.0 (TID 27917) (MSI executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 24 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:738)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:737)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions$lzycompute(MulticlassMetrics.scala:61)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions(MulticlassMetrics.scala:52)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.labelCountByClass$lzycompute(MulticlassMetrics.scala:66)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.labelCountByClass(MulticlassMetrics.scala:64)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure(MulticlassMetrics.scala:227)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure$lzycompute(MulticlassMetrics.scala:235)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure(MulticlassMetrics.scala:235)
	at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator.evaluate(MulticlassClassificationEvaluator.scala:152)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 24 more


In [1]:
# %% [markdown]
# US Accidents Severity Prediction — Model Karşılaştırması
# LightGBM, XGBoost, LogisticRegression, DecisionTree, RandomForest, GBT

# %% [markdown]
# **1) Spark Session & Kütüphaneler**
# - findspark ile SparkSession başlatılıyor

# %%
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml import Pipeline

spark = SparkSession.builder \
    .appName("US Accidents Severity - Model Comparison") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# %% [markdown]
# **2) Veri Hazırlığı**
# - CSV okuma, gereksiz sütunları atma
# - Start/End zamanı → Duration
# - City/Street cardinality düşürme (top_n=32)
# - Seçilen sütunlar, NA temizleme

# %%
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)
# Gerekli drop'lar
drop_cols = ["ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
             "Give_Way","No_Exit","Railway","Description","County","Roundabout",
             "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"]
df2 = df.drop(*drop_cols)
# Duration hesaplama
df3 = (df2
       .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
       .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
       .withColumn("Duration",
                   ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
                   .cast(DoubleType()))
       .drop("Start_TS","End_TS","Start_Time","End_Time")
)
# Cardinality düşürme

def clean_column(df, column_name, top_n=32):
    top_vals = [r[column_name] for r in
                df.groupBy(column_name).count()
                  .orderBy(col("count").desc())
                  .limit(top_n).collect()]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )
for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)
# Seçilen sütunlar
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned"
]
df_selected = df3.select(*selected_cols)
# Kategorik ve feature listeleri
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal"
] + [c + "_Idx" for c in categorical_cols]
# NA temizle & cache

df_no_na = df_selected.dropna().cache()
print(f"✅ Hazır veri: {df_no_na.count():,} satır, {len(df_no_na.columns)} sütun")



✅ Hazır veri: 5,217,919 satır, 18 sütun


In [1]:
# %% [markdown]
# # 1) Spark setup + imports

# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, when
from pyspark.sql.types import DoubleType

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    NaiveBayes,
    GBTClassifier,
    OneVsRest
)

# start session
spark = SparkSession.builder \
    .appName("US Accidents – Multi‑Model Small‑Train") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .config("spark.driver.maxResultSize","2g") \
    .getOrCreate()

# %% [markdown]
# # 2) Data prep & `df_no_na`

# %% [python]
# 2.1) Read + drop
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)
drop_cols = ["ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
             "Give_Way","No_Exit","Railway","Description","County","Roundabout",
             "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"]
df = df.drop(*drop_cols)

# 2.2) Duration (mins)
df = df.withColumn("Start_TS", to_timestamp("Start_Time")) \
       .withColumn("End_TS",   to_timestamp("End_Time")) \
       .withColumn("Duration",
           ((unix_timestamp("End_TS") - unix_timestamp("Start_TS"))/60).cast(DoubleType())
       ) \
       .drop("Start_TS","End_TS","Start_Time","End_Time")

# 2.3) Cardinality‑reduce City & Street to top 32
def clean_column(df, colname, top_n=32):
    tops = [r[colname]
            for r in df.groupBy(colname).count()
                      .orderBy("count", ascending=False)
                      .limit(top_n).collect()]
    return df.withColumn(colname+"_Cleaned",
            when(col(colname).isin(tops), col(colname)).otherwise("Other"))

for c in ("City","Street"):
    df = clean_column(df, c, top_n=32)

# 2.4) Select features + Severity
selected = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Weather_Condition","Wind_Direction","Junction","Duration",
    "Civil_Twilight","Sunrise_Sunset","State","City_Cleaned","Street_Cleaned",
    "Severity"
]
df = df.select(*selected)

# 2.5) Drop NA & cache
df_no_na = df.dropna().cache()
print(f"✅ df_no_na: {df_no_na.count():,} rows × {len(df_no_na.columns)} cols")

# %% [markdown]
# # 3) Label‐index + categorical indexers + assembler

# %% [python]
# 3.1) Severity → label (0…3)
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")

# 3.2) categoricals
cats = ["Weather_Condition","Wind_Direction","Civil_Twilight",
        "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"]
cat_indexers = [
    StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
    for c in cats
]

# 3.3) feature columns
numeric = ["Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
           "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal"]
features = numeric + [c+"_Idx" for c in cats]

assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="skip")

# %% [markdown]
# # 4) Train/Test split + small subset

# %% [python]
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 train {train.count():,}, test {test.count():,}")

small_train, small_val = train.randomSplit([0.2,0.8], seed=42)
print(f"🔄 small_train {small_train.count():,}, small_val {small_val.count():,}")




✅ df_no_na: 5,217,919 rows × 18 cols
🔀 train 4,175,117, test 1,042,802
🔄 small_train 835,471, small_val 3,339,646

⚙️ Training lr on small_train…
   → lr  small F1: 0.7845, Acc: 0.8520
   ✅ saved → models/us_accidents_lr_small

⚙️ Training dt on small_train…
   → dt  small F1: 0.7912, Acc: 0.8537
   ✅ saved → models/us_accidents_dt_small

⚙️ Training rf on small_train…
   → rf  small F1: 0.7843, Acc: 0.8522
   ✅ saved → models/us_accidents_rf_small

⚙️ Training nb on small_train…


Py4JJavaError: An error occurred while calling o216.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 321.0 failed 1 times, most recent failure: Lost task 9.0 in stage 321.0 (TID 2584) (MSI executor driver): java.lang.RuntimeException: Vector values MUST NOT be Negative, NaN or Infinity, but got [-2.9,72.0,30.2,8.0,10.4,0.0,-19.8,0.0,5.0,5.0,1.0,1.0,8.0,12.0,0.0]
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:92)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:90)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:880)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:880)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.RuntimeException: Vector values MUST NOT be Negative, NaN or Infinity, but got [-2.9,72.0,30.2,8.0,10.4,0.0,-19.8,0.0,5.0,5.0,1.0,1.0,8.0,12.0,0.0]
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:92)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:90)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:880)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:880)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


In [2]:
%pip install xgboost lightgbm catboost scikit-learn

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-1.47.1-py3-none-any.whl.metadata (11 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.3 MB/s eta 0:00:00
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   ---------------------

In [3]:
# Spark modelleri
spark_models = {
    "GBT": GBTClassifier(labelCol="label", featuresCol="features", maxIter=50),
    "LogisticRegression": LogisticRegression(labelCol="label", featuresCol="features", maxIter=20),
    "DecisionTree": DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=256, maxDepth=10),
    "RandomForest": RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50, maxBins=256),
    
}

for name, clf in spark_models.items():
    print(f"\n⚙️ Training {name} on small_train…")
    pipe = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, clf])
    model = pipe.fit(small_train)
    preds = model.transform(small_val)
    f1 = f1_eval.evaluate(preds)
    acc = acc_eval.evaluate(preds)
    print(f"   → {name}: F1={f1:.4f}, Acc={acc:.4f}")
    path = f"models/us_accidents_{name.lower()}_small"
    model.write().overwrite().save(path)
    print(f"   💾 saved → {path}")

# %% [python]
# 5) scikit‐learn / XGBoost / LightGBM / CatBoost Experiments (pandas örneği)
pd_sample = (
    small_train.orderBy(rand()).limit(50000).toPandas()
)
X = pd_sample[[*feature_cols]].astype(float)
y = pd_sample["Severity"].astype(int)

pd_val = (
    small_val.orderBy(rand()).limit(10000).toPandas()
)
X_val = pd_val[[*feature_cols]].astype(float)
y_val = pd_val["Severity"].astype(int)



⚙️ Training GBT on small_train…


Py4JJavaError: An error occurred while calling o3423.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 347.0 failed 1 times, most recent failure: Lost task 7.0 in stage 347.0 (TID 2793) (MSI executor driver): java.lang.RuntimeException: Labels MUST be in {0, 1}, but got 3.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$2(RDD.scala:1226)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2487)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2488)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$1(RDD.scala:1228)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1221)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:125)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:333)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:61)
	at org.apache.spark.ml.classification.GBTClassifier.$anonfun$train$1(GBTClassifier.scala:201)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:170)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.RuntimeException: Labels MUST be in {0, 1}, but got 3.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$2(RDD.scala:1226)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2487)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [4]:
# %% [python]
# 1) Spark ve Kütüphaneleri Başlat
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when, rand
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier,
    RandomForestClassifier, GBTClassifier
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import os

spark = SparkSession.builder \
    .appName("US Accidents Severity Experiments") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .config("spark.driver.maxResultSize","2g") \
    .getOrCreate()

# %% [python]
# 2) Veri Hazırlığı
# 2.1) CSV'yi oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.2) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.3) Start/End → Duration hesapla
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.4) City/Street cardinality düşür

def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.5) Sütunları seç
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned"
]
df_selected = df3.select(*selected_cols)

# 2.6) Kategorik ve feature listeleri
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal"
] + [c + "_Idx" for c in categorical_cols]

# 2.7) NA at ve cache

df_no_na = df_selected.dropna().cache()
print(f"✅ df_no_na hazır: {df_no_na.count():,} kayıt, {len(df_no_na.columns)} sütun")

# %% [python]
# 3) Train/Test Bölme ve Küçük Alt Küme Oluşturma
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")

# Küçük alt küme (spark tarafı)
small = train.sample(fraction=0.2, seed=42)
small_train, small_val = small.randomSplit([0.8,0.2], seed=42)
print(f"🔄 small_train: {small_train.count():,}, small_val: {small_val.count():,}")

# %% [python]
# 4) Spark ML ile küçük alt kümede modelleri dene ve kaydet
# Label indexer
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")
# Categorical indexers
cat_indexers = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
                for c in categorical_cols]
# Assembler\ nassembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")
# Evaluatorlar
f1_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)



✅ df_no_na hazır: 5,217,919 kayıt, 18 sütun
🔀 Bölme → train: 4,175,117, test: 1,042,802
🔄 small_train: 668,070, small_val: 167,401


In [6]:
# Spark modelleri
spark_models = {
    "LogisticRegression": LogisticRegression(labelCol="label", featuresCol="features", maxIter=20),
    "DecisionTree": DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=256, maxDepth=10),
    "RandomForest": RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50, maxBins=256),
    
}

for name, clf in spark_models.items():
    print(f"\n⚙️ Training {name} on small_train…")
    pipe = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, clf])
    model = pipe.fit(small_train)
    preds = model.transform(small_val)
    f1 = f1_eval.evaluate(preds)
    acc = acc_eval.evaluate(preds)
    print(f"   → {name}: F1={f1:.4f}, Acc={acc:.4f}")
    path = f"models/us_accidents_{name.lower()}_small"
    model.write().overwrite().save(path)
    print(f"   💾 saved → {path}")




⚙️ Training LogisticRegression on small_train…
   → LogisticRegression: F1=0.7862, Acc=0.8531
   💾 saved → models/us_accidents_logisticregression_small

⚙️ Training DecisionTree on small_train…
   → DecisionTree: F1=0.7979, Acc=0.8549
   💾 saved → models/us_accidents_decisiontree_small

⚙️ Training RandomForest on small_train…
   → RandomForest: F1=0.7860, Acc=0.8535
   💾 saved → models/us_accidents_randomforest_small


In [10]:
# %% [python]
# Gerekli ek kütüphaneleri import edin (önceki imports geçerli)
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score
import joblib

# ÖNEMLİ: feature_cols_for_assembler ve assembler'ın doğru tanımlandığından emin olun.
# Bu değişkenlerin bu hücreden önce tanımlanmış olması gerekmektedir.
# Örneğin:
# numerical_cols_for_assembler = ["Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
#                                 "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
#                                 "Junction", "Duration"]
# categorical_cols = ["Weather_Condition", "Wind_Direction", "Civil_Twilight", "Sunrise_Sunset",
#                     "State", "City_Cleaned", "Street_Cleaned"]
# feature_cols_for_assembler = numerical_cols_for_assembler + [c + "_Idx" for c in categorical_cols]
# assembler = VectorAssembler(inputCols=feature_cols_for_assembler, outputCol="features", handleInvalid="skip")
# cat_indexers = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep") for c in categorical_cols]
# feature_pipeline_for_pandas = Pipeline(stages=cat_indexers + [assembler])
# feature_pipeline_model_for_pandas = feature_pipeline_for_pandas.fit(small_train) # small_train bu noktada tanımlı olmalı

# small_train ve small_val'ı dönüştür ve Pandas DataFrame'e çek
print("\n🔄 Spark DataFrame'leri Pandas'a dönüştürülüyor...")
pd_sample_transformed = feature_pipeline_model_for_pandas.transform(small_train)
pd_sample = pd_sample_transformed.orderBy(rand()).limit(50000).toPandas()

# 'features' vektör sütununu ve orijinal 'Severity' hedef sütununu al
X = pd.DataFrame(pd_sample['features'].apply(lambda x: x.toArray()).tolist())
# !!! DÜZELTME: Severity değerlerini 0'dan başlayacak şekilde ayarla
y = (pd_sample["Severity"] - 1).astype(int) # Severity'den 1 çıkarıp 0,1,2,3 yapıyoruz

pd_val_transformed = feature_pipeline_model_for_pandas.transform(small_val)
pd_val = pd_val_transformed.orderBy(rand()).limit(10000).toPandas()

X_val = pd.DataFrame(pd_val['features'].apply(lambda x: x.toArray()).tolist())
# !!! DÜZELTME: Severity değerlerini 0'dan başlayacak şekilde ayarla
y_val = (pd_val["Severity"] - 1).astype(int) # Severity'den 1 çıkarıp 0,1,2,3 yapıyoruz

print(f"✅ Pandas DataFrame'leri hazır.")
print(f"   X_train shape (Pandas): {X.shape}, y_train shape (Pandas): {y.shape}")
print(f"   X_val shape (Pandas): {X_val.shape}, y_val shape (Pandas): {y_val.shape}")

# Model tanımlamaları (Çoklu Sınıflandırma için ayarlanmış parametreler)
# num_severity_classes doğru olmalı, çünkü y'yi güncelledik.
num_severity_classes = y.nunique() # unique sınıf sayısını dinamik olarak bulalım

sk_models = {
    "XGBoost": XGBClassifier(
        objective='multi:softmax', num_class=num_severity_classes, eval_metric='mlogloss', use_label_encoder=False, random_state=42
    ),
    "LightGBM": LGBMClassifier(
        objective='multiclass', num_class=num_severity_classes, metric='multi_logloss', random_state=42
    ),
    "CatBoost": CatBoostClassifier(
        iterations=100, learning_rate=0.1, depth=6,
        loss_function='MultiClass',
        verbose=0, random_seed=42
    )
}

results_sklearn = []
for name, clf in sk_models.items():
    print(f"\n⚙️ **{name}** modeli eğitimi (küçük Pandas veri)...")
    clf.fit(X, y)
    preds = clf.predict(X_val)
    
    # F1 ve Accuracy hesaplama (çoklu sınıflandırma için 'weighted' average)
    f1 = f1_score(y_val, preds, average='weighted')
    acc = accuracy_score(y_val, preds)
    
    print(f"   → **{name}**: F1={f1:.4f}, Acc={acc:.4f}")
    
    # Modeli kaydet
    path = f"models/us_accidents_{name.lower()}_small.pkl"
    joblib.dump(clf, path)
    print(f"   💾 Model kaydedildi → {path}")
    results_sklearn.append((name, f1, acc))

# ---
### Sonuçların Karşılaştırılması
print("\n--- Scikit-learn (Pandas) Modellerinin Sonuçları ---")
for res in results_sklearn:
    print(f"Model: {res[0]}, F1: {res[1]:.4f}, Acc: {res[2]:.4f}")


🔄 Spark DataFrame'leri Pandas'a dönüştürülüyor...
✅ Pandas DataFrame'leri hazır.
   X_train shape (Pandas): (50000, 17), y_train shape (Pandas): (50000,)
   X_val shape (Pandas): (10000, 17), y_val shape (Pandas): (10000,)

⚙️ **XGBoost** modeli eğitimi (küçük Pandas veri)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   → **XGBoost**: F1=0.8356, Acc=0.8689
   💾 Model kaydedildi → models/us_accidents_xgboost_small.pkl

⚙️ **LightGBM** modeli eğitimi (küçük Pandas veri)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1285
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17
[LightGBM] [Info] Start training from score -4.412898
[LightGBM] [Info] Start training from score -0.162072
[LightGBM] [Info] Start training from score -2.176834
[LightGBM] [Info] Start training from score -3.725543
   → **LightGBM**: F1=0.8346, Acc=0.8699
   💾 Model kaydedildi → models/us_accidents_lightgbm_small.pkl

⚙️ **CatBoost** modeli eğitimi (küçük Pandas veri)...
   → **CatBoost**: F1=0.8188, Acc=0.8667
   💾 Model kaydedildi → models/us_accidents_catboost_small.pkl

--- Scikit

In [1]:
# %% [python]
# 1) Spark ve Kütüphaneleri Başlat
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when, rand
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier,
    RandomForestClassifier # GBTClassifier'ı çoklu sınıflandırma için varsayılan olarak desteklemediği için kaldırdık
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
import os

spark = SparkSession.builder \
    .appName("US Accidents Severity Experiments - PySpark MLlib") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .config("spark.driver.maxResultSize","2g") \
    .getOrCreate()

# %% [python]
# 2) Veri Hazırlığı
# 2.1) CSV'yi oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.2) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.3) Start/End → Duration hesapla
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.4) City/Street cardinality düşür
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.5) Sütunları seç (Severity'yi orijinal haliyle bırakıyoruz)
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity", # Severity orijinal haliyle kalıyor
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned"
]
df_selected = df3.select(*selected_cols)

# 2.6) Kategorik ve feature listeleri
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
numerical_cols_for_assembler = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction", "Duration"
]

# Assembler'a gidecek tüm feature'lar (numerik + indekslenmiş kategorik)
feature_cols_for_assembler = numerical_cols_for_assembler + [c + "_Idx" for c in categorical_cols]


# 2.7) NA at ve cache
df_no_na = df_selected.dropna().cache()
print(f"✅ df_no_na hazır: {df_no_na.count():,} kayıt, {len(df_no_na.columns)} sütun")

# %% [python]
# 3) Train/Test Bölme ve Küçük Alt Küme Oluşturma
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")

small = train.sample(fraction=0.2, seed=42)
small_train, small_val = small.randomSplit([0.8,0.2], seed=42)
print(f"🔄 small_train: {small_train.count():,}, small_val: {small_val.count():,}")

# %% [python]
# 4) Spark ML Modelleri Tanımlama ve Küçük Veride Deneme
# Label indexer: Severity'yi 0'dan başlayan tamsayı indekslere dönüştürecek
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")
label_col_name_for_spark_models = "label" # Hedef sütunumuz artık "label" olacak

# Kategorik özellikler için indexer'lar
cat_indexers = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
                for c in categorical_cols]
# Tüm özellikleri tek bir vektörde birleştiren assembler
assembler = VectorAssembler(inputCols=feature_cols_for_assembler, outputCol="features", handleInvalid="skip")

# Değerlendirme metrikleri
f1_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="accuracy"
)

# Spark ML Modellerinin Tanımlanması (çoklu sınıflandırma için)
models = {
    "LR": LogisticRegression(labelCol=label_col_name_for_spark_models, featuresCol="features", maxIter=20),
    "DT": DecisionTreeClassifier(labelCol=label_col_name_for_spark_models, featuresCol="features", maxBins=256),
    "RF": RandomForestClassifier(labelCol=label_col_name_for_spark_models, featuresCol="features", numTrees=50, maxBins=256)
}

# Hiperparametre ızgarası (basit örnek)
grids = {
    "LR": ParamGridBuilder().addGrid(models['LR'].regParam, [0.01,0.1]).addGrid(models['LR'].elasticNetParam, [0.0,0.5]).build(),
    "DT": ParamGridBuilder().addGrid(models['DT'].maxDepth, [5,10]).build(),
    "RF": ParamGridBuilder().addGrid(models['RF'].numTrees, [20,50]).addGrid(models['RF'].maxDepth, [5,10]).build()
}

# Her model için küçük veriyle prova (Spark ML modelleri)
results_spark_ml = []
for name, clf in models.items():
    print(f"\n⚙️ **{name}** modeli eğitimi (küçük PySpark veri)...")
    
    # Spark ML Pipeline'ı oluşturma: label_indexer, cat_indexers, assembler ve sınıflandırıcıyı içerir
    pipe_stages = [label_indexer] + cat_indexers + [assembler, clf]
    pipe = Pipeline(stages=pipe_stages)
    
    tvs = TrainValidationSplit(
        estimator=pipe,
        estimatorParamMaps=grids[name],
        evaluator=f1_eval,
        trainRatio=0.8,
        parallelism=1 # Paralelleştirmeyi sınırlayabiliriz
    )
    tvsModel = tvs.fit(small_train)
    best = tvsModel.bestModel
    
    # small_val üzerinde değerlendirme
    preds = best.transform(small_val)
    f1 = f1_eval.evaluate(preds)
    acc = acc_eval.evaluate(preds)
    
    print(f"🏆 **{name}** — small_val F1: {f1:.4f}, Acc: {acc:.4f}")
    
    # Modeli kaydet (Spark ML modellerini kaydetme yolu bu şekildedir)
    path = f"models/us_accidents_{name.lower()}_small_spark"
    best.write().overwrite().save(path)
    print(f"💾 **{name}** modeli kaydedildi → {path}")
    results_spark_ml.append((name, f1, acc))

# %% [python]
# 5) Sonuçların Karşılaştırılması ve En İyi Modelin Tüm Veride Eğitimi
print("\n--- Spark ML (PySpark) Modellerinin Küçük Verideki Sonuçları ---")
for res in results_spark_ml:
    print(f"Model: {res[0]}, F1: {res[1]:.4f}, Acc: {res[2]:.4f}")

# En iyi performansı gösteren modeli bulalım
if results_spark_ml:
    best_model_name = max(results_spark_ml, key=lambda item: item[1])[0] # F1 skoruna göre en iyisi
    print(f"\n✨ Küçük veride en iyi performans gösteren Spark ML modeli: {best_model_name}")

    # En iyi modelin orijinal (daha geniş) tanımlamasını al
    best_clf_full = models[best_model_name]

    print(f"\n⚙️ Final **{best_model_name}** eğitimi tüm `train` verisi üzerinde…")
    
    # Tüm train veri seti üzerinde eğitmek için yeni bir pipeline oluştur (parametreler aynı kalabilir)
    pipe_full_train = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, best_clf_full])
    
    # Tüm train veri seti üzerinde eğit
    model_full_train = pipe_full_train.fit(train) # df_no_na yerine 'train' kullanıyoruz, test için ayırdık
    
    # Modeli kaydet
    full_path = f"models/us_accidents_{best_model_name.lower()}_full_spark"
    model_full_train.write().overwrite().save(full_path)
    print(f"💾 Final {best_model_name} modeli kaydedildi → {full_path}")

    # Son olarak, eğitilmiş modeli 'test' veri seti üzerinde değerlendir
    print(f"\n📊 Final {best_model_name} modelinin 'test' verisi üzerinde değerlendirilmesi...")
    final_preds = model_full_train.transform(test)
    final_f1 = f1_eval.evaluate(final_preds)
    final_acc = acc_eval.evaluate(final_preds)
    print(f"🎉 Final {best_model_name} — Test Seti F1: {final_f1:.4f}, Acc: {final_acc:.4f}")
else:
    print("\n⚠️ Spark ML modelleri için sonuç bulunamadı.")

# Spark Session'ı durdur
spark.stop()
print("\nSpark Session durduruldu.")

✅ df_no_na hazır: 5,217,919 kayıt, 18 sütun
🔀 Bölme → train: 4,175,117, test: 1,042,802
🔄 small_train: 668,070, small_val: 167,401

⚙️ **LR** modeli eğitimi (küçük PySpark veri)...
🏆 **LR** — small_val F1: 0.7862, Acc: 0.8532
💾 **LR** modeli kaydedildi → models/us_accidents_lr_small_spark

⚙️ **DT** modeli eğitimi (küçük PySpark veri)...
🏆 **DT** — small_val F1: 0.8246, Acc: 0.8631
💾 **DT** modeli kaydedildi → models/us_accidents_dt_small_spark

⚙️ **RF** modeli eğitimi (küçük PySpark veri)...
🏆 **RF** — small_val F1: 0.8104, Acc: 0.8609
💾 **RF** modeli kaydedildi → models/us_accidents_rf_small_spark

--- Spark ML (PySpark) Modellerinin Küçük Verideki Sonuçları ---
Model: LR, F1: 0.7862, Acc: 0.8532
Model: DT, F1: 0.8246, Acc: 0.8631
Model: RF, F1: 0.8104, Acc: 0.8609

✨ Küçük veride en iyi performans gösteren Spark ML modeli: DT

⚙️ Final **DT** eğitimi tüm `train` verisi üzerinde…
💾 Final DT modeli kaydedildi → models/us_accidents_dt_full_spark

📊 Final DT modelinin 'test' verisi üze

In [1]:
# %% [python]
# 1) Spark ve Kütüphaneleri Başlat (Aynı kalır)
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when, rand,
    hour, dayofweek, month, year # Yeni tarih/saat fonksiyonları
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier,
    RandomForestClassifier, GBTClassifier # GBTClassifier'ı OneVsRest ile deneyeceğiz
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator # CrossValidator eklendi
from pyspark.ml.classification import OneVsRest # OneVsRest eklendi
import os

spark = SparkSession.builder \
    .appName("US Accidents Severity Experiments - Improved") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .config("spark.driver.maxResultSize","2g") \
    .getOrCreate()

# %% [python]
# 2) Veri Hazırlığı (Yeni Özellik Mühendisliği Dahil)
# 2.1) CSV'yi oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.2) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.3) Start/End → Duration hesapla ve Yeni Tarih/Saat Özellikleri Oluştur
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    # Yeni tarih/saat özellikleri
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month", month(col("Start_TS")))
    .withColumn("Year", year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.4) City/Street cardinality düşür
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.5) Sütunları seç (Yeni özellikler dahil)
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned",
    "HourOfDay", "DayOfWeek", "Month", "Year" # Yeni eklenen özellikler
]
df_selected = df3.select(*selected_cols)

# 2.6) Kategorik ve feature listeleri (Yeni özellikler dahil)
# Unutmayın: Spark ML StringIndexer'lar sadece string veya numerik (integer, double) sütunları alır.
# 'HourOfDay', 'DayOfWeek', 'Month', 'Year' gibi sütunlar numerik olarak zaten uygun.
# Ancak bu sütunların kategorik olarak muamele görmesini istiyorsanız, onları da StringIndexer'dan geçirmelisiniz.
# Bu örnekte, onları numerik olarak bırakıp doğrudan assembler'a vereceğim.
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]

numerical_cols_for_assembler = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction", "Duration", # Mevcut numerik özellikler
    "HourOfDay", "DayOfWeek", "Month", "Year" # Yeni numerik özellikler
]

feature_cols_for_assembler = numerical_cols_for_assembler + [c + "_Idx" for c in categorical_cols]


# 2.7) NA at ve cache
df_no_na = df_selected.dropna().cache()
print(f"✅ df_no_na hazır: {df_no_na.count():,} kayıt, {len(df_no_na.columns)} sütun")

# %% [python]
# 3) Train/Test Bölme ve Küçük Alt Küme Oluşturma (Aynı kalır)
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")

small = train.sample(fraction=0.2, seed=42)
small_train, small_val = small.randomSplit([0.8,0.2], seed=42)
print(f"🔄 small_train: {small_train.count():,}, small_val: {small_val.count():,}")

# %% [python]
# 4) Spark ML Modelleri Tanımlama ve Küçük Veride Deneme (Genişletilmiş Hiperparametreler)
# Label indexer: Severity'yi 0'dan başlayan tamsayı indekslere dönüştürecek
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")
label_col_name_for_spark_models = "label"

# Kategorik özellikler için indexer'lar
cat_indexers = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
                for c in categorical_cols]
# Tüm özellikleri tek bir vektörde birleştiren assembler
assembler = VectorAssembler(inputCols=feature_cols_for_assembler, outputCol="features", handleInvalid="skip")

# Değerlendirme metrikleri
f1_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="accuracy"
)

✅ df_no_na hazır: 5,217,919 kayıt, 22 sütun
🔀 Bölme → train: 4,175,117, test: 1,042,802
🔄 small_train: 668,070, small_val: 167,401


In [3]:
# Spark ML Modellerinin Tanımlanması
# GBTClassifier'ı OneVsRest ile çoklu sınıflandırma için ekleyelim
gbt = GBTClassifier(labelCol=label_col_name_for_spark_models, featuresCol="features", maxIter=20, maxBins=256)
ovr = OneVsRest(classifier=gbt, labelCol=label_col_name_for_spark_models, featuresCol="features")

models = {
    "LR": LogisticRegression(labelCol=label_col_name_for_spark_models, featuresCol="features", maxIter=50), # MaxIter artırıldı
    "DT": DecisionTreeClassifier(labelCol=label_col_name_for_spark_models, featuresCol="features", maxBins=256),
    "RF": RandomForestClassifier(labelCol=label_col_name_for_spark_models, featuresCol="features", numTrees=100, maxBins=256), # numTrees artırıldı
}

# Hiperparametre ızgarası (Daha geniş ve fazla kombinasyonlar)
grids = {
    "LR": ParamGridBuilder() \
        .addGrid(models['LR'].regParam, [0.001, 0.01, 0.1]) \
        .addGrid(models['LR'].elasticNetParam, [0.0, 0.5, 1.0]) \
        .build(),
    "DT": ParamGridBuilder() \
        .addGrid(models['DT'].maxDepth, [5, 10, 15]) \
        .addGrid(models['DT'].minInstancesPerNode, [1, 5]) \
        .build(),
    "RF": ParamGridBuilder() \
        .addGrid(models['RF'].numTrees, [50, 100, 150]) \
        .addGrid(models['RF'].maxDepth, [10, 15, 20]) \
        .build()
}

# Her model için küçük veriyle prova (CrossValidator kullanıyoruz)
results_spark_ml = []
for name, clf in models.items():
    print(f"\n⚙️ **{name}** modeli eğitimi (küçük PySpark veri, CrossValidation)...")
    
    pipe_stages = [label_indexer] + cat_indexers + [assembler, clf]
    pipe = Pipeline(stages=pipe_stages)
    
    # CrossValidator kullanımı: numFolds=3 veya 5 genellikle iyidir
    cv = CrossValidator(
        estimator=pipe,
        estimatorParamMaps=grids[name],
        evaluator=f1_eval, # Değerlendirme için F1 skoru
        numFolds=3, # 3 katlı çapraz doğrulama
        parallelism=1, # Spark 3.0+ ile paralel eğitim. CPU çekirdek sayınıza göre artırılabilir
        seed=42
    )
    cvModel = cv.fit(small_train)
    best = cvModel.bestModel
    
    # small_val üzerinde değerlendirme
    preds = best.transform(small_val)
    f1 = f1_eval.evaluate(preds)
    acc = acc_eval.evaluate(preds)
    
    print(f"🏆 **{name}** — small_val F1: {f1:.4f}, Acc: {acc:.4f}")
    
    # Modeli kaydet
    path = f"models/us_accidents_{name.lower()}_optimized_small_spark"
    best.write().overwrite().save(path)
    print(f"💾 **{name}** modeli kaydedildi → {path}")
    results_spark_ml.append((name, f1, acc))

# %% [python]
# 5) Sonuçların Karşılaştırılması ve En İyi Modelin Tüm Veride Eğitimi
print("\n--- Spark ML (PySpark) Modellerinin Küçük Verideki Sonuçları ---")
for res in results_spark_ml:
    print(f"Model: {res[0]}, F1: {res[1]:.4f}, Acc: {res[2]:.4f}")

# En iyi performansı gösteren modeli bulalım
if results_spark_ml:
    best_model_name = max(results_spark_ml, key=lambda item: item[1])[0] # F1 skoruna göre en iyisi
    print(f"\n✨ Küçük veride en iyi performans gösteren Spark ML modeli: {best_model_name}")

    # En iyi modelin orijinal (daha geniş) tanımlamasını al
    best_clf_full = models[best_model_name]

    print(f"\n⚙️ Final **{best_model_name}** eğitimi tüm `train` verisi üzerinde…")
    
    # Tüm train veri seti üzerinde eğitmek için yeni bir pipeline oluştur (parametreler aynı kalabilir)
    pipe_full_train = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, best_clf_full])
    
    # Tüm train veri seti üzerinde eğit
    model_full_train = pipe_full_train.fit(train) # df_no_na yerine 'train' kullanıyoruz, test için ayırdık
    
    # Modeli kaydet
    full_path = f"models/us_accidents_{best_model_name.lower()}_optimized_full_spark"
    model_full_train.write().overwrite().save(full_path)
    print(f"💾 Final {best_model_name} modeli kaydedildi → {full_path}")

    # Son olarak, eğitilmiş modeli 'test' veri seti üzerinde değerlendir
    print(f"\n📊 Final {best_model_name} modelinin 'test' verisi üzerinde değerlendirilmesi...")
    final_preds = model_full_train.transform(test)
    final_f1 = f1_eval.evaluate(final_preds)
    final_acc = acc_eval.evaluate(final_preds)
    print(f"🎉 Final {best_model_name} — Test Seti F1: {final_f1:.4f}, Acc: {final_acc:.4f}")
else:
    print("\n⚠️ Spark ML modelleri için sonuç bulunamadı.")

# Spark Session'ı durdur
spark.stop()
print("\nSpark Session durduruldu.")


⚙️ **LR** modeli eğitimi (küçük PySpark veri, CrossValidation)...
🏆 **LR** — small_val F1: 0.7944, Acc: 0.8519
💾 **LR** modeli kaydedildi → models/us_accidents_lr_optimized_small_spark

⚙️ **DT** modeli eğitimi (küçük PySpark veri, CrossValidation)...
🏆 **DT** — small_val F1: 0.8515, Acc: 0.8715
💾 **DT** modeli kaydedildi → models/us_accidents_dt_optimized_small_spark

⚙️ **RF** modeli eğitimi (küçük PySpark veri, CrossValidation)...


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aslay\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] Varolan bir bağlantı uzaktaki bir ana bilgisayar tarafından zorla kapatıldı

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j

ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [1]:
# %% [python]
# 1) Spark ve Kütüphaneleri Başlat
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when, rand,
    hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import os

# Spark Session'ı başlat ve bellek ayarlarını artır
spark = SparkSession.builder \
    .appName("US Accidents RF Experiment - PySpark MLlib") \
    .master("local[*]") \
    .config("spark.driver.memory","16g") \
    .config("spark.driver.maxResultSize","4g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

# %% [python]
# 2) Veri Hazırlığı
# 2.1) CSV'yi oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.2) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.3) Start/End → Duration hesapla ve Yeni Tarih/Saat Özellikleri Oluştur
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    # Yeni tarih/saat özellikleri
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month", month(col("Start_TS")))
    .withColumn("Year", year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.4) City/Street cardinality düşür
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.5) Sütunları seç
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned",
    "HourOfDay", "DayOfWeek", "Month", "Year"
]
df_selected = df3.select(*selected_cols)

# 2.6) Kategorik ve feature listeleri
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
numerical_cols_for_assembler = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction", "Duration",
    "HourOfDay", "DayOfWeek", "Month", "Year"
]

feature_cols_for_assembler = numerical_cols_for_assembler + [c + "_Idx" for c in categorical_cols]

# 2.7) NA at ve cache
df_no_na = df_selected.dropna().cache()
print(f"✅ df_no_na hazır: {df_no_na.count():,} kayıt, {len(df_no_na.columns)} sütun")

# %% [python]
# 3) Train/Test Bölme ve Küçük Alt Küme Oluşturma
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")

# Küçük alt küme (daha küçük bir örneklem deneyebiliriz)
small = train.sample(fraction=0.1, seed=42) # Örneklem oranını düşürdük
small_train, small_val = small.randomSplit([0.8,0.2], seed=42)
print(f"🔄 small_train: {small_train.count():,}, small_val: {small_val.count():,}")

# %% [python]
# 4) RandomForestClassifier Model Tanımlama ve Küçük Veride Deneme

# Label indexer: Severity'yi 0'dan başlayan tamsayı indekslere dönüştürecek
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")
label_col_name_for_spark_models = "label"

# Kategorik özellikler için indexer'lar
cat_indexers = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
                for c in categorical_cols]
# Tüm özellikleri tek bir vektörde birleştiren assembler
assembler = VectorAssembler(inputCols=feature_cols_for_assembler, outputCol="features", handleInvalid="skip")

# Değerlendirme metrikleri
f1_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol=label_col_name_for_spark_models, predictionCol="prediction", metricName="accuracy"
)

# RandomForestClassifier Modelinin Tanımlanması
rf_classifier = RandomForestClassifier(
    labelCol=label_col_name_for_spark_models,
    featuresCol="features",
    maxBins=256
)

# Hiperparametre ızgarası (RandomForest için ayarlanmış)
# numTrees ve maxDepth değerlerini daha düşük aralıklarla deniyoruz
grids_rf = ParamGridBuilder() \
    .addGrid(rf_classifier.numTrees, [50, 75]) \
    .addGrid(rf_classifier.maxDepth, [10, 15]) \
    .build()

print(f"\n⚙️ **RandomForestClassifier** modeli eğitimi (küçük PySpark veri, CrossValidation)...")

# Pipeline oluşturma
pipe_rf = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, rf_classifier])

# CrossValidator kullanımı
cv_rf = CrossValidator(
    estimator=pipe_rf,
    estimatorParamMaps=grids_rf,
    evaluator=f1_eval,
    numFolds=3, # 3 katlı çapraz doğrulama
    parallelism=1,
    seed=42
)
cvModel_rf = cv_rf.fit(small_train)
best_rf_model = cvModel_rf.bestModel

# small_val üzerinde değerlendirme
preds_rf = best_rf_model.transform(small_val)
f1_rf = f1_eval.evaluate(preds_rf)
acc_rf = acc_eval.evaluate(preds_rf)

print(f"🏆 **RandomForestClassifier** — small_val F1: {f1_rf:.4f}, Acc: {acc_rf:.4f}")

# Modeli kaydet
path_rf = f"models/us_accidents_rf_optimized_small_spark"
best_rf_model.write().overwrite().save(path_rf)
print(f"💾 **RandomForestClassifier** modeli kaydedildi → {path_rf}")


# %% [python]
# 5) RandomForestClassifier için Sonuçları Görüntüleme ve Tüm Veride Eğitme
print("\n--- RandomForestClassifier Modelinin Küçük Verideki Sonuçları ---")
print(f"Model: RandomForestClassifier, F1: {f1_rf:.4f}, Acc: {acc_rf:.4f}")

print(f"\n⚙️ Final **RandomForestClassifier** eğitimi tüm `train` verisi üzerinde…")

# En iyi modelin orijinal tanımlamasını al (CrossValidator'dan gelen en iyi parametrelerle)
# cvModel_rf.bestModel zaten en iyi parametrelerle eğitilmiş pipeline'dır.
# Bu pipeline'ı doğrudan tüm train verisi üzerinde eğitebiliriz.
# Ancak, RandomForestClassifier'ın kendisini alıp yeni bir pipeline oluşturmak daha temiz olabilir.
# best_rf_model.stages[-1] bize RandomForestClassifier modelini verir.
# Yeni bir RandomForestClassifier nesnesi oluşturup en iyi parametreleri set edebiliriz.
# Veya daha basiti, best_rf_model'ın kendisi zaten bir PipelineModel olduğu için,
# onu doğrudan yeni bir Pipeline'a koymak yerine, onun estimator'ını alıp yeni bir Pipeline oluşturabiliriz.

# En iyi RandomForestClassifier estimator'ını almak için:
# best_rf_estimator = best_rf_model.stages[-1] # Bu, eğitilmiş bir modeldir (RandomForestClassificationModel)

# Aslında, CrossValidator'ın fit ettiği `bestModel` zaten bir `PipelineModel`'dır.
# Bu `PipelineModel`'ın son aşaması, en iyi parametrelerle eğitilmiş `RandomForestClassificationModel`'dır.
# `PipelineModel`'ı doğrudan `train` üzerinde tekrar `fit` edemeyiz.
# Yapmamız gereken, `best_rf_model`'ın içindeki `RandomForestClassifier`'ın en iyi parametrelerini alıp,
# yeni bir `RandomForestClassifier` (Estimator) oluşturmak ve onu tüm `train` üzerinde eğitmektir.

# En iyi RF parametrelerini al
best_rf_params = best_rf_model.stages[-1].extractParamMap()

# Yeni bir RandomForestClassifier estimator'ı oluştur ve en iyi parametreleri ata
final_rf_estimator = RandomForestClassifier(
    labelCol=label_col_name_for_spark_models,
    featuresCol="features",
    maxBins=256
)
final_rf_estimator.setParams(**best_rf_params) # En iyi parametreleri set et

# Tüm train veri seti üzerinde eğitmek için yeni bir pipeline oluştur
pipe_full_train_rf = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, final_rf_estimator])

# Tüm train veri seti üzerinde eğit
model_full_train_rf = pipe_full_train_rf.fit(train)

# Modeli kaydet
full_path_rf = "models/us_accidents_rf_optimized_full_spark"
model_full_train_rf.write().overwrite().save(full_path_rf)
print(f"💾 Final RandomForestClassifier modeli kaydedildi → {full_path_rf}")

# Son olarak, eğitilmiş modeli 'test' veri seti üzerinde değerlendir
print(f"\n📊 Final RandomForestClassifier modelinin 'test' verisi üzerinde değerlendirilmesi...")
final_preds_rf = model_full_train_rf.transform(test)
final_f1_rf = f1_eval.evaluate(final_preds_rf)
final_acc_rf = acc_eval.evaluate(final_preds_rf)
print(f"🎉 Final RandomForestClassifier — Test Seti F1: {final_f1_rf:.4f}, Acc: {final_acc_rf:.4f}")

# Spark Session'ı durdur
spark.stop()
print("\nSpark Session durduruldu.")


✅ df_no_na hazır: 5,217,919 kayıt, 22 sütun
🔀 Bölme → train: 4,175,117, test: 1,042,802
🔄 small_train: 334,092, small_val: 83,858

⚙️ **RandomForestClassifier** modeli eğitimi (küçük PySpark veri, CrossValidation)...
🏆 **RandomForestClassifier** — small_val F1: 0.8401, Acc: 0.8737
💾 **RandomForestClassifier** modeli kaydedildi → models/us_accidents_rf_optimized_small_spark

--- RandomForestClassifier Modelinin Küçük Verideki Sonuçları ---
Model: RandomForestClassifier, F1: 0.8401, Acc: 0.8737

⚙️ Final **RandomForestClassifier** eğitimi tüm `train` verisi üzerinde…


TypeError: keywords must be strings

In [None]:
# %% [python]
# 5) RandomForestClassifier için Sonuçları Görüntüleme ve Tüm Veride Eğitme
print("\n--- RandomForestClassifier Modelinin Küçük Verideki Sonuçları ---")
print(f"Model: RandomForestClassifier, F1: {f1_rf:.4f}, Acc: {acc_rf:.4f}")

print(f"\n⚙️ Final **RandomForestClassifier** eğitimi tüm `train` verisi üzerinde…")

# En iyi RF parametrelerini al
# best_rf_model.stages[-1] bize eğitilmiş RandomForestClassificationModel'ı verir.
# extractParamMap() methodu Param objeleri içeren bir dictionary döndürür.
best_rf_params = best_rf_model.stages[-1].extractParamMap()

# TypeError: keywords must be strings hatasını çözmek için:
# Param objelerini string isimlerine dönüştürüyoruz.
# Param objelerinin 'name' özelliği string ismini verir.
string_keyed_rf_params = {param.name: value for param, value in best_rf_params.items()}


# Yeni bir RandomForestClassifier estimator'ı oluştur ve en iyi parametreleri ata
final_rf_estimator = RandomForestClassifier(
    labelCol=label_col_name_for_spark_models,
    featuresCol="features",
    maxBins=256
)
# Parametreleri set ederken artık string anahtarlı dictionary kullanıyoruz.
final_rf_estimator.setParams(**string_keyed_rf_params)

# Tüm train veri seti üzerinde eğitmek için yeni bir pipeline oluştur
pipe_full_train_rf = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, final_rf_estimator])

# Tüm train veri seti üzerinde eğit
model_full_train_rf = pipe_full_train_rf.fit(train)

# Modeli kaydet
full_path_rf = "models/us_accidents_rf_optimized_full_spark"
model_full_train_rf.write().overwrite().save(full_path_rf)
print(f"💾 Final RandomForestClassifier modeli kaydedildi → {full_path_rf}")

# Son olarak, eğitilmiş modeli 'test' veri seti üzerinde değerlendir
print(f"\n📊 Final RandomForestClassifier modelinin 'test' verisi üzerinde değerlendirilmesi...")
final_preds_rf = model_full_train_rf.transform(test)
final_f1_rf = f1_eval.evaluate(final_preds_rf)
final_acc_rf = acc_eval.evaluate(final_preds_rf)
print(f"🎉 Final RandomForestClassifier — Test Seti F1: {final_f1_rf:.4f}, Acc: {final_acc_rf:.4f}")

# Spark Session'ı durdur
spark.stop()
print("\nSpark Session durduruldu.")


--- RandomForestClassifier Modelinin Küçük Verideki Sonuçları ---
Model: RandomForestClassifier, F1: 0.8401, Acc: 0.8737

⚙️ Final **RandomForestClassifier** eğitimi tüm `train` verisi üzerinde…
💾 Final RandomForestClassifier modeli kaydedildi → models/us_accidents_rf_optimized_full_spark

📊 Final RandomForestClassifier modelinin 'test' verisi üzerinde değerlendirilmesi...


In [11]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [20]:
# ---------------------------------------------
# MLP on small split – full standalone cell
# ---------------------------------------------

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature     import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation    import MulticlassClassificationEvaluator
from pyspark.ml.tuning        import TrainValidationSplit, ParamGridBuilder
from pyspark.ml                import Pipeline

# assume you already have:
#   spark       = SparkSession.builder.getOrCreate()
#   df_no_na    = your cleaned-and-NA-dropped DataFrame
#   small_train, small_val = your 80/20 split of a 20% sample of train

# 1) re-define your feature / categorical lists
numeric_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)",
    "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)",
    "Wind_Chill(F)", "Traffic_Signal"
]

categorical_cols = [
    "Weather_Condition", "Wind_Direction",
    "Civil_Twilight", "Sunrise_Sunset",
    "State", "City_Cleaned", "Street_Cleaned"
]

# build feature_cols for assembler
feature_cols = numeric_cols + [c + "_Idx" for c in categorical_cols]

# 2) StringIndexers for label + categories
label_indexer = StringIndexer(
    inputCol="Severity", outputCol="label", handleInvalid="keep"
)
cat_indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Idx", handleInvalid="keep")
    for c in categorical_cols
]

# 3) Vector assembler
assembler = VectorAssembler(
    inputCols=feature_cols, outputCol="features", handleInvalid="skip"
)

# 4) Layers: [input_dim, hidden1, hidden2, output_dim]
num_classes = df_no_na.select("Severity").distinct().count()
layers = [ len(feature_cols), 64, 32, num_classes ]

# 5) define the MLP
mlp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=42
)

# 6) param grid
paramGrid_mlp = (ParamGridBuilder()
    .addGrid(mlp.maxIter,  [50, 100])
    .addGrid(mlp.stepSize, [0.03, 0.1])
    .build()
)

# 7) pipeline
pipe_mlp = Pipeline(stages=[
    label_indexer,
    *cat_indexers,
    assembler,
    mlp
])

# 8) evaluator & TVS
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)

tvs_mlp = TrainValidationSplit(
    estimator          = pipe_mlp,
    estimatorParamMaps = paramGrid_mlp,
    evaluator          = evaluator,
    trainRatio         = 0.8,
    parallelism        = 1
)

# 9) fit & eval
print("⚙️ MLP-TVSplit (küçük veri) eğitimi başlıyor…")
tvsModel_mlp = tvs_mlp.fit(small_train)
best_mlp    = tvsModel_mlp.bestModel

preds_mlp = best_mlp.transform(small_val)
f1_mlp  = evaluator.evaluate(preds_mlp)
acc_mlp = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
).evaluate(preds_mlp)

print(f"🏆 MLP — small_val F1: {f1_mlp:.4f}, Acc: {acc_mlp:.4f}")

# 10) save
best_mlp.write().overwrite().save("models/us_accidents_mlp_small")
print("✅ MLP small-model kaydedildi → models/us_accidents_mlp_small")


⚙️ MLP-TVSplit (küçük veri) eğitimi başlıyor…
🏆 MLP — small_val F1: 0.7860, Acc: 0.8535
✅ MLP small-model kaydedildi → models/us_accidents_mlp_small


In [None]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when, hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession
spark = SparkSession.builder \
    .appName("Final LR Full Train") \
    .master("local[*]") \
    .config("spark.driver.memory","16g") \
    .config("spark.driver.maxResultSize","4g") \
    .getOrCreate()

# 2) En iyi küçük-model LR stage’ını yükle
small_stage_path = "models/us_accidents_lr_optimized_small_spark/stages/09_LogisticRegression_e69b6dea7a0b"
best_lr_small    = LogisticRegressionModel.load(small_stage_path)
best_reg         = best_lr_small.getRegParam()
best_enet        = best_lr_small.getElasticNetParam()
print(f"🔍 Yüklenen küçük-model LR parametreleri → regParam={best_reg}, elasticNetParam={best_enet}")

# 3) Veri Hazırlığı (notebook’unuzdaki adımları aynen buraya yapıştırın)
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)
# → drop, Duration, date‐time eklemeleri, clean_column, selected_cols vs.
# … (aynı selected_cols, categorical_cols, numerical_cols tanımlarınız)
df_no_na = df_selected.dropna().cache()

# 4) Train/Test böl ve sayaç yazdır
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Train/Test → {train.count():,}/{test.count():,}")

# 5) Pipeline bileşenleri
label_indexer = StringIndexer(inputCol="Severity", outputCol="label", handleInvalid="keep")
cat_indexers  = [StringIndexer(inputCol=c, outputCol=c+"_Idx", handleInvalid="keep")
                 for c in categorical_cols]
assembler      = VectorAssembler(
    inputCols=numerical_cols_for_assembler + [c+"_Idx" for c in categorical_cols],
    outputCol="features",
    handleInvalid="skip"
)

# 6) Final LogisticRegression Estimator (küçük modelden gelen parametrelerle)
lr_final = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=20,
    regParam=best_reg,
    elasticNetParam=best_enet
)

pipe_final = Pipeline(stages=[label_indexer] + cat_indexers + [assembler, lr_final])

# 7) Tam train set ile eğit
model_full_lr = pipe_final.fit(train)

# 8) Test set üzerinde değerlendir
f1_eval  = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
acc_eval = f1_eval.copy({f1_eval.metricName: "accuracy"})

preds     = model_full_lr.transform(test)
f1_test   = f1_eval.evaluate(preds)
acc_test  = acc_eval.evaluate(preds)
print(f"🎉 Final LR — Test F1: {f1_test:.4f}, Acc: {acc_test:.4f}")

# 9) Modeli kaydet
output_path = "models/us_accidents_lr_final_full_spark"
model_full_lr.write().overwrite().save(output_path)
print(f"💾 Final LR modeli kaydedildi → {output_path}")

spark.stop()


🔍 Yüklenen küçük-model LR parametreleri → regParam=0.001, elasticNetParam=0.0
🔀 Train/Test → 4,175,117/1,042,802
🎉 Final LR — Test F1: 0.7921, Acc: 0.8505
💾 Final LR modeli kaydedildi → models/us_accidents_lr_final_full_spark


In [1]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    DecisionTreeClassificationModel
)
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1) SparkSession'ı başlat (daha büyük heap + maxResultSize)
spark = SparkSession.builder \
    .appName("US Accidents DT Final Full Train") \
    .master("local[*]") \
    .config("spark.driver.memory",        "16g") \
    .config("spark.driver.maxResultSize", "8g") \
    .getOrCreate()

# 2) Veri Hazırlığı
# 2.1) CSV oku
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.2) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.3) Zaman sütunlarını kullanarak Duration ve ek tarih/saat özellikleri oluştur
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",     month(col("Start_TS")))
    .withColumn("Year",      year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.4) City/Street cardinality düşür
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.5) Modelde kullanacağımız sütunları seç
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State",
    "City_Cleaned", "Street_Cleaned",
    "HourOfDay", "DayOfWeek", "Month", "Year"
]
df_selected = df3.select(*selected_cols)

# 2.6) Kategorik ve sayısal sütun listelerini hazırla
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
numerical_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration","HourOfDay","DayOfWeek","Month","Year"
]
feature_cols = numerical_cols + [c + "_Idx" for c in categorical_cols]

# 2.7) Eksikleri at ve cache
df_no_na = df_selected.dropna().cache()
print(f"✅ Hazır veri: {df_no_na.count():,} satır, {len(df_no_na.columns)} sütun")

# 3) Train/Test bölme
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")



✅ Hazır veri: 5,217,919 satır, 22 sütun
🔀 Bölme → train: 4,175,117, test: 1,042,802


In [2]:
# 4) Küçük-modelden en iyi DecisionTree parametrelerini yükle
small_dt_stage = (
    "models/us_accidents_dt_optimized_small_spark"
    "/stages/09_DecisionTreeClassifier_f9b588c68bd3"
)
dt_small = DecisionTreeClassificationModel.load(small_dt_stage)
best_depth = dt_small.getMaxDepth()
# small-model'de de 32 kullanılmıştı; burada de 32'yi sabitliyoruz:
best_bins  = 256
print(f"🔍 Küçük modelden gelen parametreler → maxDepth={best_depth}, maxBins={best_bins}")

# 5) Pipeline bileşenleri ve final DecisionTree tanımı
label_indexer = StringIndexer(
    inputCol="Severity", outputCol="label", handleInvalid="keep"
)
cat_indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Idx", handleInvalid="keep")
    for c in categorical_cols
]
assembler = VectorAssembler(
    inputCols=feature_cols, outputCol="features", handleInvalid="skip"
)
dt_final = DecisionTreeClassifier(
    labelCol="label", featuresCol="features",
    maxDepth=best_depth, maxBins=best_bins
)

pipe_full = Pipeline(stages=[
    label_indexer
] + cat_indexers + [
    assembler,
    dt_final
])

# 6) Final modeli tüm train üzerinde eğit
print("⚙️ Final DT eğitimi tüm train üzerinde başlıyor…")
model_full = pipe_full.fit(train)

# 7) Test üzerinde değerlendir
f1_eval  = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
preds = model_full.transform(test)
print(f"🎉 Final DT — Test F1: {f1_eval.evaluate(preds):.4f}, "
      f"Acc: {acc_eval.evaluate(preds):.4f}")

# 8) Modeli kaydet
out_path = "models/us_accidents_dt_final_full_spark"
model_full.write().overwrite().save(out_path)
print(f"💾 Final DT modeli kaydedildi → {out_path}")

spark.stop()


🔍 Küçük modelden gelen parametreler → maxDepth=15, maxBins=256
⚙️ Final DT eğitimi tüm train üzerinde başlıyor…
🎉 Final DT — Test F1: 0.8586, Acc: 0.8781
💾 Final DT modeli kaydedildi → models/us_accidents_dt_final_full_spark


In [2]:
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1) SparkSession başlat
spark = (SparkSession.builder
         .appName("Final_DT_Train_And_Save")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory","4g")
         .getOrCreate())

# --- (Önceki adımlar: train/test, küçük-model’den depth/bins alımı) ---
small_dt_stage = (
    "models/us_accidents_dt_optimized_small_spark"
    "/stages/09_DecisionTreeClassifier_f9b588c68bd3"
)
dt_small   = DecisionTreeClassificationModel.load(small_dt_stage)
best_depth = dt_small.getMaxDepth()
best_bins  = 256

# label ve categorical indexer’lar
label_indexer = StringIndexer(
    inputCol="Severity", outputCol="label", handleInvalid="keep"
)
cat_indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Idx", handleInvalid="keep")
    for c in categorical_cols
]

# assembler & final DT
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)
dt_final = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=best_depth,
    maxBins=best_bins
)

pipe_full = Pipeline(stages=[
    label_indexer,
    *cat_indexers,
    assembler,
    dt_final
])

# 6) Final modeli tüm train üzerinde eğit
print("⚙️ Final DT eğitimi tüm train üzerinde başlıyor…")
model_full = pipe_full.fit(train)

# — Yeni ekleme: tip kontrolü —
print(f">> model_full tipi: {type(model_full)}")
assert isinstance(model_full, PipelineModel), "model_full bir PipelineModel değil!"

# 7) Test üzerinde değerlendir
f1_eval  = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
preds = model_full.transform(test)
print(f"🎉 Final DT — Test F1: {f1_eval.evaluate(preds):.4f}, "
      f"Acc: {acc_eval.evaluate(preds):.4f}")

# 8) PipelineModel olarak kaydet
out_path = "models/us_accidents_dt_final_last_full_spark"
model_full.write().overwrite().save(out_path)
print(f"💾 Final DT modeli kaydedildi → {out_path}")

# 9) (Kullanım örneği)
# loaded = PipelineModel.load(out_path)
# new_preds = loaded.transform(new_dataframe)

spark.stop()


⚙️ Final DT eğitimi tüm train üzerinde başlıyor…
>> model_full tipi: <class 'pyspark.ml.pipeline.PipelineModel'>
🎉 Final DT — Test F1: 0.8589, Acc: 0.8781
💾 Final DT modeli kaydedildi → models/us_accidents_dt_final_last_full_spark


In [5]:
# -*- coding: utf-8 -*-
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, to_timestamp, unix_timestamp,
    hour, dayofweek
)
from pyspark.sql.types import DoubleType
from pyspark.ml import PipelineModel

# 0) Ensure Spark uses same Python interpreter (avoids worker connect errors)
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# 1) Start SparkSession
spark = (
    SparkSession.builder
    .appName("Accident Severity Batch Predict")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    .getOrCreate()
)

# 2) Load the saved PipelineModel
model_path = "models/us_accidents_dt_final_last_full_spark"
abs_path = os.path.abspath(model_path)
if not os.path.isdir(abs_path):
    raise FileNotFoundError(f"PipelineModel folder not found: {abs_path}")
print(f"[DEBUG] Loading PipelineModel from: {abs_path}")
pipeline_model = PipelineModel.load("models/us_accidents_dt_final_last_full_spark")

# 3) Define helper for cardinality reduction
def clean_column(df, col_name, top_n=32):
    top_vals = [
        row[col_name]
        for row in (
            df.groupBy(col_name)
              .count()
              .orderBy(col("count").desc())
              .limit(top_n)
              .collect()
        )
    ]
    out_col = f"{col_name}_Cleaned"
    return df.withColumn(
        out_col,
        when(col(col_name).isin(top_vals), col(col_name)).otherwise("Other")
    )

# 4) Define preprocessing function
def prepare_data(df):
    # parse timestamps & compute Duration (minutes)
    df = (
        df
        .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
        .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
        .withColumn("Duration",
            ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
            .cast(DoubleType())
        )
        .drop("Start_Time", "End_Time", "Start_TS", "End_TS")
    )
    # extract time features
    df = df.withColumn("hour", hour(col("Start_TS")))
    df = df.withColumn("weekday", dayofweek(col("Start_TS")))
    # reduce cardinality
    df = clean_column(df, "City", top_n=32)
    df = clean_column(df, "Street", top_n=32)
    # drop rows with missing numeric
    numeric_cols = [
        "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
        "Duration", "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)"
    ]
    df = df.na.drop(subset=numeric_cols)
    # fill categorical missing
    df = df.na.fill({
        "Weather_Condition": "Other",
        "Wind_Direction":     "Other",
        "Civil_Twilight":     "Other",
        "Sunrise_Sunset":     "Other",
        "State":              "Other",
        "Traffic_Signal":     False,
        "Junction":           False
    })
    # select exactly pipeline inputs
    pipeline_cols = (
        numeric_cols
        + ["Traffic_Signal", "Junction", "hour", "weekday"]
        + ["City_Cleaned", "Street_Cleaned"]
        + ["Weather_Condition", "Wind_Direction",
           "Civil_Twilight", "Sunrise_Sunset", "State"]
    )
    return df.select(*pipeline_cols)

# 5) Your new data as list of dicts
veri = [
    {
        "Start_Time": "2023-12-10 07:00:00",
        "End_Time":   "2023-12-10 07:45:00",
        "Temperature(F)": 15.0,
        "Humidity(%)":    85.0,
        "Pressure(in)":   29.5,
        "Visibility(mi)": 2.0,
        "Wind_Speed(mph)":20.0,
        "Precipitation(in)":0.4,
        "Weather_Condition":"Snow",
        "Wind_Direction": "N",
        "Civil_Twilight":"Night",
        "Sunrise_Sunset":"Night",
        "State":"IL",
        "Junction": True,
        "Traffic_Signal": False,
        "Crossing": True,
        "City":"Chicago",
        "Street":"W Adams St",
        "Wind_Chill(F)":10.0
    },
    {
        "Start_Time": "2024-03-20 18:30:00",
        "End_Time":   "2024-03-20 18:40:00",
        "Temperature(F)": 85.0,
        "Humidity(%)":    30.0,
        "Pressure(in)":   30.2,
        "Visibility(mi)": 10.0,
        "Wind_Speed(mph)":3.0,
        "Precipitation(in)":0.0,
        "Weather_Condition":"Clear",
        "Wind_Direction": "SE",
        "Civil_Twilight":"Day",
        "Sunrise_Sunset":"Day",
        "State":"TX",
        "Junction": False,
        "Traffic_Signal": True,
        "Crossing": False,
        "City":"Houston",
        "Street":"Main St",
        "Wind_Chill(F)":84.0
    },
    # add more rows as needed...
]

# 6) Create DataFrame, preprocess, predict
df_rows = spark.createDataFrame(veri)
df_prep = prepare_data(df_rows)
preds = pipeline_model.transform(df_prep)

# 7) Show desired columns
preds.select("City_Cleaned", "Street_Cleaned", "prediction").show(truncate=False)

# 8) Stop Spark
spark.stop()


[DEBUG] Loading PipelineModel from: c:\Users\aslay\Desktop\BİL 401\project\models\us_accidents_dt_final_last_full_spark


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 3) (MSI executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 17 more


In [2]:
# %% [python]
# 1) Findspark ve SparkSession Başlat
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    RandomForestClassificationModel,
    RandomForestClassifier
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import os

spark = SparkSession.builder \
    .appName("US Accidents RF Final Training") \
    .master("local[*]") \
    .config("spark.driver.memory", "16g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

# 2) Veri Okuma ve Hazırlama
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları çıkar
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Tarih/Saat'ten Duration ve bölgesel feature'lar
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy‑MM‑dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy‑MM‑dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay",  hour(col("Start_TS")))
    .withColumn("DayOfWeek",  dayofweek(col("Start_TS")))
    .withColumn("Month",      month(col("Start_TS")))
    .withColumn("Year",       year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme fonksiyonu
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.4) İstenen sütunları seç
selected_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)",
    "Traffic_Signal","Weather_Condition","Wind_Direction","Junction",
    "Duration","Severity","Civil_Twilight","Sunrise_Sunset","State",
    "City_Cleaned","Street_Cleaned","HourOfDay","DayOfWeek","Month","Year"
]
df_selected = df3.select(*selected_cols)

# 2.5) Kategorik ve sayısal liste
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
numerical_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)",
    "Traffic_Signal","Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
]
feature_cols = numerical_cols + [c + "_Idx" for c in categorical_cols]

df_no_na = df_selected.dropna().cache()
print(f"✅ Hazır veri: {df_no_na.count():,} kayıt")

# 3) Train/Test Bölme
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)
print(f"🔀 Bölme → train: {train.count():,}, test: {test.count():,}")




✅ Hazır veri: 5,217,919 kayıt
🔀 Bölme → train: 4,175,117, test: 1,042,802


In [None]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    RandomForestClassifier,
    RandomForestClassificationModel
)
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1) Spark session
spark = SparkSession.builder \
    .appName("US Accidents RF Final Training") \
    .master("local[*]") \
    .config("spark.driver.memory","16g") \
    .config("spark.driver.maxResultSize","4g") \
    .getOrCreate()

# 2) Read & prep data
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.1) Drop unneeded cols
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Timestamps → Duration + time features
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay",  hour(col("Start_TS")))
    .withColumn("DayOfWeek",  dayofweek(col("Start_TS")))
    .withColumn("Month",      month(col("Start_TS")))
    .withColumn("Year",       year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) Cardinality‑reduce City/Street
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.4) Final select + drop NA
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition", "Wind_Direction", "Junction", "Duration", "Severity",
    "Civil_Twilight", "Sunrise_Sunset", "State", "City_Cleaned", "Street_Cleaned",
    "HourOfDay", "DayOfWeek", "Month", "Year"
]
df_selected = df3.select(*selected_cols)
df_no_na = df_selected.dropna().cache()

# 3) Train/test split
train, test = df_no_na.randomSplit([0.8,0.2], seed=42)



🔍 Small‑model RF params → numTrees=75, maxDepth=<bound method _DecisionTreeParams.getMaxDepth of RandomForestClassificationModel: uid=RandomForestClassifier_328cc66430e7, numTrees=75, numClasses=5, numFeatures=21>, maxBins=<bound method _DecisionTreeParams.getMaxBins of RandomForestClassificationModel: uid=RandomForestClassifier_328cc66430e7, numTrees=75, numClasses=5, numFeatures=21>


TypeError: Invalid param value given for param "maxDepth". Could not convert <bound method _DecisionTreeParams.getMaxDepth of RandomForestClassificationModel: uid=RandomForestClassifier_328cc66430e7, numTrees=75, numClasses=5, numFeatures=21> to int

In [None]:
# 4) Load tuned small‑model RF stage
stage_path = (
    "models/us_accidents_rf_optimized_small_spark/"
    "stages/09_RandomForestClassifier_328cc66430e7"
)
rf_small = RandomForestClassificationModel.load(stage_path)

# **NOTE: getters are properties in Spark 3.x, no ()**
n_trees = rf_small.getNumTrees
m_depth = rf_small.getMaxDepth()
m_bins  = rf_small.getMaxBins()

print(f"🔍 Small‑model RF params → numTrees={n_trees}, maxDepth={m_depth}, maxBins={m_bins}")

# 5) Build final‑train pipeline
categorical_cols = [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]
numerical_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration","HourOfDay","DayOfWeek","Month","Year"
]
feature_cols = numerical_cols + [c + "_Idx" for c in categorical_cols]

label_indexer = StringIndexer(
    inputCol="Severity", outputCol="label", handleInvalid="keep"
)
cat_indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Idx", handleInvalid="keep")
    for c in categorical_cols
]
assembler = VectorAssembler(
    inputCols=feature_cols, outputCol="features", handleInvalid="skip"
)


rf_final = RandomForestClassifier(
    labelCol="label", featuresCol="features",
    numTrees=n_trees, maxDepth=m_depth, maxBins=m_bins
)

pipe_full = Pipeline(stages=[
    label_indexer, *cat_indexers, assembler, rf_final
])

# 6) Fit on full train
print("⚙️ Final RF training on all data…")
model_full = pipe_full.fit(train)

# 7) Evaluate on test
f1_eval  = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
preds = model_full.transform(test)


🔍 Small‑model RF params → numTrees=75, maxDepth=15, maxBins=256
⚙️ Final RF training on all data…


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aslay\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] Varolan bir bağlantı uzaktaki bir ana bilgisayar tarafından zorla kapatıldı

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j

Py4JError: An error occurred while calling o3169.evaluate

In [18]:
spark.stop()
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    RandomForestClassifier,
    RandomForestClassificationModel
)
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1) Spark session
spark = SparkSession.builder \
    .appName("US Accidents RF Final Training") \
    .master("local[*]") \
    .config("spark.driver.memory","16g") \
    .config("spark.driver.maxResultSize","4g") \
    .getOrCreate()# --- 1) Evaluate on test set

f1 = f1_eval.evaluate(preds)
acc = acc_eval.evaluate(preds)
print(f"🎉 Final RF — Test F1: {f1:.4f}, Acc: {acc:.4f}")

# --- 2) Save the final model
out_path = "models/us_accidents_rf_final_full_spark"
model_full.write().overwrite().save(out_path)
print(f"💾 Final RF modeli kaydedildi → {out_path}")

# --- 3) Stop Spark


ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [4]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp,
    hour, dayofweek, month, year,
    when, monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from functools import reduce

# ------------------------------------------------------------------------------
# 1) SparkSession başlat
# ------------------------------------------------------------------------------
spark = SparkSession.builder \
    .appName("US Accidents Batch Predict 100 Rows") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

In [29]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict 100 Rows (full)") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()

# 2) df_no_na’yı baştan oluştur
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Süre & tarih/saat özellikleri
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

from pyspark.sql.functions import col

df3 = df3.withColumn("City_Cleaned", col("City")) \
         .withColumn("Street_Cleaned", col("Street"))

# 2.4) İlgili sütunları seç ve dropna
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity",
    "HourOfDay","DayOfWeek","Month","Year"
]
df_no_na = df3.select(*selected_cols).dropna().cache()

# 3) 100 satırlık örnek + row_id
sample100 = df_no_na.limit(1000) \
                   .withColumn("_row_id", monotonically_increasing_id())

# 4) Küçük‐model’den indexer aşamaları (00…07) yükle
base = "models/us_accidents_lr_optimized_small_spark/stages"
indexer_paths = [os.path.join(base, d) for d in sorted(os.listdir(base)) if d.startswith("0") and "StringIndexer" in d]
prep_indexers = [StringIndexerModel.load(p) for p in indexer_paths]

# 5) VectorAssembler’ı yeniden tanımla
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
] + [c + "_Idx" for c in [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# 6) Son aşama olarak final LR modelini yükle
lr_stage = "models/us_accidents_lr_final_full_spark/stages/09_LogisticRegression_ea33fdabd04f"
lr_final = LogisticRegressionModel.load(lr_stage)

# 7) Pipeline’ı oluştur ve tahmin yap
pipe_full = Pipeline(stages = prep_indexers + [assembler, lr_final])
predictions = pipe_full.fit(df_no_na).transform(sample100) \
                       .select("_row_id","Severity","prediction","probability")

# 8) Sonuçları göster
predictions.show(1000, truncate=False)

spark.stop()


+-------+--------+----------+-----------------------------------------------------------------------------------------------------------+
|_row_id|Severity|prediction|probability                                                                                                |
+-------+--------+----------+-----------------------------------------------------------------------------------------------------------+
|0      |3       |1.0       |[0.048272784018670564,0.9501515670254781,0.0014535609699508748,1.0368506671203633E-4,1.8402919188492117E-5]|
|1      |3       |1.0       |[0.04940417982616065,0.9490594379549752,0.001432049193846695,8.52563833034802E-5,1.9076641713896903E-5]    |
|2      |3       |1.0       |[0.060468464507702054,0.9374907376514571,0.001890167188593261,1.272876968319723E-4,2.334295541596523E-5]   |
|3      |2       |1.0       |[0.13762110078704323,0.858517424614645,0.0029161651448025654,8.909143381787794E-4,5.439511533046569E-5]    |
|4      |2       |1.0       |[0.05

In [23]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict 100 Rows (full)") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()

# 2) df_no_na’yı baştan oluştur
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

In [25]:
df.head(1)

[Row(ID='A-1', Source='Source2', Severity=3, Start_Time=datetime.datetime(2016, 2, 8, 5, 46), End_Time=datetime.datetime(2016, 2, 8, 11, 0), Start_Lat=39.865147, Start_Lng=-84.058723, End_Lat=None, End_Lng=None, Distance(mi)=0.01, Description='Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4.', Street='I-70 E', City='Dayton', County='Montgomery', State='OH', Zipcode='45424', Country='US', Timezone='US/Eastern', Airport_Code='KFFO', Weather_Timestamp=datetime.datetime(2016, 2, 8, 5, 58), Temperature(F)=36.9, Wind_Chill(F)=None, Humidity(%)=91.0, Pressure(in)=29.68, Visibility(mi)=10.0, Wind_Direction='Calm', Wind_Speed(mph)=None, Precipitation(in)=0.02, Weather_Condition='Light Rain', Amenity=False, Bump=False, Crossing=False, Give_Way=False, Junction=False, No_Exit=False, Railway=False, Roundabout=False, Station=False, Stop=False, Traffic_Calming=False, Traffic_Signal=False, Turning_Loop=False, Sunrise_Sunset='Night', Civil_Twilight='Night', Nautical

In [1]:
import pandas as pd
dfa2 = pd.read_csv("US_Accidents_March23.csv")
df_read=dfa2.head(1000)
df_read
df_read.to_csv("a.csv", index=False)

In [6]:
dfa2.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [7]:
rows = [
    {
        "ID": "A-1",
        "Source": "Source2",
        "Severity": 3,
        "Start_Time": "2016-02-08 05:46:00",
        "End_Time": "2016-02-08 11:00:00",
        "Start_Lat": 39.865147,
        "Start_Lng": -84.058723,
        "End_Lat": 39.865147,
        "End_Lng": -84.058723,
        "Distance(mi)": 0.01,
        "Description": "Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4.",
        "Street": "I-70 E",
        "City": "Dayton",
        "County": "Montgomery",
        "State": "OH",
        "Zipcode": "45424",
        "Country": "US",
        "Timezone": "US/Eastern",
        "Airport_Code": "KFFO",
        "Weather_Timestamp": "2016-02-08 05:58:00",
        "Temperature(F)": 36.9,
        "Wind_Chill(F)": 36.9,
        "Humidity(%)": 91.0,
        "Pressure(in)": 29.68,
        "Visibility(mi)": 10.0,
        "Wind_Direction": "Calm",
        "Wind_Speed(mph)": 0.0,
        "Precipitation(in)": 0.02,
        "Weather_Condition": "Light Rain",
        "Amenity": False,
        "Bump": False,
        "Crossing": False,
        "Give_Way": False,
        "Junction": False,
        "No_Exit": False,
        "Railway": False,
        "Roundabout": False,
        "Station": False,
        "Stop": False,
        "Traffic_Calming": False,
        "Traffic_Signal": False,
        "Turning_Loop": False,
        "Sunrise_Sunset": "Night",
        "Civil_Twilight": "Night",
        "Nautical_Twilight": "Night",
        "Astronomical_Twilight": "Night"
    }
]

import pandas as pd

df = pd.DataFrame(rows)
df.to_csv("A-1.csv", index=False)


In [14]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict 100 Rows (full)") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()
rows = [{
    "Source": "Source2",
    "Severity": 3,
    "Start_Time": "2016-02-08 05:46:00",
    "End_Time":   "2016-02-08 11:00:00",
    "Start_Lat": 39.865147,
    "Start_Lng": -84.058723,
    "Distance(mi)": 0.01,
    "Description": ("Right lane blocked due to accident on I-70 Eastbound "
                    "at Exit 41 OH-235 State Route 4."),
    "Street": "I-70 E",
    "City": "Dayton",
    "County": "Montgomery",
    "State": "OH",
    "Zipcode": "45424",
    "Country": "US",
    "Timezone": "US/Eastern",
    "Airport_Code": "KFFO",
    "Weather_Timestamp": "2016-02-08 05:58:00",
    "Temperature(F)": 36.9,
    "Humidity(%)": 91.0,
    "Pressure(in)": 29.68,
    "Visibility(mi)": 10.0,
    "Wind_Direction": "Calm",
    "Precipitation(in)": 0.02,
    "Weather_Condition": "Light Rain",
    # all your boolean columns:
    "Amenity": False,
    "Bump": False,
    "Crossing": False,
    "Give_Way": False,
    "Junction": False,
    "No_Exit": False,
    "Railway": False,
    "Roadway": False,
    "Station": False,
    "Stop": False,
    "Traffic_Signal": False,
    "Turning_Loop": False,
    # twilight columns:
    "Sunrise_Sunset": "Night",
    "Civil_Twilight": "Night",
    "Nautical_Twilight": "Night",
    "Astronomical_Twilight": "Night",
}]


# 2) df_no_na’yı baştan oluştur
df = spark.read.csv("A-1.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Süre & tarih/saat özellikleri
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.4) İlgili sütunları seç ve dropna
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity",
    "HourOfDay","DayOfWeek","Month","Year"
]
df_no_na = df3.select(*selected_cols).dropna().cache()

# 3) 100 satırlık örnek + row_id
sample100 = df_no_na.limit(1000) \
                   .withColumn("_row_id", monotonically_increasing_id())

# 4) Küçük‐model’den indexer aşamaları (00…07) yükle
base = "models/us_accidents_dt_final_last_full_spark/stages"
indexer_paths = [os.path.join(base, d) for d in sorted(os.listdir(base)) if d.startswith("0") and "StringIndexer" in d]
prep_indexers = [StringIndexerModel.load(p) for p in indexer_paths]

# 5) VectorAssembler’ı yeniden tanımla
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
] + [c + "_Idx" for c in [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")
from pyspark.ml.classification import DecisionTreeClassificationModel
# 6) Son aşama olarak final LR modelini yükle
lr_stage = "models/us_accidents_dt_final_last_full_spark/stages/09_DecisionTreeClassifier_03f6571d1b4a"
dt_final = DecisionTreeClassificationModel.load(lr_stage)

# 7) Pipeline’ı oluştur ve tahmin yap
pipe_full = Pipeline(stages = prep_indexers + [assembler, dt_final])
predictions = pipe_full.fit(df_no_na).transform(sample100) \
                       .select("_row_id","Severity","prediction","probability")

# 8) Sonuçları göster
predictions.show(1000, truncate=False)

spark.stop()


+-------+--------+----------+---------------------+
|_row_id|Severity|prediction|probability          |
+-------+--------+----------+---------------------+
|0      |3       |0.0       |[1.0,0.0,0.0,0.0,0.0]|
+-------+--------+----------+---------------------+



In [None]:

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict 100 Rows (full)") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()
rows = [{
    "Source": "Source2",
    "Severity": 3,
    "Start_Time": "2016-02-08 05:46:00",
    "End_Time":   "2016-02-08 11:00:00",
    "Start_Lat": 39.865147,
    "Start_Lng": -84.058723,
    "Distance(mi)": 0.01,
    "Description": ("Right lane blocked due to accident on I-70 Eastbound "
                    "at Exit 41 OH-235 State Route 4."),
    "Street": "I-70 E",
    "City": "Dayton",
    "County": "Montgomery",
    "State": "OH",
    "Zipcode": "45424",
    "Country": "US",
    "Timezone": "US/Eastern",
    "Airport_Code": "KFFO",
    "Weather_Timestamp": "2016-02-08 05:58:00",
    "Temperature(F)": 36.9,
    "Humidity(%)": 91.0,
    "Pressure(in)": 29.68,
    "Visibility(mi)": 10.0,
    "Wind_Direction": "Calm",
    "Precipitation(in)": 0.02,
    "Weather_Condition": "Light Rain",
    # all your boolean columns:
    "Amenity": False,
    "Bump": False,
    "Crossing": False,
    "Give_Way": False,
    "Junction": False,
    "No_Exit": False,
    "Railway": False,
    "Roadway": False,
    "Station": False,
    "Stop": False,
    "Traffic_Signal": False,
    "Turning_Loop": False,
    # twilight columns:
    "Sunrise_Sunset": "Night",
    "Civil_Twilight": "Night",
    "Nautical_Twilight": "Night",
    "Astronomical_Twilight": "Night",
}]


# 2) df_no_na’yı baştan oluştur
df = spark.read.csv("A-1.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Süre & tarih/saat özellikleri
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.4) İlgili sütunları seç ve dropna
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity",
    "HourOfDay","DayOfWeek","Month","Year"
]
df_no_na = df3.select(*selected_cols).dropna().cache()

# 3) 100 satırlık örnek + row_id
sample100 = df_no_na.limit(1000) \
                   .withColumn("_row_id", monotonically_increasing_id())

# 4) Küçük‐model’den indexer aşamaları (00…07) yükle
base = "models/us_accidents_lr_optimized_small_spark/stages"
indexer_paths = [os.path.join(base, d) for d in sorted(os.listdir(base)) if d.startswith("0") and "StringIndexer" in d]
prep_indexers = [StringIndexerModel.load(p) for p in indexer_paths]

# 5) VectorAssembler’ı yeniden tanımla
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
] + [c + "_Idx" for c in [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# 6) Son aşama olarak final LR modelini yükle
lr_stage = "models/us_accidents_lr_final_full_spark/stages/09_LogisticRegression_ea33fdabd04f"
lr_final = LogisticRegressionModel.load(lr_stage)

# 7) Pipeline’ı oluştur ve tahmin yap
pipe_full = Pipeline(stages = prep_indexers + [assembler, lr_final])
predictions = pipe_full.fit(df_no_na).transform(sample100) \
                       .select("_row_id","Severity","prediction","probability")

# 8) Sonuçları göster
predictions.show(1000, truncate=False)

spark.stop()


+-------+--------+----------+----------------------------------------------------------------------------------------------------------+
|_row_id|Severity|prediction|probability                                                                                               |
+-------+--------+----------+----------------------------------------------------------------------------------------------------------+
|0      |3       |1.0       |[0.12174241227344809,0.872311702384795,0.005232305851078391,6.672239728104335E-4,4.6355517867870195E-5]   |
|1      |3       |1.0       |[0.12438972230370629,0.8698682314319333,0.0051463468667118754,5.4772628593413E-4,4.797311171448235E-5]    |
|2      |3       |1.0       |[0.060468464507702054,0.9374907376514571,0.001890167188593261,1.272876968319723E-4,2.334295541596523E-5]  |
|3      |2       |1.0       |[0.13762110078704323,0.858517424614645,0.0029161651448025654,8.909143381787794E-4,5.439511533046569E-5]   |
|4      |2       |1.0       |[0.128281263

In [15]:
# %% [python]
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict 100 Rows (full)") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()

# 2) df_no_na’yı baştan oluştur
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Süre & tarih/saat özellikleri
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme
def clean_column(df, column_name, top_n=32):
    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

for c in ["City","Street"]:
    df3 = clean_column(df3, c, top_n=32)

# 2.4) İlgili sütunları seç ve dropna
selected_cols = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity",
    "HourOfDay","DayOfWeek","Month","Year"
]
df_no_na = df3.select(*selected_cols).dropna().cache()

# 3) 100 satırlık örnek + row_id
sample100 = df_no_na.limit(1000) \
                   .withColumn("_row_id", monotonically_increasing_id())

# 4) Küçük‐model’den indexer aşamaları (00…07) yükle
base = "models/us_accidents_dt_final_last_full_spark/stages"
indexer_paths = [os.path.join(base, d) for d in sorted(os.listdir(base)) if d.startswith("0") and "StringIndexer" in d]
prep_indexers = [StringIndexerModel.load(p) for p in indexer_paths]

# 5) VectorAssembler’ı yeniden tanımla
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
] + [c + "_Idx" for c in [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned"
]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# 6) Son aşama olarak final LR modelini yükle
lr_stage = "models/us_accidents_lr_final_full_spark/stages/09_LogisticRegression_ea33fdabd04f"
lr_final = LogisticRegressionModel.load(lr_stage)

# 7) Pipeline’ı oluştur ve tahmin yap
pipe_full = Pipeline(stages = prep_indexers + [assembler, lr_final])
predictions = pipe_full.fit(df_no_na).transform(sample100) \
                       .select("_row_id","Severity","prediction","probability")

# 8) Sonuçları göster
predictions.show(1000, truncate=False)

spark.stop()


+-------+--------+----------+----------------------------------------------------------------------------------------------------------+
|_row_id|Severity|prediction|probability                                                                                               |
+-------+--------+----------+----------------------------------------------------------------------------------------------------------+
|0      |3       |1.0       |[0.17874585581159066,0.7960696227204713,0.024674170962156,4.4248753374946464E-4,6.786297203255087E-5]     |
|1      |3       |1.0       |[0.18241842617852375,0.7929082762697467,0.02424033565452272,3.628132249768405E-4,7.014867222988984E-5]    |
|2      |3       |1.0       |[0.09312441311369424,0.8974016595389032,0.009349538978597766,8.854343288822121E-5,3.584493591644712E-5]   |
|3      |2       |1.0       |[0.3973740108084888,0.5609034025443987,0.038601326465150694,0.002964842136159676,1.5641804580225833E-4]   |
|4      |2       |1.0       |[0.187739597

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
import os
from pyspark.sql import Row
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict Dynamic Columns") \
    .master("local[*]") \
    .config("spark.driver.memory","16g") \
    .getOrCreate()

# 2) df_no_na’yı baştan oluştur (model eğitimi için gerekli)
# Orijinal büyük veri setini okuma
# Lütfen "US_Accidents_March23.csv" dosyasının bu kodun çalıştığı dizinde olduğundan emin olun.
df = spark.read.csv("US_Accidents_March23.csv", header=True, inferSchema=True)

# 2.1) İstenmeyen sütunları düş
drop_cols = [
    "ID","Source","Zipcode","Timezone","Airport_Code","Amenity","Bump",
    "Give_Way","No_Exit","Railway","Description","County","Roundabout",
    "Station","Stop","Nautical_Twilight","Astronomical_Twilight","Country"
]
df2 = df.drop(*drop_cols)

# 2.2) Süre & tarih/saat özellikleri
df3 = (
    df2
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# 2.3) City/Street cardinality düşürme fonksiyonu
def clean_column(df_input, column_name, top_n=32, top_vals_cache=None):
    if top_vals_cache is None:
        print(f"'{column_name}' için top {top_n} değerler hesaplanıyor (Eğitim Verisi)...")
        top_vals = [
            r[column_name] for r in
            df_input.groupBy(column_name).count()
              .orderBy(col("count").desc())
              .limit(top_n)
              .collect()
        ]
        return df_input.withColumn(
            f"{column_name}_Cleaned",
            when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
        ), top_vals
    else:
        print(f"'{column_name}' için önbelleklenmiş top değerler kullanılıyor (Test Verisi)...")
        return df_input.withColumn(
            f"{column_name}_Cleaned",
            when(col(column_name).isin(top_vals_cache), col(column_name)).otherwise("Other")
        ), top_vals_cache

# Eğitim verisi için City/Street temizliği ve top değerleri önbellekleme
df3, city_top_vals = clean_column(df3, "City", top_n=32)
df3, street_top_vals = clean_column(df3, "Street", top_n=32)


# 2.4) İlgili sütunları seç ve dropna (Eğitim verisi için)
selected_cols_training = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity",
    "HourOfDay","DayOfWeek","Month","Year"
]
df_no_na = df3.select(*selected_cols_training).dropna().cache()


# --- Yeni Veri Yükleme ve Hazırlama Kısmı ---
# Örnek yeni veri oluşturma. Sütun adları Kaggle veri setindeki orijinal isimlerle eşleşiyor.
new_raw_data = [
    Row(ID="test1", Source="MapQuest", Zipcode="10001", Timezone="US/Eastern", Airport_Code="KJFK", Amenity=False, Bump=False, Give_Way=False, No_Exit=False, Railway=False, Description="Test description 1", County="New York", Roundabout=False, Station=False, Stop=False, Nautical_Twilight="Day", Astronomical_Twilight="Day", Country="US",
        Start_Time="2025-07-22 10:00:00", End_Time="2025-07-22 10:30:00",
        **{'Temperature(F)': 75.0, 'Humidity(%)': 60.0, 'Pressure(in)': 29.9, 'Visibility(mi)': 10.0,
           'Wind_Speed(mph)': 5.0, 'Precipitation(in)': 0.0, 'Wind_Chill(F)': 70.0},
        Traffic_Signal=True, Weather_Condition="Clear", Wind_Direction="NW", Civil_Twilight="Day", Sunrise_Sunset="Day",
        State="NY", City="New York", Street="Broadway", Junction=False, Severity=2,
        Start_Lat=40.7, Start_Lng=-74.0, End_Lat=40.7, End_Lng=-74.0),
    Row(ID="test2", Source="MapQuest", Zipcode="90210", Timezone="US/Pacific", Airport_Code="KLAX", Amenity=False, Bump=False, Give_Way=False, No_Exit=False, Railway=False, Description="Test description 2", County="Los Angeles", Roundabout=False, Station=False, Stop=False, Nautical_Twilight="Night", Astronomical_Twilight="Night", Country="US",
        Start_Time="2025-07-22 18:00:00", End_Time="2025-07-22 18:45:00",
        **{'Temperature(F)': 80.0, 'Humidity(%)': 50.0, 'Pressure(in)': 30.0, 'Visibility(mi)': 8.0,
           'Wind_Speed(mph)': 10.0, 'Precipitation(in)': 0.0, 'Wind_Chill(F)': 78.0},
        Traffic_Signal=False, Weather_Condition="Fair", Wind_Direction="SW", Civil_Twilight="Night", Sunrise_Sunset="Sunset",
        State="CA", City="Beverly Hills", Street="Wilshire Blvd", Junction=True, Severity=3,
        Start_Lat=34.0, Start_Lng=-118.2, End_Lat=34.0, End_Lng=-118.2),
    Row(ID="test3", Source="MapQuest", Zipcode="60601", Timezone="US/Central", Airport_Code="KORD", Amenity=True, Bump=False, Give_Way=False, No_Exit=False, Railway=False, Description="Test description 3", County="Cook", Roundabout=False, Station=True, Stop=False, Nautical_Twilight="Day", Astronomical_Twilight="Day", Country="US",
        Start_Time="2025-07-22 13:00:00", End_Time="2025-07-22 14:15:00",
        **{'Temperature(F)': 68.0, 'Humidity(%)': 70.0, 'Pressure(in)': 29.8, 'Visibility(mi)': 7.0,
           'Wind_Speed(mph)': 8.0, 'Precipitation(in)': 0.1, 'Wind_Chill(F)': 65.0},
        Traffic_Signal=True, Weather_Condition="Rain", Wind_Direction="E", Civil_Twilight="Day", Sunrise_Sunset="Day",
        State="IL", City="Chicago", Street="Michigan Ave", Junction=False, Severity=4,
        Start_Lat=41.8, Start_Lng=-87.6, End_Lat=41.8, End_Lng=-87.6)
]
new_data = spark.createDataFrame(new_raw_data)

# Yeni veriye de aynı dönüşümleri uygula (drop_cols, zaman özellikleri)
new_data_cleaned_step1 = new_data.drop(*drop_cols)
new_data_cleaned_step2 = (
    new_data_cleaned_step1
    .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Duration",
        ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
        .cast(DoubleType())
    )
    .withColumn("HourOfDay", hour(col("Start_TS")))
    .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
    .withColumn("Month",    month(col("Start_TS")))
    .withColumn("Year",     year(col("Start_TS")))
    .drop("Start_TS","End_TS","Start_Time","End_Time")
)

# Yeni test verisi için City/Street temizliği - BU SATIRLARI AKTİF HALE GETİRMELİSİNİZ!
new_data_cleaned_step2, _ = clean_column(new_data_cleaned_step2, "City", top_vals_cache=city_top_vals)
new_data_cleaned_step2, _ = clean_column(new_data_cleaned_step2, "Street", top_vals_cache=street_top_vals)


# Yeni veriden model için gerekli sütunları seç ve NaN değerleri düşür
selected_cols_prediction = [
    "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
    "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
    "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
    "State","City_Cleaned","Street_Cleaned","Junction","Duration","Severity", # City_Cleaned ve Street_Cleaned burada mevcut olmalı
    "HourOfDay","DayOfWeek","Month","Year"
]

# select işlemi için doğru sütun listesini kullanın
new_data_for_prediction = new_data_cleaned_step2.select(*selected_cols_prediction).dropna() \
                                                .withColumn("_row_id", monotonically_increasing_id())



SyntaxError: invalid syntax (3359811427.py, line 98)

In [19]:
# 4) Küçük‐model’den indexer aşamaları (00…07) yükle
base = "models/us_accidents_dt_final_last_full_spark/stages"
if not os.path.exists(base):
    print(f"HATA: Model aşamalarının bulunduğu '{base}' dizini bulunamadı. Lütfen model yolunu kontrol edin.")
    spark.stop()
    exit()

indexer_paths = [os.path.join(base, d) for d in sorted(os.listdir(base)) if d.startswith("0") and "StringIndexer" in d]
prep_indexers = [StringIndexerModel.load(p) for p in indexer_paths]

# 5) VectorAssembler’ı yeniden tanımla
feature_cols = [
    "Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)",
    "Wind_Speed(mph)","Precipitation(in)","Wind_Chill(F)","Traffic_Signal",
    "Junction","Duration",
    "HourOfDay","DayOfWeek","Month","Year"
] + [c + "_Idx" for c in [
    "Weather_Condition","Wind_Direction","Civil_Twilight",
    "Sunrise_Sunset","State","City_Cleaned","Street_Cleaned" # Bu sütunlar artık mevcut olacak
]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# 6) Son aşama olarak final LR modelini yükle
lr_stage = "models/us_accidents_lr_final_full_spark/stages/09_LogisticRegression_ea33fdabd04f"
if not os.path.exists(lr_stage):
    print(f"HATA: Logistic Regression modelinin bulunduğu '{lr_stage}' dizini bulunamadı. Lütfen model yolunu kontrol edin.")
    spark.stop()
    exit()
lr_final = LogisticRegressionModel.load(lr_stage)

# 7) Pipeline’ı oluştur ve tahmin yap
pipe_full = Pipeline(stages = prep_indexers + [assembler, lr_final])
print("\nPipeline eğitim verisi üzerinde fit ediliyor...")
model_fitted = pipe_full.fit(df_no_na)
print("Pipeline başarıyla fit edildi.")

print("\nYeni veri üzerinde tahminler yapılıyor...")
predictions = model_fitted.transform(new_data_for_prediction) \
                          .select("_row_id","Severity","prediction","probability")

# 8) Sonuçları göster
print("\nTahmin sonuçları:")
predictions.show(truncate=False)

spark.stop()
print("\nSpark oturumu durduruldu.")


Pipeline eğitim verisi üzerinde fit ediliyor...
Pipeline başarıyla fit edildi.

Yeni veri üzerinde tahminler yapılıyor...

Tahmin sonuçları:


Py4JJavaError: An error occurred while calling o4608.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 185.0 failed 1 times, most recent failure: Lost task 0.0 in stage 185.0 (TID 1214) (MSI executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 26 more


In [14]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, unix_timestamp, when,
    hour, dayofweek, month, year,
    monotonically_increasing_id
)
from pyspark.sql.types import DoubleType, StringType # StringType'ı da ekleyelim
from pyspark.ml.feature import StringIndexerModel, VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import PipelineModel # Pipeline'ı direkt PipelineModel olarak yükleyeceğimiz için
import os
import sys

# Ortam değişkenlerini ayarlayın (Python worker hataları için önemli)
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# 1) Spark başlat
spark = SparkSession.builder \
    .appName("LR Batch Predict New Data") \
    .master("local[*]") \
    .config("spark.driver.memory","8g") \
    .config("spark.pyspark.python", sys.executable)\
    .config("spark.pyspark.driver.python", sys.executable)\
    .getOrCreate()

# --- Mevcut Veri Okuma ve Ön İşleme Kısmı (Modelleme için kullanılmaz, sadece pipeline'ın nasıl oluşturulduğunu gösterir) ---
# Bu kısım, PipelineModel'in nasıl eğitildiğini ve StringIndexerModel'lerin nasıl kaydedildiğini gösterir.
# Yeni veri tahmininde bu adımlara doğrudan ihtiyacımız olmayacak,
# çünkü PipelineModel bu dönüşümleri zaten içinde barındırıyor.
# Ancak, clean_column fonksiyonunu kullanmaya devam edeceğiz.

# 2.3) City/Street cardinality düşürme fonksiyonu
# Bu fonksiyon, PipelineModel'in dışındaki ham veri ön işleme adımlarının bir parçasıdır
# ve yeni veriler için de uygulanması gerekir.
def clean_column(df, column_name, top_n=32):
    """
    Belirli bir sütundaki en sık geçen 'top_n' değeri tutar, diğerlerini 'Other' yapar.
    """
    # collect() sırasında oluşabilecek hatalara karşı StringType'a dönüştürme ve null doldurma
    df = df.withColumn(column_name, col(column_name).cast(StringType())).na.fill({column_name: ""})

    top_vals = [
        r[column_name] for r in
        df.groupBy(column_name).count()
          .orderBy(col("count").desc())
          .limit(top_n)
          .collect()
    ]
    return df.withColumn(
        f"{column_name}_Cleaned",
        when(col(column_name).isin(top_vals), col(column_name)).otherwise("Other")
    )

# --- Yeni Veri İçin Hazırlık Fonksiyonu ---
def prepare_new_data(new_df, top_n=32):
    """
    Yeni gelen veriyi, modelin beklediği formata (raw features) hazırlar.
    Bu kısım, modelin kendisi tarafından yapılacak StringIndexer ve VectorAssembler
    ön işleme adımlarından önceki veriyi hazırlar.
    """
    # 2.2) Süre & tarih/saat özellikleri
    processed_df = (
        new_df
        .withColumn("Start_TS", to_timestamp(col("Start_Time"), "yyyy-MM-dd HH:mm:ss"))
        .withColumn("End_TS",   to_timestamp(col("End_Time"),   "yyyy-MM-dd HH:mm:ss"))
        .withColumn("Duration",
            ((unix_timestamp(col("End_TS")) - unix_timestamp(col("Start_TS"))) / 60)
            .cast(DoubleType())
        )
        .withColumn("HourOfDay", hour(col("Start_TS")))
        .withColumn("DayOfWeek", dayofweek(col("Start_TS")))
        .withColumn("Month",    month(col("Start_TS")))
        .withColumn("Year",     year(col("Start_TS")))
        .drop("Start_TS","End_TS","Start_Time","End_Time")
    )

    # 2.3) City/Street cardinality düşürme
    for c in ["City","Street"]:
        processed_df = clean_column(processed_df, c, top_n=top_n)

    # 2.4) İlgili sütunları seç ve dropna
    # Tahmin yapacağımız için "Severity" sütununu dahil etmiyoruz
    # Ancak eğer test verinizde Severity varsa, yine de dışarıda bırakmanız gerekir.
    selected_cols_for_prediction = [
        "Temperature(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)",
        "Wind_Speed(mph)", "Precipitation(in)", "Wind_Chill(F)", "Traffic_Signal",
        "Weather_Condition","Wind_Direction","Civil_Twilight","Sunrise_Sunset",
        "State","City_Cleaned","Street_Cleaned","Junction","Duration",
        "HourOfDay","DayOfWeek","Month","Year"
    ]
    # Sadece tahmin yapacağımız için Severity'yi düşürelim (eğer varsa)
    if "Severity" in processed_df.columns:
        processed_df = processed_df.drop("Severity")

    # Tüm gerekli sütunların var olduğundan emin olun, yoksa hata verebilir.
    # Burada yalnızca modelin beklediği sütunları seçiyoruz.
    # Eğer yeni verinizde bir sütun eksikse, burada hata alırsınız.
    # Modelin beklediği tüm sütunların yeni veride de olduğundan emin olun.
    final_df_for_pipeline = processed_df.select(*selected_cols_for_prediction).dropna()
    
    return final_df_for_pipeline


### Mevcut PipelineModel'i Yükle


# 7) Eğitilmiş PipelineModel'i yükle
# Pipeline'ın tamamının kaydedildiği ana yolu belirtin.
# Muhtemelen "models/us_accidents_lr_optimized_small_spark" veya "models/us_accidents_lr_final_full_spark"
# Kayıtlı PipelineModel'iniz Logistic Regression'ı zaten içeriyor olmalı.
# Bu örnekte, 'lr_optimized_small_spark' modelini kullanıyorum varsayımıyla devam ediyorum.
# Eğer final full spark modelini kullanıyorsanız yolu ona göre değiştirin.
saved_pipeline_model_path = "models/us_accidents_dt_final_last_full_spark" # Veya models/us_accidents_lr_final_full_spark

if not os.path.isdir(os.path.abspath(saved_pipeline_model_path)):
    print(f"HATA: Kaydedilmiş PipelineModel klasörü bulunamadı: {os.path.abspath(saved_pipeline_model_path)}")
    spark.stop()
    sys.exit(1)

print(f"[DEBUG] Kaydedilmiş PipelineModel yükleniyor: {os.path.abspath(saved_pipeline_model_path)}")
loaded_pipeline_model = PipelineModel.load(os.path.abspath(saved_pipeline_model_path))



# --- Yeni Gelecek Veriler (Örnek) ---
# Bu veri formatı, orijinal CSV dosyanızın başlık satırı ile aynı olmalıdır.
veri_yeni = [
    {
        "ID": "A-12345", "Source": "Source_A", "Severity": None, # Severity'yi None veya boş bırakın
        "Start_Time": "2024-07-22 15:30:00", "End_Time": "2024-07-22 16:00:00",
        "Start_Lat": 34.0522, "Start_Lng": -118.2437, "End_Lat": 34.0530, "End_Lng": -118.2440,
        "Distance(mi)": 0.05, "Description": "Traffic accident on Main St.",
        "Street": "Main St", "City": "Los Angeles", "County": "Los Angeles", "State": "CA",
        "Zipcode": "90012", "Country": "US", "Timezone": "US/Pacific", "Airport_Code": "KLAX",
        "Weather_Timestamp": "2024-07-22 15:00:00", "Temperature(F)": 90.0, "Wind_Chill(F)": 88.0,
        "Humidity(%)": 40.0, "Pressure(in)": 29.9, "Visibility(mi)": 10.0, "Wind_Direction": "SW",
        "Wind_Speed(mph)": 15.0, "Precipitation(in)": 0.0, "Weather_Condition": "Clear",
        "Amenity": False, "Bump": False, "Crossing": True, "Give_Way": False, "Junction": False,
        "No_Exit": False, "Railway": False, "Roundabout": False, "Station": False, "Stop": False,
        "Traffic_Calming": False, "Traffic_Signal": True, "Turning_Loop": False, "Sunrise_Sunset": "Day",
        "Civil_Twilight": "Day", "Nautical_Twilight": "Day", "Astronomical_Twilight": "Day"
    },
    {
        "ID": "A-12346", "Source": "Source_B", "Severity": None,
        "Start_Time": "2024-07-22 08:00:00", "End_Time": "2024-07-22 08:15:00",
        "Start_Lat": 38.9072, "Start_Lng": -77.0370, "End_Lat": 38.9075, "End_Lng": -77.0372,
        "Distance(mi)": 0.02, "Description": "Minor collision on K Street.",
        "Street": "K Street NW", "City": "Washington", "County": "District of Columbia", "State": "DC",
        "Zipcode": "20005", "Country": "US", "Timezone": "US/Eastern", "Airport_Code": "KDCA",
        "Weather_Timestamp": "2024-07-22 07:45:00", "Temperature(F)": 75.0, "Wind_Chill(F)": 72.0,
        "Humidity(%)": 70.0, "Pressure(in)": 30.1, "Visibility(mi)": 8.0, "Wind_Direction": "E",
        "Wind_Speed(mph)": 5.0, "Precipitation(in)": 0.0, "Weather_Condition": "Partly Cloudy",
        "Amenity": False, "Bump": False, "Crossing": False, "Give_Way": False, "Junction": True,
        "No_Exit": False, "Railway": False, "Roundabout": False, "Station": True, "Stop": False,
        "Traffic_Calming": False, "Traffic_Signal": False, "Turning_Loop": False, "Sunrise_Sunset": "Day",
        "Civil_Twilight": "Day", "Nautical_Twilight": "Day", "Astronomical_Twilight": "Day"
    },
    # İstediğiniz kadar yeni veri ekleyebilirsiniz
]

# Yeni veriyi Spark DataFrame'ine dönüştür
df_new_data_raw = spark.createDataFrame(veri_yeni)

# Yeni veriyi modelin beklediği ham özellik formatına hazırla
df_new_data_prepared = prepare_new_data(df_new_data_raw, top_n=32)

# --- Tahmin Yap ---
print("\n--- Yeni Veri Üzerinde Tahmin Yapılıyor ---")
if loaded_pipeline_model:
    predictions_new_data = loaded_pipeline_model.transform(df_new_data_prepared) \
                                                .select("City_Cleaned", "Street_Cleaned", "prediction", "probability") # İstenen çıktılar
    
    print("\nYeni Veri Tahmin Sonuçları:")
    predictions_new_data.show(truncate=False)
else:
    print("HATA: Pipeline modeli yüklenemediği için tahmin yapılamadı.")

# 8) Spark'ı durdur
spark.stop()

[DEBUG] Kaydedilmiş PipelineModel yükleniyor: c:\Users\aslay\Desktop\BİL 401\project\models\us_accidents_dt_final_last_full_spark


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 29.0 failed 1 times, most recent failure: Lost task 0.0 in stage 29.0 (TID 44) (MSI executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 32 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 32 more


## Duration

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, radians, sin, cos, sqrt, atan2

# 1) Spark’ı başlat
spark = SparkSession.builder \
    .appName("Compute Distance") \
    .master("local[*]") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

# 2) Ham CSV’yi oku ve orijinal mesafe sütununu at
df = spark.read.csv("data/traffic_data/US_Accidents_March23.csv", header=True, inferSchema=True)

# 3) Koordinatlardan herhangi biri boşsa o satırı at
df_clean = df.dropna(subset=["Start_Lat", "Start_Lng", "End_Lat", "End_Lng"])

# 4) Haversine ile yeni distance_mi hesapla
from pyspark.sql.functions import col, radians, sin, cos, sqrt, atan2

df_dist = df_clean.withColumn("lat1", radians(col("Start_Lat"))) \
    .withColumn("lon1", radians(col("Start_Lng"))) \
    .withColumn("lat2", radians(col("End_Lat"))) \
    .withColumn("lon2", radians(col("End_Lng"))) \
    .withColumn("dlat", col("lat2") - col("lat1")) \
    .withColumn("dlon", col("lon2") - col("lon1")) \
    .withColumn("a", sin(col("dlat")/2)**2 + cos(col("lat1"))*cos(col("lat2"))*sin(col("dlon")/2)**2) \
    .withColumn("c", 2*atan2(sqrt(col("a")), sqrt(1-col("a")))) \
    .withColumn("distance_mi", col("c") * 3958.8) \
    .drop("lat1","lon1","lat2","lon2","dlat","dlon","a","c")

# 5) Kontrol
df_dist.select("Start_Lat","Start_Lng","End_Lat","End_Lng","distance_mi").show(5)


+---------+---------+--------+------------------+-------------------+
|Start_Lat|Start_Lng| End_Lat|           End_Lng|        distance_mi|
+---------+---------+--------+------------------+-------------------+
| 40.10891|-83.09286|40.11206|         -83.03187| 3.2302598454049494|
| 39.86542| -84.0628|39.86501|         -84.04873|  0.746718600847293|
| 39.10266|-84.52468|39.10209|         -84.52396|0.05514922259749175|
| 39.10148|-84.52341|39.09841|         -84.52241|0.21879110549002928|
| 41.06213|-81.53784|41.06217|-81.53546999999998|0.12350028305173481|
+---------+---------+--------+------------------+-------------------+
only showing top 5 rows



In [2]:
# ... önceki adımlar: df_dist hazırlandı

# 6) Artık eğitimde kullanmayacağımız koordinat sütunlarını at
cols_to_drop = ["Start_Lat", "Start_Lng", "End_Lat", "End_Lng","Distance(mi)"]
df_final = df_dist.drop(*cols_to_drop)
# Kontrol
df_final.printSchema()
df_final.select("distance_mi").show(5)


root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Direction: string (nullable = true)
 |-- Wind_Speed(mph): double (nullable = true)
 |-- Precipitation(in): double (nullable = true)
 |-- Weather_Condition

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# 1) %10 örnek al
#sample_df = df_final.sample(withReplacement=False, seed=42).cache()

# 2) Kullancağımız sayısal feature’lar
numeric_feats = [
    "Temperature(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Speed(mph)"
]

# 3) Train/test split
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)

# 4) Pipeline adımları
assembler = VectorAssembler(
    inputCols=numeric_feats,
    outputCol="features",
    handleInvalid="skip"   # eksik değer içeren satırları at
)
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="distance_mi",
    numTrees=100,
    maxDepth=7
)
pipeline = Pipeline(stages=[assembler, rf])

# 5) Modeli eğit (null’ları assembler zaten atıyor)
model = pipeline.fit(train_df)

# 6) Test set’inde tahmin & değerlendirme
pred = model.transform(test_df)
evaluator = RegressionEvaluator(
    labelCol="distance_mi",
    predictionCol="prediction",
    metricName="rmse"
)
print(f"Test RMSE = {evaluator.evaluate(pred):.3f}")
print(f"Test MAE  = {evaluator.setMetricName('mae').evaluate(pred):.3f}")
print(f"Test R2   = {evaluator.setMetricName('r2').evaluate(pred):.3f}")


Test RMSE = 1.810
Test MAE  = 0.880
Test R2   = 0.023


In [4]:
from pyspark.sql.functions import hour, dayofweek, month, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# 1) Zaman özellikleri ekleyelim
df2 = df_dist \
    .withColumn("hour", hour("Start_Time")) \
    .withColumn("day_of_week", dayofweek("Start_Time")) \
    .withColumn("month", month("Start_Time")) \
    .withColumn("is_weekend", when(dayofweek("Start_Time").isin([1,7]), 1).otherwise(0))

# 2) Kategorikleri sayısallaştır: Weather_Condition, City, State
idx_weather = StringIndexer(inputCol="Weather_Condition", outputCol="weather_idx", handleInvalid="keep")
enc_weather = OneHotEncoder(inputCol="weather_idx", outputCol="weather_vec")

idx_city    = StringIndexer(inputCol="City",              outputCol="city_idx",    handleInvalid="keep")
enc_city    = OneHotEncoder(inputCol="city_idx",          outputCol="city_vec")

idx_state   = StringIndexer(inputCol="State",             outputCol="state_idx",   handleInvalid="keep")
enc_state   = OneHotEncoder(inputCol="state_idx",         outputCol="state_vec")

# 3) Assemble edilecek feature’lar
numeric_feats = ["Temperature(F)", "Humidity(%)", "Pressure(in)",
                 "Visibility(mi)", "Wind_Speed(mph)", "distance_mi",
                 "hour", "day_of_week", "month", "is_weekend"]

assembler = VectorAssembler(
    inputCols=numeric_feats + ["weather_vec", "city_vec", "state_vec"],
    outputCol="features",
    handleInvalid="keep"
)

# 4) Regresyon modelinizi yeniden kurun
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol="distance_mi", featuresCol="features",
                           numTrees=50, maxDepth=10, seed=42)

pipeline = Pipeline(stages=[
    idx_weather, enc_weather,
    idx_city,    enc_city,
    idx_state,   enc_state,
    assembler, rf
])



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sin, cos, atan2, sqrt, radians
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# 1) SparkSession (gerekirse timeout ve worker reuse ayarları da eklenebilir)
spark = (SparkSession.builder
         .appName("DistancePrediction_NoUDF")
         .config("spark.python.worker.reuse", "true")
         .config("spark.executor.memory", "8g")
         .getOrCreate())

# 2) Veri yükle
df = spark.read.csv("data/traffic_data/US_Accidents_March23.CSV", header=True, inferSchema=True)

# 3) Haversine’i direkt Spark SQL ile
R = 3958.8
df2 = (df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
          .withColumn("φ1", radians(col("Start_Lat")))
          .withColumn("φ2", radians(col("End_Lat")))
          .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
          .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
          .withColumn("a",
               sin(col("Δφ")/2) * sin(col("Δφ")/2) +
               cos(col("φ1")) * cos(col("φ2")) *
               sin(col("Δλ")/2) * sin(col("Δλ")/2)
          )
          .withColumn("distance_mi", 2 * R * atan2(sqrt(col("a")), sqrt(1-col("a"))))
          # Gerekmeyen sütunları at
          .drop("φ1","φ2","Δφ","Δλ","a","Distance(mi)",
                "Start_Lat","Start_Lng","End_Lat","End_Lng")
)

# 4) Özellikleri seç ve boşları at
num_feats = ["Temperature(F)","Wind_Chill(F)","Humidity(%)",
             "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
             "Precipitation(in)"]
df_final = df2.select(*(num_feats + ["distance_mi"])).na.drop()

# 5) %10 ile örnek al, train/test split
df_sample = df_final.sample(False, 0.1, seed=42)
train, test   = df_sample.randomSplit([0.8,0.2], seed=42)

# 6) Pipeline tanımı
assembler = VectorAssembler(inputCols=num_feats, outputCol="features", handleInvalid="skip")
rf        = RandomForestRegressor(labelCol="distance_mi", featuresCol="features",
                                  numTrees=10, maxDepth=5)
pipeline  = Pipeline(stages=[assembler, rf])

# 7) Eğit ve değerlendir
model = pipeline.fit(train)
pred  = model.transform(test)

evaluator = RegressionEvaluator(labelCol="distance_mi", predictionCol="prediction")
print("RMSE =", evaluator.evaluate(pred, {evaluator.metricName:"rmse"}))
print("MAE  =", evaluator.evaluate(pred, {evaluator.metricName:"mae"}))
print("R2   =", evaluator.evaluate(pred, {evaluator.metricName:"r2"}))

spark.stop()


RMSE = 1.6668868671419903
MAE  = 0.8987651630937321
R2   = 0.01991324161864938


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt
)
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler
)
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# 1) SparkSession oluştur
spark = (SparkSession.builder
         .appName("DistancePrediction_NoTimeFeatures")
         .config("spark.python.worker.reuse", "true")
         .config("spark.executor.memory", "8g")
         .getOrCreate())

# 2) CSV’i oku (header ve tipleri kendine göre ayarla)
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

# 3) Koordinat boşlarını at
df = df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])

# 4) Haversine ile distance hesapla (mile cinsinden)
#    R = 3958.8 mile
df = (df
      .withColumn("lat1", radians(col("Start_Lat")))
      .withColumn("lon1", radians(col("Start_Lng")))
      .withColumn("lat2", radians(col("End_Lat")))
      .withColumn("lon2", radians(col("End_Lng")))
      .withColumn("dlat", col("lat2")-col("lat1"))
      .withColumn("dlon", col("lon2")-col("lon1"))
      .withColumn("a",
          sin(col("dlat")/2)**2 +
          cos(col("lat1")) * cos(col("lat2")) * sin(col("dlon")/2)**2
      )
      .withColumn("c", 2 * atan2(sqrt(col("a")), sqrt(1-col("a"))))
      .withColumn("distance_mi", col("c")*3958.8)
)

# 5) Artık eğitimde kullanmayacağımız sütunları at
df_final = (df
    .drop(
      "Distance(mi)",
      "Start_Lat","Start_Lng","End_Lat","End_Lng",
      "lat1","lat2","lon1","lon2","dlat","dlon","a","c",
      "Start_Time","End_Time","Weather_Timestamp"  # zaman sütunları
    )
    .na.drop(subset=["distance_mi"])  # label boş kalmasın
)

# 6) %10 örnek al
df_sample = df_final.sample(False, 0.1, seed=42)

# 7) Pipeline aşamaları
#   - Kategorik: Severity, Weather_Condition
cats = ["Severity","Weather_Condition"]
indexers = [
    StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep")
    for c in cats
]
encoders = [
    OneHotEncoder(inputCol=c+"_idx", outputCol=c+"_vec")
    for c in cats
]

#   - Numerik feature’lar
num_feats = [
    "Temperature(F)","Humidity(%)","Pressure(in)",
    "Wind_Speed(mph)","Visibility(mi)"
]

assembler = VectorAssembler(
    inputCols=num_feats + [c+"_vec" for c in cats],
    outputCol="features",
    handleInvalid="skip"
)

rf = RandomForestRegressor(
    labelCol="distance_mi",
    featuresCol="features",
    numTrees=50,
    maxDepth=8
)

pipeline = Pipeline(stages=indexers + encoders + [assembler, rf])

# 8) Train/Test split + eğitim
train, test = df_sample.randomSplit([0.8,0.2], seed=42)
model = pipeline.fit(train.na.drop())

# 9) Tahmin & değerlendirme
pred = model.transform(test.na.drop())
evaluator = RegressionEvaluator(
    labelCol="distance_mi",
    predictionCol="prediction"
)
print("Test RMSE =", evaluator.evaluate(pred, {evaluator.metricName:"rmse"}))
print("Test MAE  =", evaluator.evaluate(pred, {evaluator.metricName:"mae"}))
print("Test R2   =", evaluator.evaluate(pred, {evaluator.metricName:"r2"}))


Test RMSE = 1.825742950402607
Test MAE  = 0.9041405026802174
Test R2   = 0.02143761243157094


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt
)
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler
)
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

import findspark
findspark.init()

# 1) SparkSession
spark = (SparkSession.builder
         .appName("DistancePrediction_FeatsOnly")
         .config("spark.python.worker.reuse", "true")
         .config("spark.executor.memory", "16g")
         .getOrCreate())

# 2) Veriyi oku (path’i kendi dosyana göre değiştir)
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

# 3) Boş koordinatları at ve Haversine formülü ile distance_mi hesapla
#    Earth radius in miles ~3958.8
df_dist = (df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
    .withColumn("lat1", radians(col("Start_Lat")))
    .withColumn("lon1", radians(col("Start_Lng")))
    .withColumn("lat2", radians(col("End_Lat")))
    .withColumn("lon2", radians(col("End_Lng")))
    .withColumn("dlat", col("lat2") - col("lat1"))
    .withColumn("dlon", col("lon2") - col("lon1"))
    .withColumn("a",
        sin(col("dlat")/2)**2 +
        cos(col("lat1")) * cos(col("lat2")) * sin(col("dlon")/2)**2
    )
    .withColumn("c", 2 * atan2(sqrt(col("a")), sqrt(1-col("a"))))
    .withColumn("distance_mi", col("c") * 3958.8)
    .drop("lat1","lat2","lon1","lon2","dlat","dlon","a","c")
)

# 4) Başlangıç/bitiş kümeleme
assembler_k1 = VectorAssembler(
    inputCols=["Start_Lat","Start_Lng"], outputCol="start_coord")
k1 = KMeans(k=50, featuresCol="start_coord",
            predictionCol="start_cluster", seed=42)
assembler_k2 = VectorAssembler(
    inputCols=["End_Lat","End_Lng"], outputCol="end_coord")
k2 = KMeans(k=50, featuresCol="end_coord",
            predictionCol="end_cluster", seed=42)

km_pipe = Pipeline(stages=[assembler_k1, k1, assembler_k2, k2])
df_feat = km_pipe.fit(df_dist).transform(df_dist)

# 5) Koordinat sütunlarını at
df_feat = df_feat.drop(
    "Start_Lat","Start_Lng","End_Lat","End_Lng",
    "start_coord","end_coord"
)

# 6) Kategorik → index + one‑hot
cats = ["Severity","Weather_Condition","start_cluster","end_cluster"]
indexers = [
    StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep")
    for c in cats
]
encoders = [
    OneHotEncoder(inputCol=c+"_idx", outputCol=c+"_vec")
    for c in cats
]

# 7) Sayısal feature’lar + ohe kolonları
num_feats = [
    "Temperature(F)","Humidity(%)","Pressure(in)",
    "Wind_Speed(mph)","Visibility(mi)",
    "distance_mi"  # hedefi de features’a koyuyoruz, ama RF label olarak ayrıştıracak
]
assembler = VectorAssembler(
    inputCols=num_feats + [c+"_vec" for c in cats],
    outputCol="features",
    handleInvalid="skip"
)

# 8) Model
rf = RandomForestRegressor(
    labelCol="distance_mi",
    featuresCol="features",
    numTrees=50,
    maxDepth=8,
    seed=42
)

pipe = Pipeline(stages=indexers + encoders + [assembler, rf])
# 9a) %10 örnek al
df_small = df_feat \
    .na.drop(subset=["distance_mi"]) \
    .sample(withReplacement=False, fraction=0.1, seed=42) \
    .cache()

# 9b) Örnekten split
train, test = df_small.randomSplit([0.8,0.2], seed=42)

# 9c) Modeli küçük örnekle eğit
model = pipe.fit(train)
pred   = model.transform(test)


# 10) Değerlendir
evaluator = RegressionEvaluator(
    labelCol="distance_mi",
    predictionCol="prediction"
)
print("RMSE =", evaluator.evaluate(pred, {evaluator.metricName:"rmse"}))
print("MAE  =", evaluator.evaluate(pred, {evaluator.metricName:"mae"}))
print("R2   =", evaluator.evaluate(pred, {evaluator.metricName:"r2"}))


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\aslay\bert_env\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\aslay\AppData\Local\Temp\ipykernel_30772\1778831836.py", line 109, in <module>
    model = pipe.fit(train)
            ^^^^^^^^^^^^^^^
  File "c:\Users\aslay\bert_env\Lib\site-packages\pyspark\ml\base.py", line 205, in fit
    return self._fit(dataset)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\aslay\bert_env\Lib\site-packages\pyspark\ml\pipeline.py", line 134, in _fit
    model = stage.fit(dataset)
            ^^^^^^^^^^^^^^^^^^
  File "c:\Users\aslay\bert_env\Lib\site-packages\pyspark\ml\base.py", line 205, in fit
    return self._fit(dataset)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\aslay\bert_env\Lib\site-packages\pyspark\ml\wrapper.py", line 381, in _fit
    java_model = self._fit_java(dataset)
                 ^^^^^^

ConnectionRefusedError: [WinError 10061] Hedef makine etkin olarak reddettiğinden bağlantı kurulamadı

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, radians, sin, cos, atan2, sqrt
from pyspark.ml.feature import VectorAssembler, Bucketizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# --- 0) SparkSession başlat ---
spark = (SparkSession.builder
         .appName("DistanceClassification_SampleTest")
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory",   "4g")
         .config("spark.python.worker.reuse", "true")
         .getOrCreate())

# --- 1) Veriyi yükle ---
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

# --- 2) Haversine ile distance_mi hesapla ---
R = 3958.8
df2 = (df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
         .withColumn("φ1", radians(col("Start_Lat")))
         .withColumn("φ2", radians(col("End_Lat")))
         .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
         .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
         .withColumn("a",
             sin(col("Δφ")/2) * sin(col("Δφ")/2) +
             cos(col("φ1")) * cos(col("φ2")) *
             sin(col("Δλ")/2) * sin(col("Δλ")/2)
         )
         .withColumn("distance_mi", 2 * R * atan2(sqrt(col("a")), sqrt(1-col("a"))))
         .drop("φ1","φ2","Δφ","Δλ","a",
               "Start_Lat","Start_Lng","End_Lat","End_Lng","Distance(mi)"))

# --- 3) IQR ile outlier temizleme ---
q1, q3 = df2.approxQuantile("distance_mi", [0.25, 0.75], 1e-4)
iqr    = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
df_clean = df2.filter((col("distance_mi") >= lower) & (col("distance_mi") <= upper))

# --- 4) Tüm verinin %10’u ile örnek al ---
df_sample = df_clean.sample(withReplacement=False, fraction=0.1, seed=42)

# --- 5) Sample üzerinde train/test split ---
train_sample, test_sample = df_sample.randomSplit([0.8, 0.2], seed=42)

# --- 6) Quintile bazlı bucket eşikleri (sadece train_sample’da) ---
splits = train_sample.approxQuantile("distance_mi",
                                     [0.0,0.2,0.4,0.6,0.8,1.0],
                                     1e-4)
splits[0], splits[-1] = float("-inf"), float("inf")
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# --- 7) Özellikleri birleştir ---
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
assembler = VectorAssembler(inputCols=num_feats,
                            outputCol="features",
                            handleInvalid="skip")

# --- 8) Hafif Random Forest Classifier ---
rf = (RandomForestClassifier(labelCol="distance_class",
                             featuresCol="features")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setSubsamplingRate(0.7)
      .setSeed(42))

# --- 9) Pipeline oluştur ---
pipe = Pipeline(stages=[bucketizer, assembler, rf])

# --- 10) Model eğit & tahmin ---
model = pipe.fit(train_sample)
pred  = model.transform(test_sample)

# --- 11) Değerlendirme ---
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="f1")

print("Sample üzerinde Accuracy =", evaluator_acc.evaluate(pred))
print("Sample üzerinde F1 Score =", evaluator_f1.evaluate(pred))

# --- 12) SparkSession’ı kapat ---
spark.stop()


Sample üzerinde Accuracy = 0.2522689101526988
Sample üzerinde F1 Score = 0.19301675454757888


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, count, isnan,  # isnan eklendi
    radians, sin, cos, atan2, sqrt
)
from pyspark.ml.feature import Bucketizer
from pyspark.sql.types import DoubleType, FloatType

# --- A) SparkSession başlat ---
spark = (SparkSession.builder
         .appName("DistanceClassification_EDA_FixMissing")
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory",   "4g")
         .config("spark.python.worker.reuse", "true")
         .getOrCreate())

# --- B) Veri hazırlığı ---
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

R = 3958.8
df2 = (df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
         .withColumn("φ1", radians(col("Start_Lat")))
         .withColumn("φ2", radians(col("End_Lat")))
         .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
         .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
         .withColumn("a",
             sin(col("Δφ")/2) * sin(col("Δφ")/2) +
             cos(col("φ1")) * cos(col("φ2")) *
             sin(col("Δλ")/2) * sin(col("Δλ")/2)
         )
         .withColumn("distance_mi", 2 * R * atan2(sqrt(col("a")), sqrt(1-col("a"))))
         .drop("φ1","φ2","Δφ","Δλ","a",
               "Start_Lat","Start_Lng","End_Lat","End_Lng","Distance(mi)"))

# IQR ile outlier’ları çıkar
q1, q3 = df2.approxQuantile("distance_mi", [0.25, 0.75], 1e-4)
iqr    = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
df_clean = df2.filter((col("distance_mi") >= lower) & (col("distance_mi") <= upper))

# --- C) Genel istatistikler ---
print("🔹 Toplam temize düşen satır sayısı:", df_clean.count())
df_clean.select("distance_mi").describe().show()

num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
df_clean.select(num_feats).describe().show()

# --- D) distance_mi quantile ve binleme ---
quantiles = df_clean.approxQuantile("distance_mi",
                                    [0.0,0.2,0.4,0.6,0.8,1.0],
                                    1e-4)
print("🔹 distance_mi quantile değerleri:", quantiles)

splits = quantiles[:]
splits[0], splits[-1] = float("-inf"), float("inf")
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class_temp",
                        splits=splits)
df_binned = bucketizer.transform(df_clean)
df_binned.groupBy("distance_class_temp") \
         .count() \
         .orderBy("distance_class_temp") \
         .show(truncate=False)

# --- E) Eksik değer analizi (tip bazlı) ---
from pyspark.sql.types import DoubleType, FloatType

# Sayısal sütunlar
num_cols = [f.name for f in df_clean.schema.fields 
            if isinstance(f.dataType, (DoubleType, FloatType))]
# Diğerleri
other_cols = [c for c in df_clean.columns if c not in num_cols]

print("🔹 Sayısal sütunlardaki eksikler:")
df_clean.select([
    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
    for c in num_cols
]).show(truncate=False)

print("🔹 Diğer sütunlardaki eksikler:")
df_clean.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in other_cols
]).show(truncate=False)

# --- F) Spark’ı kapat ---
spark.stop()


🔹 Toplam temize düşen satır sayısı: 3892385
+-------+-------------------+
|summary|        distance_mi|
+-------+-------------------+
|  count|            3892385|
|   mean| 0.4257840179197078|
| stddev|0.49089888749527927|
|    min|                0.0|
|    max| 2.1313033514150135|
+-------+-------------------+

+-------+------------------+------------------+------------------+-----------------+-----------------+-----------------+--------------------+
|summary|    Temperature(F)|     Wind_Chill(F)|       Humidity(%)|     Pressure(in)|   Visibility(mi)|  Wind_Speed(mph)|   Precipitation(in)|
+-------+------------------+------------------+------------------+-----------------+-----------------+-----------------+--------------------+
|  count|           3799394|           3392355|           3793813|          3812577|          3794595|          3704724|             3313976|
|   mean|61.366199425487395|59.328386474882244| 63.57699206576603|29.44533834726443|9.119675472613036|7.5552168258689

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, radians, sin, cos, atan2, sqrt
from pyspark.sql.types import DoubleType, FloatType
from pyspark.ml.feature import (
    Bucketizer, Imputer, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark import StorageLevel

# --- 0) SparkSession ---
spark = (SparkSession.builder
         .appName("DistanceClassification_WithImputeScale")
         .config("spark.executor.memory", "12g")
         .config("spark.driver.memory",   "8g")
         .config("spark.python.worker.reuse", "true")
         .getOrCreate())

# --- 1) Veri yükle & temizle (aynı adımlar) ---
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

R = 3958.8
df2 = (df.dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
         .withColumn("φ1", radians(col("Start_Lat")))
         .withColumn("φ2", radians(col("End_Lat")))
         .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
         .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
         .withColumn("a",
             sin(col("Δφ")/2) * sin(col("Δφ")/2) +
             cos(col("φ1")) * cos(col("φ2")) *
             sin(col("Δλ")/2) * sin(col("Δλ")/2)
         )
         .withColumn("distance_mi", 2 * R * atan2(sqrt(col("a")), sqrt(1-col("a"))))
         .drop("φ1","φ2","Δφ","Δλ","a",
               "Start_Lat","Start_Lng","End_Lat","End_Lng","Distance(mi)"))
q1, q3 = df2.approxQuantile("distance_mi", [0.25,0.75], 1e-4)
iqr    = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
df_clean = df2.filter((col("distance_mi") >= lower) & (col("distance_mi") <= upper))

# cache’lenmiş eğitim verisi
train_full, test_full = df_clean.randomSplit([0.8,0.2], seed=42)
train_cached = train_full.repartition(200).persist(StorageLevel.MEMORY_AND_DISK)
train_cached.count()

# --- 2) Bucketizer (quantile’lar aynen) ---
splits = train_cached.approxQuantile("distance_mi",
                                     [0.0,0.2,0.4,0.6,0.8,1.0],
                                     1e-4)
splits[0], splits[-1] = float("-inf"), float("inf")
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# --- 3) Eksik değer doldurucu ---
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
imputer = (Imputer()
           .setInputCols(num_feats)
           .setOutputCols([f"imp_{c}" for c in num_feats])
           .setStrategy("median"))

# --- 4) Assembler & Scaler ---
assembler = VectorAssembler(
    inputCols=[f"imp_{c}" for c in num_feats],
    outputCol="unscaled_features",
    handleInvalid="skip"
)
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
    withMean=True, withStd=True
)

# --- 5) Random Forest ---
rf = (RandomForestClassifier(labelCol="distance_class",
                             featuresCol="features")
      .setNumTrees(25)
      .setMaxDepth(8)
      .setSubsamplingRate(0.8)
      .setSeed(42))

# --- 6) Pipeline & Eğit ---
pipe = Pipeline(stages=[
    bucketizer,
    imputer,
    assembler,
    scaler,
    rf
])
model = pipe.fit(train_cached)
pred  = model.transform(test_full)

# --- 7) Değerlendirme ---
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="f1")

print("Impute+Scale Accuracy =", evaluator_acc.evaluate(pred))
print("Impute+Scale F1 =", evaluator_f1.evaluate(pred))

spark.stop()


Impute+Scale Accuracy = 0.2701567450499026
Impute+Scale F1 = 0.24786260192762977


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer, Imputer, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark import StorageLevel

# --- 0) SparkSession ---
spark = (SparkSession.builder
         .appName("DistanceClassification_3Class_TimeFeatures")
         .config("spark.executor.memory","12g")
         .config("spark.driver.memory","8g")
         .config("spark.python.worker.reuse","true")
         .getOrCreate())

# --- 1) Veri yükle ve distance + time özelliklerini oluştur ---
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

R = 3958.8
df2 = (df
       .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
       .withColumn("φ1", radians(col("Start_Lat")))
       .withColumn("φ2", radians(col("End_Lat")))
       .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
       .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
       .withColumn("a",
           sin(col("Δφ")/2) * sin(col("Δφ")/2) +
           cos(col("φ1")) * cos(col("φ2")) *
           sin(col("Δλ")/2) * sin(col("Δλ")/2)
       )
       .withColumn("distance_mi", 2 * R * atan2(sqrt(col("a")), sqrt(1-col("a"))))
       # Zaman özellikleri
       .withColumn("hour",    hour(col("Start_Time")))
       .withColumn("weekday", dayofweek(col("Start_Time")))
       .drop("φ1","φ2","Δφ","Δλ","a",
             "Start_Lat","Start_Lng","End_Lat","End_Lng","Distance(mi)"))

# --- 2) Outlier temizleme (IQR) ---
q1, q3 = df2.approxQuantile("distance_mi", [0.25, 0.75], 1e-4)
iqr    = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
df_clean = df2.filter((col("distance_mi") >= lower) & (col("distance_mi") <= upper))

# --- 3) Train/Test split & cache ---
train_full, test_full = df_clean.randomSplit([0.8, 0.2], seed=42)
train_cached = train_full.repartition(200).persist(StorageLevel.MEMORY_AND_DISK)
train_cached.count()

# --- 4) 3-class bucket eşikleri (%33, %66 quantile) ---
qs = train_cached.approxQuantile("distance_mi", [0.33, 0.66], 1e-4)
splits = [float("-inf"), qs[0], qs[1], float("inf")]
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# --- 5) Eksik değer doldurucu (median ile) ---
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
imputer = (Imputer()
           .setInputCols(num_feats)
           .setOutputCols([f"imp_{c}" for c in num_feats])
           .setStrategy("median"))

# --- 6) Assembler & Scaler (time özellikleri de ekle) ---
assembler = VectorAssembler(
    inputCols=[f"imp_{c}" for c in num_feats] + ["hour", "weekday"],
    outputCol="unscaled_features",
    handleInvalid="skip"
)
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
    withMean=True,
    withStd=True
)

# --- 7) Random Forest Model ---
rf = (RandomForestClassifier(labelCol="distance_class",
                             featuresCol="features")
      .setNumTrees(50)
      .setMaxDepth(8)
      .setSeed(42))

# --- 8) Pipeline & Eğitim ---
pipe = Pipeline(stages=[
    bucketizer,
    imputer,
    assembler,
    scaler,
    rf
])
model = pipe.fit(train_cached)
pred  = model.transform(test_full)

# --- 9) Değerlendirme ---
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class",
    predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class",
    predictionCol="prediction",
    metricName="f1")

print("3-class + Time Accuracy =", evaluator_acc.evaluate(pred))
print("3-class + Time F1 =", evaluator_f1.evaluate(pred))

spark.stop()


3-class + Time Accuracy = 0.40093571460704486
3-class + Time F1 = 0.3886517222673759


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer,
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark import StorageLevel

# --- 0) SparkSession ---
spark = (SparkSession.builder
         .appName("DistanceClassification_DropNa")
         .config("spark.executor.memory","12g")
         .config("spark.driver.memory","8g")
         .config("spark.python.worker.reuse","true")
         .getOrCreate())

# --- 1) Veri yükle & distance + time + weather features ---
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv"))

R = 3958.8
df2 = (df
       .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng"])
       .withColumn("φ1", radians(col("Start_Lat")))
       .withColumn("φ2", radians(col("End_Lat")))
       .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
       .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
       .withColumn("a",
           sin(col("Δφ")/2)*sin(col("Δφ")/2) +
           cos(col("φ1"))*cos(col("φ2")) *
           sin(col("Δλ")/2)*sin(col("Δλ")/2)
       )
       .withColumn("distance_mi", 2*R*atan2(sqrt(col("a")), sqrt(1-col("a"))))
       .withColumn("hour",    hour(col("Start_Time")))
       .withColumn("weekday", dayofweek(col("Start_Time")))
       .drop("φ1","φ2","Δφ","Δλ","a",
             "Start_Lat","Start_Lng","End_Lat","End_Lng","Distance(mi)"))

# --- 2) Outlier temizleme (IQR) ---
q1, q3 = df2.approxQuantile("distance_mi",[0.25,0.75],1e-4)
iqr    = q3 - q1
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
df_clean = df2.filter((col("distance_mi")>=lower)&(col("distance_mi")<=upper))

# --- 3) NaN’leri at (sadece sayısal feature’larda) ---
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
df_no_na = df_clean.na.drop(subset=num_feats)

# --- 4) Split & cache ---
train_full, test_full = df_no_na.randomSplit([0.8,0.2], seed=42)
train_cached = train_full.repartition(200).persist(StorageLevel.MEMORY_AND_DISK)
train_cached.count()

# --- 5) 3-class bucket (%33, %66 quantile) ---
qs = train_cached.approxQuantile("distance_mi",[0.33,0.66],1e-4)
splits = [float("-inf"), qs[0], qs[1], float("inf")]
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# --- 6) Weather_Condition encode ---
indexer = StringIndexer(inputCol="Weather_Condition",
                        outputCol="wc_index",
                        handleInvalid="keep")
encoder = OneHotEncoder(inputCols=["wc_index"],
                        outputCols=["wc_vec"],
                        dropLast=False)

# --- 7) Assembler & Scaler (time + numeric + weather) ---
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec"],
    outputCol="unscaled_features",
    handleInvalid="skip"
)
scaler = StandardScaler(inputCol="unscaled_features",
                        outputCol="features",
                        withMean=True, withStd=True)

# --- 8) RandomForest ---
rf = (RandomForestClassifier(labelCol="distance_class",
                             featuresCol="features")
      .setNumTrees(80)
      .setMaxDepth(10)
      .setSeed(42))

# --- 9) Pipeline & Eğitim ---
pipe = Pipeline(stages=[
    bucketizer,
    indexer, encoder,
    assembler,
    scaler,
    rf
])
model = pipe.fit(train_cached)
pred  = model.transform(test_full)

# --- 10) Değerlendirme ---
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="f1")

print("DropNa + Weather – Accuracy =", evaluator_acc.evaluate(pred))
print("DropNa + Weather – F1 =", evaluator_f1.evaluate(pred))

spark.stop()


DropNa + Weather – Accuracy = 0.39598413709239194
DropNa + Weather – F1 = 0.3932116074730015


In [None]:
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession başlat
spark = (SparkSession.builder
         .appName("DistancePrediction")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory","4g")
         .getOrCreate())

# 2) Veri yükle & temizleme
df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv")
      .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng","Start_Time"])
)

# 3) Haversine ile distance_mi hesaplama
R = 3958.8
df2 = (df
       .withColumn("φ1", radians(col("Start_Lat")))
       .withColumn("φ2", radians(col("End_Lat")))
       .withColumn("Δφ", radians(col("End_Lat")-col("Start_Lat")))
       .withColumn("Δλ", radians(col("End_Lng")-col("Start_Lng")))
       .withColumn("a",
           sin(col("Δφ")/2)**2 +
           cos(col("φ1"))*cos(col("φ2"))*sin(col("Δλ")/2)**2
       )
       .withColumn("distance_mi", 2*R*atan2(sqrt(col("a")), sqrt(1-col("a"))))
       .drop("φ1","φ2","Δφ","Δλ","a")
)

# 4) Outlier temizleme (IQR)
q1, q3 = df2.approxQuantile("distance_mi", [0.25,0.75], 1e-4)
iqr    = q3 - q1
df_clean = df2.filter((col("distance_mi")>=q1-1.5*iqr) & (col("distance_mi")<=q3+1.5*iqr))

# 5) Özellik sütunları & zaman/hava ekle
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
df_feat = (df_clean
           .withColumn("hour",    hour(col("Start_Time")))
           .withColumn("weekday", dayofweek(col("Start_Time")))
           # sayısal sütunlarda kayıp varsa at
           .na.drop(subset=num_feats)
           # kategori sütunu doldur
           .na.fill({"Weather_Condition":"Unknown"})
)

# 6) Weather_Condition → index + one‑hot
indexer = StringIndexer(
    inputCol="Weather_Condition",
    outputCol="wc_idx",
    handleInvalid="keep"
)
encoder = OneHotEncoder(
    inputCols=["wc_idx"],
    outputCols=["wc_vec"],
    dropLast=False
)

# 7) Vektör oluştur & ölçeklendir
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec"],
    outputCol="raw_features",
    handleInvalid="skip"
)
scaler = StandardScaler(
    inputCol="raw_features",
    outputCol="features",
    withMean=True, withStd=True
)

# 8) GBTRegressor
gbt = GBTRegressor(
    labelCol="distance_mi",
    featuresCol="features",
    maxIter=50,
    maxDepth=5,
    seed=42
)

# 9) Pipeline
pipeline = Pipeline(stages=[
    indexer, encoder,
    assembler, scaler,
    gbt
])

# 10) Train/Test split & eğit
train, test = df_feat.randomSplit([0.8,0.2], seed=42)
model = pipeline.fit(train)

# 11) Tahmin & değerlendirme
pred = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="distance_mi",
    predictionCol="prediction",
    metricName="rmse"
)
rmse = evaluator.evaluate(pred)
r2   = evaluator.setMetricName("r2").evaluate(pred)

print(f"Test RMSE = {rmse:.3f}")
print(f"Test R2   = {r2:.3f}")

# 12) Kapat
spark.stop()


Test RMSE = 0.492
Test R2   = 0.025


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer,
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession
spark = (SparkSession.builder
         .appName("Distance_3Class_Classification")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory","4g")
         .getOrCreate())

# 2) Veri yükle & kritikleri at
df = (spark.read
      .option("header", True).option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv")
      .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng","Start_Time"])
)

# 3) distance_mi hesapla
R = 3958.8
df = (df
      .withColumn("φ1", radians(col("Start_Lat")))
      .withColumn("φ2", radians(col("End_Lat")))
      .withColumn("Δφ", radians(col("End_Lat")-col("Start_Lat")))
      .withColumn("Δλ", radians(col("End_Lng")-col("Start_Lng")))
      .withColumn("a",
          sin(col("Δφ")/2)**2 +
          cos(col("φ1"))*cos(col("φ2"))*sin(col("Δλ")/2)**2
      )
      .withColumn("distance_mi",
          2*R*atan2(sqrt(col("a")), sqrt(1-col("a")))
      )
      .drop("φ1","φ2","Δφ","Δλ","a")
)

# 4) Aykırıları at (IQR)
q1, q3 = df.approxQuantile("distance_mi",[0.25,0.75],1e-4)
iqr    = q3 - q1
df = df.filter((col("distance_mi")>=q1-1.5*iqr) &
               (col("distance_mi")<=q3+1.5*iqr))

# 5) Sayısal özellikler ve time/ weather hazırlığı
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)"
]
df = (df.withColumn("hour",    hour(col("Start_Time")))
        .withColumn("weekday", dayofweek(col("Start_Time")))
        # sayısal nan'leri at
        .na.drop(subset=num_feats)
        # weather boşsa Unknown
        .na.fill({"Weather_Condition":"Unknown"})
)

# 6) 3-class bucket eşikleri
# Örnek: 0.2 ve 1.0 mil
threshold_low  = 0.2
threshold_high = 1.0
splits = [float("-inf"), threshold_low, threshold_high, float("inf")]
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# 7) Weather_Condition → index + one‐hot
indexer = StringIndexer(inputCol="Weather_Condition",
                        outputCol="wc_idx",
                        handleInvalid="keep")
encoder = OneHotEncoder(inputCols=["wc_idx"],
                        outputCols=["wc_vec"],
                        dropLast=False)

# 8) Özellik vektörü: sayısal + time + weather vec
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec"],
    outputCol="raw_features",
    handleInvalid="skip"
)
scaler = StandardScaler(inputCol="raw_features",
                        outputCol="features",
                        withMean=True, withStd=True)

# 9) Sınıflandırıcı
rf = (RandomForestClassifier(labelCol="distance_class",
                             featuresCol="features")
      .setNumTrees(50)
      .setMaxDepth(8)
      .setSeed(42))

# 10) Pipeline ve eğitim
pipe = Pipeline(stages=[
    bucketizer,
    indexer, encoder,
    assembler, scaler,
    rf
])

train, test = df.randomSplit([0.8,0.2], seed=42)
model = pipe.fit(train)
pred  = model.transform(test)

# 11) Değerlendirme
evaluator = MulticlassClassificationEvaluator(
    labelCol="distance_class",
    predictionCol="prediction",
    metricName="f1"
)
acc = evaluator.setMetricName("accuracy").evaluate(pred)
f1  = evaluator.evaluate(pred)

print(f"3‑class Accuracy = {acc:.4f}")
print(f"3‑class F1       = {f1:.4f}")

spark.stop()


3‑class Accuracy = 0.5007
3‑class F1       = 0.5007


In [1]:
# -*- coding: utf-8 -*-
"""
3‑class distance classification without any grid search or CV.
Fixed hyperparameters only.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer,
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession başlat
spark = (SparkSession.builder
         .appName("Distance_3Class_NoGrid")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory",  "4g")
         .getOrCreate())

# 2) Veriyi yükle ve temel temizleme
df = (spark.read
      .option("header", True).option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv")
      .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng","Start_Time"])
)

# 3) distance_mi hesapla (Haversine)
R = 3958.8
df = (df
      .withColumn("φ1", radians(col("Start_Lat")))
      .withColumn("φ2", radians(col("End_Lat")))
      .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
      .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
      .withColumn("a",
          sin(col("Δφ")/2)**2 +
          cos(col("φ1")) * cos(col("φ2")) * sin(col("Δλ")/2)**2
      )
      .withColumn("distance_mi",
          2 * R * atan2(sqrt(col("a")), sqrt(1 - col("a")))
      )
      .drop("φ1","φ2","Δφ","Δλ","a")
)

# 4) Aykırı değerleri IQR ile at
q1, q3 = df.approxQuantile("distance_mi", [0.25,0.75], 1e-4)
iqr    = q3 - q1
df = df.filter((col("distance_mi") >= q1 - 1.5*iqr) &
               (col("distance_mi") <= q3 + 1.5*iqr))

# 5) Ek özellikler: zaman + coğrafi farklar
df = (df
      .withColumn("dLat",    col("End_Lat") - col("Start_Lat"))
      .withColumn("dLon",    col("End_Lng") - col("Start_Lng"))
      .withColumn("hour",    hour(col("Start_Time")))
      .withColumn("weekday", dayofweek(col("Start_Time")))
)

# 6) Eksik değer temizleme
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)","dLat","dLon"
]
df = df.na.drop(subset=num_feats) \
       .na.fill({"Weather_Condition":"Unknown"})

# 7) Train/Test split
train, test = df.randomSplit([0.8, 0.2], seed=42)
train = train.repartition(200)

# 8) Quantile‐based 3‐class eşikler
qs = train.approxQuantile("distance_mi", [0.33, 0.66], 1e-4)
splits = [float("-inf"), qs[0], qs[1], float("inf")]
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# 9) Weather_Condition encode
indexer = StringIndexer(inputCol="Weather_Condition",
                        outputCol="wc_idx", handleInvalid="keep")
encoder = OneHotEncoder(inputCols=["wc_idx"],
                        outputCols=["wc_vec"], dropLast=False)

# 10) Özellik vektörü oluştur ve ölçeklendir
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec"],
    outputCol="raw_features", handleInvalid="skip"
)
scaler = StandardScaler(inputCol="raw_features",
                        outputCol="features",
                        withMean=True, withStd=True)

# 11) Sabit parametreli RandomForest
rf = RandomForestClassifier(labelCol="distance_class",
                            featuresCol="features",
                            numTrees=50,
                            maxDepth=8,
                            seed=42)

# 12) Pipeline ve eğitim
pipeline = Pipeline(stages=[
    bucketizer,
    indexer, encoder,
    assembler, scaler,
    rf
])
model = pipeline.fit(train)
pred  = model.transform(test)

# 13) Değerlendirme
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="f1")

acc = evaluator_acc.evaluate(pred)
f1  = evaluator_f1.evaluate(pred)
print(f"3‑class Accuracy = {acc:.4f}")
print(f"3‑class F1       = {f1:.4f}")

# 14) SparkSession’ı kapat
spark.stop()


3‑class Accuracy = 0.7157
3‑class F1       = 0.7107


In [1]:
# -*- coding: utf-8 -*-
"""
3‑class distance classification with cell_id and saving the trained model.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer,
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession başlat
spark = (SparkSession.builder
         .appName("Distance_3Class_WithCell_SaveModel")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory","4g")
         .getOrCreate())

# --- Veri hazırlık ---
df = (spark.read
      .option("header", True).option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv")
      .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng","Start_Time"])
)

# distance_mi hesapla
R = 3958.8
df = (df
      .withColumn("φ1", radians(col("Start_Lat")))
      .withColumn("φ2", radians(col("End_Lat")))
      .withColumn("Δφ", radians(col("End_Lat")-col("Start_Lat")))
      .withColumn("Δλ", radians(col("End_Lng")-col("Start_Lng")))
      .withColumn("a", sin(col("Δφ")/2)**2 +
                       cos(col("φ1"))*cos(col("φ2"))*sin(col("Δλ")/2)**2)
      .withColumn("distance_mi", 2*R*atan2(sqrt(col("a")), sqrt(1-col("a"))))
      .drop("φ1","φ2","Δφ","Δλ","a")
)

# IQR ile outlier at
q1, q3 = df.approxQuantile("distance_mi",[0.25,0.75],1e-4)
iqr    = q3 - q1
df = df.filter((col("distance_mi")>=q1-1.5*iqr)&(col("distance_mi")<=q3+1.5*iqr))

# coğrafi farklar + zaman + hücre kimliği
min_lat = df.agg({"Start_Lat":"min"}).collect()[0][0]
min_lng = df.agg({"Start_Lng":"min"}).collect()[0][0]
grid_size = 0.045

df = (df
      .withColumn("dLat",     col("End_Lat")-col("Start_Lat"))
      .withColumn("dLon",     col("End_Lng")-col("Start_Lng"))
      .withColumn("hour",     hour(col("Start_Time")))
      .withColumn("weekday",  dayofweek(col("Start_Time")))
      .withColumn("lat_bin",  ((col("Start_Lat")-min_lat)/grid_size).cast("int"))
      .withColumn("lng_bin",  ((col("Start_Lng")-min_lng)/grid_size).cast("int"))
      .withColumn("cell_id",  col("lat_bin")*10000 + col("lng_bin"))
)

# sayısal NaN’leri at, weather doldur
num_feats = ["Temperature(F)","Wind_Chill(F)","Humidity(%)",
             "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
             "Precipitation(in)","dLat","dLon"]
df = df.na.drop(subset=num_feats).na.fill({"Weather_Condition":"Unknown"})

# train/test
train, test = df.randomSplit([0.8,0.2], seed=42)

# 3‑sınıf quantile eşikleri
qs = train.approxQuantile("distance_mi",[0.33,0.66],1e-4)
splits = [float("-inf"), qs[0], qs[1], float("inf")]
bucket = Bucketizer(inputCol="distance_mi",
                    outputCol="distance_class",
                    splits=splits)

# feature encoding
idx_wc    = StringIndexer(inputCol="Weather_Condition", outputCol="wc_idx", handleInvalid="keep")
enc_wc    = OneHotEncoder(inputCols=["wc_idx"], outputCols=["wc_vec"], dropLast=False)
idx_cell  = StringIndexer(inputCol="cell_id",           outputCol="cell_idx", handleInvalid="keep")
enc_cell  = OneHotEncoder(inputCols=["cell_idx"],       outputCols=["cell_vec"], dropLast=False)

# assembler & scaler
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec","cell_vec"],
    outputCol="raw_features", handleInvalid="skip"
)
scaler = StandardScaler(inputCol="raw_features",
                        outputCol="features",
                        withMean=True, withStd=True)

# RF sabit ayar
rf = RandomForestClassifier(labelCol="distance_class",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=10,
                            seed=42)

pipeline = Pipeline(stages=[
    bucket,
    idx_wc, enc_wc,
    idx_cell, enc_cell,
    assembler, scaler,
    rf
])

# model eğit
model = pipeline.fit(train)

# test & değerlendirme
pred  = model.transform(test)
e_acc = MulticlassClassificationEvaluator(labelCol="distance_class",
                                           predictionCol="prediction",
                                           metricName="accuracy")
e_f1  = MulticlassClassificationEvaluator(labelCol="distance_class",
                                           predictionCol="prediction",
                                           metricName="f1")
print("Accuracy =", e_acc.evaluate(pred))
print("F1       =", e_f1.evaluate(pred))

# --- Modeli kaydet ---
model.write().overwrite().save("models/distance_3class_rf_with_cell")

# SparkSession’ı kapat
spark.stop()


Accuracy = 0.6029788883819928
F1       = 0.5807768702230311


In [1]:
# -*- coding: utf-8 -*-
"""
3‑class distance classification without any grid search or CV.
Fixed hyperparameters only.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, radians, sin, cos, atan2, sqrt,
    hour, dayofweek
)
from pyspark.ml.feature import (
    Bucketizer,
    StringIndexer, OneHotEncoder,
    VectorAssembler, StandardScaler
)
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1) SparkSession başlat
spark = (SparkSession.builder
         .appName("Distance_3Class_NoGrid")
         .config("spark.executor.memory","8g")
         .config("spark.driver.memory",  "4g")
         .getOrCreate())

# 2) Veriyi yükle ve temel temizleme
df = (spark.read
      .option("header", True).option("inferSchema", True)
      .csv("data/traffic_data/US_Accidents_March23.csv")
      .dropna(subset=["Start_Lat","Start_Lng","End_Lat","End_Lng","Start_Time"])
)

# 3) distance_mi hesapla (Haversine)
R = 3958.8
df = (df
      .withColumn("φ1", radians(col("Start_Lat")))
      .withColumn("φ2", radians(col("End_Lat")))
      .withColumn("Δφ", radians(col("End_Lat") - col("Start_Lat")))
      .withColumn("Δλ", radians(col("End_Lng") - col("Start_Lng")))
      .withColumn("a",
          sin(col("Δφ")/2)**2 +
          cos(col("φ1")) * cos(col("φ2")) * sin(col("Δλ")/2)**2
      )
      .withColumn("distance_mi",
          2 * R * atan2(sqrt(col("a")), sqrt(1 - col("a")))
      )
      .drop("φ1","φ2","Δφ","Δλ","a")
)

# 4) Aykırı değerleri IQR ile at
q1, q3 = df.approxQuantile("distance_mi", [0.25,0.75], 1e-4)
iqr    = q3 - q1
df = df.filter((col("distance_mi") >= q1 - 1.5*iqr) &
               (col("distance_mi") <= q3 + 1.5*iqr))

# 5) Ek özellikler: zaman + coğrafi farklar
df = (df
      .withColumn("dLat",    col("End_Lat") - col("Start_Lat"))
      .withColumn("dLon",    col("End_Lng") - col("Start_Lng"))
      .withColumn("hour",    hour(col("Start_Time")))
      .withColumn("weekday", dayofweek(col("Start_Time")))
)

# 6) Eksik değer temizleme
num_feats = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)",
    "Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
    "Precipitation(in)","dLat","dLon"
]
df = df.na.drop(subset=num_feats) \
       .na.fill({"Weather_Condition":"Unknown"})

# 7) Train/Test split
train, test = df.randomSplit([0.8, 0.2], seed=42)
train = train.repartition(200)

# 8) Quantile‐based 3‐class eşikler
qs = train.approxQuantile("distance_mi", [0.33, 0.66], 1e-4)
splits = [float("-inf"), qs[0], qs[1], float("inf")]
bucketizer = Bucketizer(inputCol="distance_mi",
                        outputCol="distance_class",
                        splits=splits)

# 9) Weather_Condition encode
indexer = StringIndexer(inputCol="Weather_Condition",
                        outputCol="wc_idx", handleInvalid="keep")
encoder = OneHotEncoder(inputCols=["wc_idx"],
                        outputCols=["wc_vec"], dropLast=False)

# 10) Özellik vektörü oluştur ve ölçeklendir
assembler = VectorAssembler(
    inputCols=num_feats + ["hour","weekday","wc_vec"],
    outputCol="raw_features", handleInvalid="skip"
)
scaler = StandardScaler(inputCol="raw_features",
                        outputCol="features",
                        withMean=True, withStd=True)

# 11) Sabit parametreli RandomForest
rf = RandomForestClassifier(labelCol="distance_class",
                            featuresCol="features",
                            numTrees=50,
                            maxDepth=8,
                            seed=42)

# 12) Pipeline ve eğitim
pipeline = Pipeline(stages=[
    bucketizer,
    indexer, encoder,
    assembler, scaler,
    rf
])
model = pipeline.fit(train)
pred  = model.transform(test)

# 13) Değerlendirme
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="accuracy")
evaluator_f1  = MulticlassClassificationEvaluator(
    labelCol="distance_class", predictionCol="prediction",
    metricName="f1")

acc = evaluator_acc.evaluate(pred)
f1  = evaluator_f1.evaluate(pred)
print(f"3‑class Accuracy = {acc:.4f}")
print(f"3‑class F1       = {f1:.4f}")

model.write().overwrite().save("models/distance_3class_rf_with_final_cell")
# 14) SparkSession’ı kapat
spark.stop()


3‑class Accuracy = 0.7243
3‑class F1       = 0.7217


In [1]:
import findspark
findspark.init()

from AccidentSeverityPredictor import AccidentSeverityPredictor

veri = [
    {
        "Start_Time": "2023-12-10 07:00:00",
        "End_Time": "2023-12-10 07:45:00",
        "Temperature(F)": 15.0,
        "Humidity(%)": 85.0,
        "Pressure(in)": 29.5,
        "Visibility(mi)": 2.0,
        "Wind_Speed(mph)": 20.0,
        "Precipitation(in)": 0.4,
        "Weather_Condition": "Snow",
        "Wind_Direction": "N",
        "Civil_Twilight": "Night",
        "Sunrise_Sunset": "Night",
        "State": "IL",
        "Junction": True,
        "Traffic_Signal": False,
        "Crossing": True,
        "City": "Chicago",
        "Street": "W Adams St",
        "Wind_Chill(F)": 10.0
    },
    {
        "Start_Time": "2024-03-20 18:30:00",
        "End_Time": "2024-03-20 18:40:00",
        "Temperature(F)": 85.0,
        "Humidity(%)": 30.0,
        "Pressure(in)": 30.2,
        "Visibility(mi)": 10.0,
        "Wind_Speed(mph)": 3.0,
        "Precipitation(in)": 0.0,
        "Weather_Condition": "Clear",
        "Wind_Direction": "SE",
        "Civil_Twilight": "Day",
        
        "Sunrise_Sunset": "Day",
        "State": "TX",
        "Junction": False,
        "Traffic_Signal": True,
        "Crossing": False,
        "City": "Houston",
        "Street": "Main St",
        "Wind_Chill(F)": 84.0
    },
    {
        "Start_Time": "2023-09-01 05:00:00",
        "End_Time": "2023-09-01 06:00:00",
        "Temperature(F)": 40.0,
        "Humidity(%)": 95.0,
        "Pressure(in)": 29.3,
        "Visibility(mi)": 1.0,
        "Wind_Speed(mph)": 15.0,
        "Precipitation(in)": 1.0,
        "Weather_Condition": "Heavy Rain",
        "Wind_Direction": "W",
        "Civil_Twilight": "Night",
        "Sunrise_Sunset": "Night",
        "State": "FL",
        "Junction": True,
        "Traffic_Signal": True,
        "Crossing": True,
        "City": "Miami",
        "Street": "Biscayne Blvd",
        "Wind_Chill(F)": 39.0
    }
]


predictor = AccidentSeverityPredictor("models/us_accidents_dt_final_last_full_spark")

predictor.predict_from_rows(    
    rows=veri,
    show_columns=["City_Cleaned", "Street_Cleaned", "prediction"],
    n=10
)

predictor.stop()


[DEBUG] CWD: c:\Users\aslay\Desktop\BİL 401\project
[DEBUG] Loading PipelineModel from: c:\Users\aslay\Desktop\BİL 401\project\models\us_accidents_dt_final_last_full_spark


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (MSI executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 32 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 32 more
